In [1]:
# Fetch the Django-CRM repository that the rest of the notebook analyses.
# NOTE(review): nothing checks whether ./Django-CRM already exists, so re-running
# this cell after a successful clone will presumably fail — confirm and guard if needed.
!git clone https://github.com/MicroPyramid/Django-CRM.git

Cloning into 'Django-CRM'...


In [1]:
import os
# Path to the cloned repository, relative to the notebook's working directory.
# The same value is assigned again in the file-discovery cell further down.
repo_path = "./Django-CRM"

In [6]:
import os, subprocess
import pandas as pd

repo_path = "./Django-CRM"

# Walk the checkout and keep the absolute path of every *.py file, in the
# order os.walk yields them. Absolute paths are the join key used by the
# later churn / bug-count cells.
python_files = [
    os.path.abspath(os.path.join(folder, name))
    for folder, _subdirs, names in os.walk(repo_path)
    for name in names
    if name.endswith(".py")
]
print(f"Found {len(python_files)} Python files")

def get_python_complexity(fp):
    """Return the average cyclomatic complexity radon reports for one file.

    Runs ``radon cc -s -a <fp>`` as a subprocess and parses the first output
    line containing ``"Average complexity"``; the last whitespace-separated
    token of that line, with surrounding parentheses stripped, is converted
    to ``float``.

    Args:
        fp: Path of the Python file to analyse.

    Returns:
        The parsed average as a float, or ``None`` when the output has no
        such line or when anything raises (radon missing, unparsable
        number, ...). In the error case the exception is printed, not raised.
    """
    command = ['radon', 'cc', '-s', '-a', fp]
    try:
        completed = subprocess.run(command, capture_output=True, text=True)
        # Only the first matching line matters; later ones are never inspected.
        summary = next(
            (row for row in completed.stdout.splitlines() if "Average complexity" in row),
            None,
        )
        if summary is not None:
            return float(summary.split()[-1].strip("()"))
    except Exception as e:
        # Best-effort: one failing file must not abort the whole scan.
        print("Complexity error:", e)
    return None

# One row per discovered Python file, scored with radon's average complexity.
df = pd.DataFrame({"file_path": python_files})
df["module_complexity"] = df["file_path"].apply(get_python_complexity)
# Files for which no average could be obtained (None) are excluded from the dataset.
# The original index labels are kept, so gaps in the index mark the dropped files.
df = df.dropna(subset=["module_complexity"])


Found 204 Python files


In [8]:
df.head()

Unnamed: 0,file_path,module_complexity
2,C:\Users\HP\Django\Django-CRM\accounts\apps.py,1.0
3,C:\Users\HP\Django\Django-CRM\accounts\models.py,1.333333
4,C:\Users\HP\Django\Django-CRM\accounts\seriali...,2.357143
6,C:\Users\HP\Django\Django-CRM\accounts\tasks.py,5.666667
7,C:\Users\HP\Django\Django-CRM\accounts\tests_c...,1.5


In [10]:
import git
from collections import defaultdict

repo = git.Repo(repo_path)
default_branch = repo.head.reference.name

# Accumulate, per absolute file path, the "lines" figure from each commit's
# stats over the 300 most recent commits on the checked-out branch.
file_churn = defaultdict(int)
for commit in repo.iter_commits(default_branch, max_count=300):
    for rel_path, line_stats in commit.stats.files.items():
        if not rel_path.endswith(".py"):
            continue
        # Stats paths are repo-relative; convert to the same absolute form
        # that df["file_path"] uses so the lookup below can match.
        churn_key = os.path.abspath(os.path.join(repo_path, rel_path))
        file_churn[churn_key] += line_stats.get("lines", 0)

# Files never touched in the sampled window get a churn of 0.
df["code_churn"] = df["file_path"].map(lambda path: file_churn.get(path, 0)).astype(int)


In [12]:
df.head()

Unnamed: 0,file_path,module_complexity,code_churn
2,C:\Users\HP\Django\Django-CRM\accounts\apps.py,1.0,2
3,C:\Users\HP\Django\Django-CRM\accounts\models.py,1.333333,706
4,C:\Users\HP\Django\Django-CRM\accounts\seriali...,2.357143,433
6,C:\Users\HP\Django\Django-CRM\accounts\tasks.py,5.666667,494
7,C:\Users\HP\Django\Django-CRM\accounts\tests_c...,1.5,117


In [14]:
# Keywords whose presence in a commit message marks the commit as bug-related.
# NOTE(review): matching is by substring, so e.g. "prefix" matches "fix" and
# "dispatch" matches "patch" — confirm this looseness is acceptable.
bug_keywords = ["fix", "bug", "issue", "error", "defect", "patch"]

# Number of bug-related commits that touched each file (keyed by absolute path).
bug_counts = defaultdict(int)

# Scan up to the 1000 most recent commits on the checked-out branch.
for commit in repo.iter_commits(repo.head.reference.name, max_count=1000):
    message = commit.message.lower()
    if not any(keyword in message for keyword in bug_keywords):
        continue
    for changed in commit.stats.files:
        if changed.endswith(".py"):
            bug_counts[os.path.abspath(os.path.join(repo_path, changed))] += 1

# Attach the counts to df; files with no matching commit get 0.
df["past_bugs"] = df["file_path"].map(lambda path: bug_counts.get(path, 0)).astype(int)

In [16]:
df.head()

Unnamed: 0,file_path,module_complexity,code_churn,past_bugs
2,C:\Users\HP\Django\Django-CRM\accounts\apps.py,1.0,2,0
3,C:\Users\HP\Django\Django-CRM\accounts\models.py,1.333333,706,10
4,C:\Users\HP\Django\Django-CRM\accounts\seriali...,2.357143,433,0
6,C:\Users\HP\Django\Django-CRM\accounts\tasks.py,5.666667,494,4
7,C:\Users\HP\Django\Django-CRM\accounts\tests_c...,1.5,117,1


In [18]:
def count_test_failures(file_path):
    """Count assertion-style tokens in a source file.

    Despite its name, this does not run any tests. It reads the file as
    UTF-8, lower-cases the text, and returns the number of occurrences of
    ``assert`` plus the number of occurrences of ``self.fail``. Matching is
    by substring and case-insensitive, so ``self.assertEqual`` contributes
    one ``assert`` hit.

    Args:
        file_path: Path of the file to scan.

    Returns:
        int: The combined occurrence count, or 0 if the file cannot be
        opened or is not valid UTF-8.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().lower()
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: covers
        # UnicodeDecodeError (not UTF-8) and malformed path strings.
        # Unlike a bare ``except``, KeyboardInterrupt/SystemExit and
        # unrelated programming errors are no longer silently swallowed.
        return 0
    return content.count('assert') + content.count('self.fail')

# Per-file count of 'assert' / 'self.fail' occurrences (see count_test_failures).
# Despite the column name, this is read from source text, not from test-run results.
df["test_failures"] = df["file_path"].apply(count_test_failures)


In [20]:
df.head()

Unnamed: 0,file_path,module_complexity,code_churn,past_bugs,test_failures
2,C:\Users\HP\Django\Django-CRM\accounts\apps.py,1.0,2,0,0
3,C:\Users\HP\Django\Django-CRM\accounts\models.py,1.333333,706,10,0
4,C:\Users\HP\Django\Django-CRM\accounts\seriali...,2.357143,433,0,0
6,C:\Users\HP\Django\Django-CRM\accounts\tasks.py,5.666667,494,4,0
7,C:\Users\HP\Django\Django-CRM\accounts\tests_c...,1.5,117,1,3


In [22]:
# Heuristic label: a file is flagged (1) if any single threshold is exceeded.
# NOTE(review): the label is a deterministic function of past_bugs,
# module_complexity and code_churn, which are also used as model features
# below, so a classifier can reproduce this rule exactly — the reported
# scores measure that, not real defect prediction.
many_past_bugs = df["past_bugs"] > 10
high_complexity = df["module_complexity"] > 5
high_churn = df["code_churn"] > 150

df["defect_likely"] = (many_past_bugs | high_complexity | high_churn).astype(int)


In [26]:
df.head()

Unnamed: 0,file_path,module_complexity,code_churn,past_bugs,test_failures,defect_likely
2,C:\Users\HP\Django\Django-CRM\accounts\apps.py,1.0,2,0,0,0
3,C:\Users\HP\Django\Django-CRM\accounts\models.py,1.333333,706,10,0,1
4,C:\Users\HP\Django\Django-CRM\accounts\seriali...,2.357143,433,0,0,1
6,C:\Users\HP\Django\Django-CRM\accounts\tasks.py,5.666667,494,4,0,1
7,C:\Users\HP\Django\Django-CRM\accounts\tests_c...,1.5,117,1,3,0


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle


In [28]:
# Model inputs (X) and the heuristic target column (y).
feature_columns = ["code_churn", "module_complexity", "past_bugs", "test_failures"]
X = df[feature_columns]
y = df["defect_likely"]

In [30]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Random forest with no hyperparameters set other than the seed, fitted on the training split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [34]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_test, model.predict(X_test)))

# Persist the fitted model. The context manager guarantees the file is
# flushed and closed even if pickling fails; the previous inline open()
# left the handle to be closed whenever the garbage collector got to it.
with open("defect_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        15

    accuracy                           1.00        25
   macro avg       1.00      1.00      1.00        25
weighted avg       1.00      1.00      1.00        25



In [36]:
# Export the feature table without the index column.
# NOTE: file_path holds absolute paths (built with os.path.abspath), so the
# CSV is specific to the machine it was generated on.
df.to_csv("module_data.csv", index=False)