# Importer les packages

In [158]:
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Charger les données

La dernière colonne `Bug` est une étiquette synthétique, calculée de manière déterministe à partir de la colonne `Fichier` (`Fichier // 10 % 2`) : elle prend la valeur 1 s'il y a un bug et 0 sinon.

In [159]:
# Load the metrics table and append the binary `Bug` label in one chain.
# The label is derived from `Fichier`: floor-divide by 10, then keep the
# parity (0 or 1) of the result.
data = (
    pd.read_csv('file.csv')
    .assign(Bug=lambda frame: (frame["Fichier"] // 10) % 2)
)
data

Unnamed: 0,Version,CommitId,Fichier,AvgCyclomatic,AvgCyclomaticModified,Bug
2.1.0,id1,f1,21,41,209,0
2.2.0,id1,f2,91,4,9,1
2.3.0,id2,f1,31,51,19,1
2.4.0,id3,f2,41,41,9,0
2.5.0,id1,f1,21,41,209,0
2.6.0,id1,f2,91,4,9,1
2.7.0,id2,f1,31,51,19,1
2.8.0,id3,f2,41,41,9,0


# Préparation des variables
Modifier les variables indépendantes en fonction de la table. On ne prend pas en compte la version et l'id des commits dans le modèle.

In [160]:
# Target vector: the `Bug` label (presence of a bug).
y = data['Bug']

# Feature matrix: every remaining column except the label itself and the
# version / commit-id columns, which are not used by the model.
# Adjust this list to match the columns of the table.
excluded_columns = ['Bug', 'Version', 'CommitId']
X = data.drop(excluded_columns, axis=1)

# Données d'entraînement et de test
Fixer `test_size` à **0.2** pour obtenir 20 % de données de test et 80 % de données d'entraînement.

In [173]:
# Stratified split of features and labels: half of the rows go to the test
# set, with the class proportions of `y` preserved on both sides and a fixed
# seed so the split is reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Entraînement des modèles

In [174]:
# Build and train both classifiers on the training split. `fit` returns the
# estimator itself, so construction and training are chained.
model1 = LogisticRegression().fit(X_train, y_train)
model2 = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Show how many training samples fall into each class.
class_counts = y_train.value_counts()
print(class_counts)

Bug
1    2
0    2
Name: count, dtype: int64


# Prédictions

In [175]:
# Predict the labels of the test split with each trained model.
y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)

# Ground-truth labels, printed after each model's predictions for comparison.
actual_labels = y_test.values

print("Prédictions logistic regression :", y_pred1)
print("Valeurs réelles :", actual_labels)
print("Prédictions random forest :", y_pred2)
print("Valeurs réelles :", actual_labels)

Prédictions logistic regression : [1 1 0 0]
Valeurs réelles : [0 0 1 1]
Prédictions random forest : [1 1 0 0]
Valeurs réelles : [0 0 1 1]


In [176]:
# Compute the ROC AUC of each model.
# AUC is a ranking metric: it must be computed from a continuous score, not
# from the hard 0/1 predictions (hard labels collapse the ROC curve to a
# single threshold and make the value uninformative).
# Column 1 of `predict_proba` is the probability of `classes_[1]`, which is
# the positive label 1 for this binary 0/1 target.
auc1 = roc_auc_score(y_test, model1.predict_proba(X_test)[:, 1])
auc2 = roc_auc_score(y_test, model2.predict_proba(X_test)[:, 1])

# Évaluer la performance des modèles

In [177]:
# Visual separator printed between the two model reports.
separator = "-" * 60

# --- Logistic regression: accuracy, per-class report, AUC ---
accuracy = accuracy_score(y_test, y_pred1)
report_logreg = classification_report(y_test, y_pred1)
print(f"Accuracy logistic regression: {accuracy}")
print(report_logreg)
print(f"AUC Logistic Regression: {auc1}")
print(separator)

# --- Random forest: accuracy, per-class report, AUC ---
accuracy = accuracy_score(y_test, y_pred2)
report_forest = classification_report(y_test, y_pred2)
print(f"Accuracy random forest: {accuracy}")
print(report_forest)
print(f"AUC Random Forest: {auc2}")

Accuracy logistic regression: 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       4.0
   macro avg       0.00      0.00      0.00       4.0
weighted avg       0.00      0.00      0.00       4.0

AUC Logistic Regression: 0.0
------------------------------------------------------------
Accuracy random forest: 0.0
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       4.0
   macro avg       0.00      0.00      0.00       4.0
weighted avg       0.00      0.00      0.00       4.0

AUC Random Forest: 0.0


In [178]:
# Show which row labels ended up in each side of the split.
train_index = X_train.index
test_index = X_test.index
print("Indices de X_train :", train_index)
print("Indices de X_test :", test_index)

Indices de X_train : Index(['2.7.0', '2.4.0', '2.8.0', '2.3.0'], dtype='object')
Indices de X_test : Index(['2.5.0', '2.1.0', '2.6.0', '2.2.0'], dtype='object')
