In [None]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc, precision_recall_curve, average_precision_score, brier_score_loss
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned commit dataset. The file is semicolon-separated, and any
# row the parser cannot handle is skipped instead of raising an error.
data = pd.read_csv(
    'sample_data/NewSample',
    on_bad_lines='skip',
    sep=";",
)
# Keep the frame as the last expression so the notebook renders it.
data

Unnamed: 0,ID:,COMMIT:,CATEGORIES:
0,1,build-system: don't always build qtserialbluet...,BUILD
1,2,Update dependency versions Fix test build brok...,BUILD
2,3,"Downgrade gradle to 2.2.1 , https://github.com...",BUILD
3,4,Translated using Weblate (Italian) Currently t...,BUILD
4,5,Merge branch 'master' of https://Bananeweizen@...,BUILD
...,...,...,...
2021,2022,Let git clones without a google-service to bui...,NO LABEL
2022,2023,"MMS support , https://github.com/jberkel/sms-b...",NO LABEL
2023,2024,"Disabled --debug for travis , https://github.c...",NO LABEL
2024,2025,"类重命名 , https://github.com/TakWolf/CNode-Materi...",NO LABEL


In [None]:
# Features are the commit messages; targets are their category labels.
X, y = data['COMMIT:'], data['CATEGORIES:']

In [None]:
# Hold out 20% of the samples as a test set. The split is stratified on the
# labels and uses a fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [None]:
# Compare several classifiers on the same bag-of-words + TF-IDF features and
# report weighted precision / recall / F1 plus accuracy on the held-out set.

# One seed shared by every stochastic estimator, so that restarting the kernel
# and re-running the notebook reproduces the same scores.
RANDOM_STATE = 1

# Candidate classifiers, keyed by the label used when reporting results.
models = {
    'svc': SVC(kernel='linear', C=1.0),
    'Decision_Tree': DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE),
    'Naive_Bayes': MultinomialNB(),
    'SGD': SGDClassifier(random_state=RANDOM_STATE),
    'NN': MLPClassifier(max_iter=1500, random_state=RANDOM_STATE),
    'RFC': RandomForestClassifier(n_estimators=3, max_depth=2, random_state=RANDOM_STATE),
}

# Dictionary for memorising measurements: model label -> {metric name: value}
scores = {}

for model_key, model in models.items():
    # Same text-vectorisation front end for every classifier, so only the
    # final estimator differs between runs.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ])

    # Model training
    text_clf.fit(X_train, y_train)

    # Prediction on the test set
    y_pred = text_clf.predict(X_test)

    # Weighted precision, recall and F1. zero_division=0 scores a class that
    # was never predicted as 0 without emitting an UndefinedMetricWarning.
    # The trailing "support" element is dropped by slicing rather than by
    # unpacking into `_`, which IPython uses for the previous cell output.
    precision, recall, f1 = precision_recall_fscore_support(
        y_test, y_pred, average='weighted', zero_division=0
    )[:3]
    accuracy = accuracy_score(y_test, y_pred)

    # Storing measurements in the dictionary
    scores[model_key] = {'Precision': precision, 'Recall': recall, 'F1-Score': f1, 'Accuracy': accuracy}

# Print measurements for each model, two decimals per metric
for model_key, metrics in scores.items():
    print(f"Modello: {model_key}")
    for metric_name in ('Precision', 'Recall', 'F1-Score', 'Accuracy'):
        print(f"{metric_name}: {metrics[metric_name]:.2f}")
    print("\n")

  _warn_prf(average, modifier, msg_start, len(result))


Modello: svc
Precision: 0.82
Recall: 0.82
F1-Score: 0.78
Accuracy: 0.82


Modello: Decision_Tree
Precision: 0.79
Recall: 0.79
F1-Score: 0.79
Accuracy: 0.79


Modello: Naive_Bayes
Precision: 0.68
Recall: 0.68
F1-Score: 0.58
Accuracy: 0.68


Modello: SGD
Precision: 0.80
Recall: 0.81
F1-Score: 0.80
Accuracy: 0.81


Modello: NN
Precision: 0.77
Recall: 0.78
F1-Score: 0.76
Accuracy: 0.78


Modello: RFC
Precision: 0.43
Recall: 0.66
F1-Score: 0.52
Accuracy: 0.66




  _warn_prf(average, modifier, msg_start, len(result))
