In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [2]:
# Load the cleaned commit dataset from CSV and display it
csv_path = 'sample_data/NewDF_cleaned_AllCategories.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0.1,Unnamed: 0,ID:,COMMIT:,CATEGORIES:
0,0,1,build-system: don't always build qtserialbluet...,BUILD
1,1,2,Update dependency versions Fix test build brok...,BUILD
2,2,3,"Downgrade gradle to 2.2.1 , https://github.com...",BUILD
3,3,4,Translated using Weblate (Italian) Currently t...,BUILD
4,4,5,Merge branch 'master' of https://Bananeweizen@...,BUILD
...,...,...,...,...
2021,2021,2022,Let git clones without a google-service to bui...,NO LABEL
2022,2022,2023,"MMS support , https://github.com/jberkel/sms-b...",NO LABEL
2023,2023,2024,"Disabled --debug for travis , https://github.c...",NO LABEL
2024,2024,2025,"类重命名 , https://github.com/TakWolf/CNode-Materi...",NO LABEL


In [3]:
# Features are the commit messages; targets are their category labels
X, y = data['COMMIT:'], data['CATEGORIES:']

In [4]:
# Hold out 20% of the rows as a test set.
# random_state is fixed (same seed as the ShuffleSplit used for validation) so
# that Restart & Run All yields the same partition every time; without it the
# split, and anything evaluated on it, changes on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [5]:
# Sanity check: number of rows that ended up in the held-out test set
y_test.shape

(406,)

In [6]:
# Compare several classifiers on the commit messages with repeated random
# train/validation splits. Each classifier is wrapped in a Pipeline
# (token counts -> TF-IDF weights -> classifier) so that the text features
# are re-fitted on the training portion of every split.

# Seed shared by the splitter and every stochastic estimator, so the scores
# printed below are reproducible across kernel restarts.
RANDOM_STATE = 0

# NOTE: the dictionary keys double as the keys of `scores` and as the labels in
# the printed report, so their spelling (including 'SDG') is kept unchanged.
models = {
    'svc': SVC(kernel='linear', C=1.0),
    'Decision_Tree': DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE),
    'Naive_Bayes': MultinomialNB(),
    'SDG': SGDClassifier(random_state=RANDOM_STATE),
    'NN': MLPClassifier(max_iter=1500, random_state=RANDOM_STATE),
    'RFC': RandomForestClassifier(n_estimators=3, max_depth=2, random_state=RANDOM_STATE),
}

# Validation scheme: 5 random 70/30 splits. It is the same for every model, so
# it is built once; with an integer seed each model sees identical splits.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=RANDOM_STATE)

# model name -> array with one score per split
scores = {}

for model_key, model in models.items():
  text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
  ])

  # cross_val_score clones and fits the pipeline on each split of the full X / y
  scores[model_key] = cross_val_score(text_clf, X, y, cv=cv)

  print("%s : %0.3f accuracy with a standard deviation of %0.2f" % (model_key, scores[model_key].mean(), scores[model_key].std()))

svc : 0.815 accuracy with a standard deviation of 0.01
Decision_Tree : 0.768 accuracy with a standard deviation of 0.01
Naive_Bayes : 0.689 accuracy with a standard deviation of 0.01
SDG : 0.812 accuracy with a standard deviation of 0.01
NN : 0.776 accuracy with a standard deviation of 0.01
RFC : 0.664 accuracy with a standard deviation of 0.01


In [7]:
scores

{'svc': array([0.82730263, 0.81085526, 0.81414474, 0.81907895, 0.80592105]),
 'Decision_Tree': array([0.75164474, 0.76809211, 0.77631579, 0.77631579, 0.76644737]),
 'Naive_Bayes': array([0.68092105, 0.70559211, 0.69901316, 0.68585526, 0.67598684]),
 'SDG': array([0.82565789, 0.8125    , 0.81578947, 0.80592105, 0.80098684]),
 'NN': array([0.79276316, 0.76809211, 0.77302632, 0.78125   , 0.76480263]),
 'RFC': array([0.64967105, 0.67763158, 0.66776316, 0.68092105, 0.64473684])}