In [None]:
# Importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [None]:
# Load the cleaned commit dataset from its CSV file
# (the path is relative to the notebook's working directory).
data = pd.read_csv('sample_data/DataframeCleaned.csv')
# A bare expression on the last line makes the notebook render the DataFrame.
data

Unnamed: 0.1,Unnamed: 0,COMMIT:,CATEGORIES:
0,1,build-system: don't always build qtserialbluet...,BUILD
1,2,Update dependency versions\n \n Fix test build...,BUILD
2,3,"Downgrade gradle to 2.2.1 , https://github.com...",BUILD
3,4,Translated using Weblate (Italian)\n \n Curren...,BUILD
4,5,Merge branch 'master' of https://Bananeweizen@...,BUILD
...,...,...,...
1748,1819,test(server:sequence): fixed SequenceServiceSp...,TEST
1749,1820,[test suite] added Rijndael/HMAC compatibility...,TEST
1750,1821,"Pass BaseVehicleDataSink tests. , https://gith...",TEST
1751,1822,autotools: support running the Android tests f...,TEST


In [None]:
# Pull the two modelling columns out of the frame in a single step:
# the commit message text becomes the input X, the category label the target y.
X, y = data['COMMIT:'], data['CATEGORIES:']


In [None]:
# Split data into training and testing sets (20% of the rows are held out).
# random_state pins the shuffle so the same rows land in each split on every
# run; without it the split changes each time this cell is executed and the
# notebook is not reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:

# Sanity check: dimensions of the held-out label split produced by train_test_split.
y_test.shape


(351,)

In [None]:
# Benchmark several classifiers on the commit-message data.
# Every candidate is wrapped in the same text pipeline (token counts -> TF-IDF
# -> classifier) and scored with cross-validation over repeated random splits.
#
# NOTE(review): scoring runs on the full X / y, so the X_test / y_test hold-out
# split created earlier in the notebook is not used here - confirm this is intended.

# One seed for everything stochastic in this cell, so the reported scores are
# identical on every Restart & Run All. Same value the ShuffleSplit already used.
SEED = 0

# Candidate estimators, keyed by the label used in the printed report and in
# `scores`. SVC (linear kernel, no probability estimates) and MultinomialNB take
# no seed here; the other four are randomised and receive random_state.
models = {
    'svc': SVC(kernel='linear', C=1.0),
    'Decision_Tree': DecisionTreeClassifier(criterion='entropy', random_state=SEED),
    'Naive_Bayes': MultinomialNB(),
    # Key spelling 'SDG' is kept as-is because it appears in the recorded outputs.
    'SDG': SGDClassifier(random_state=SEED),
    'NN': MLPClassifier(max_iter=1500, random_state=SEED),
    'RFC': RandomForestClassifier(n_estimators=3, max_depth=2, random_state=SEED),
}

# Validation scheme: 5 random splits with 30% of the rows held out each time.
# It does not depend on the model, so it is built once outside the loop; the
# integer random_state makes every model see the same five splits.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=SEED)

# model label -> array of per-split scores returned by cross_val_score
scores = {}

for model_key, model in models.items():
  # Vectorize the raw text, re-weight the counts with TF-IDF, then classify.
  text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
  ])

  # Validation: the whole pipeline is refit on each training split, so the
  # vocabulary and IDF weights are learned from training rows only.
  scores[model_key] = cross_val_score(text_clf, X, y, cv=cv)

  print("%s : %0.3f accuracy with a standard deviation of %0.2f" % (model_key, scores[model_key].mean(), scores[model_key].std()))

svc : 0.906 accuracy with a standard deviation of 0.01
Decision_Tree : 0.892 accuracy with a standard deviation of 0.01
Naive_Bayes : 0.784 accuracy with a standard deviation of 0.02
SDG : 0.914 accuracy with a standard deviation of 0.01
NN : 0.879 accuracy with a standard deviation of 0.01
RFC : 0.753 accuracy with a standard deviation of 0.01


In [None]:
# Show the raw cross-validation results: one array of per-split scores for each model key.
scores

{'svc': array([0.91825095, 0.90684411, 0.90494297, 0.89163498, 0.90684411]),
 'Decision_Tree': array([0.88973384, 0.89923954, 0.90114068, 0.86692015, 0.90494297]),
 'Naive_Bayes': array([0.79657795, 0.79087452, 0.75855513, 0.79847909, 0.77376426]),
 'SDG': array([0.92775665, 0.90684411, 0.92585551, 0.89543726, 0.91444867]),
 'NN': array([0.8973384 , 0.86501901, 0.88212928, 0.87262357, 0.87642586]),
 'RFC': array([0.75475285, 0.76045627, 0.73193916, 0.77376426, 0.74524715])}