<a href="https://colab.research.google.com/github/Kalloniatis/Humour_Recognition_in_Greek/blob/main/MLmodels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Install spaCy into the notebook environment; it is imported a few cells below.
!pip install spacy

In [None]:
# Download the small Greek spaCy model that spacy.load() reads further down.
!python -m spacy download el_core_news_sm

In [None]:
import spacy

In [None]:
# Load the small Greek pipeline; `nlp` is later applied to every cleaned text.
nlp = spacy.load("el_core_news_sm")

In [None]:
# Read the dataset from the worksheet named "Φύλλο1" of the Excel workbook.
df = pd.read_excel("/GHD.xlsx", sheet_name="Φύλλο1")

In [None]:
def clean_data(x):
    """Blank out punctuation/symbol characters in ``x`` and lowercase the result.

    ``x`` is converted with ``str()`` first, so non-string values (numbers,
    ``None``, NaN) are processed as their textual representation. Each
    character listed in ``symbols`` is replaced by a single space; runs of
    spaces are NOT collapsed here. Lowercasing happens last, so the accented
    Latin capitals in the list are removed while their lowercase forms are
    only removed if listed themselves.

    Returns:
        The cleaned, lowercased string.
    """
    # Every character below is turned into a space (one row per original group).
    symbols = (
        ',.":)(-!?|;\'$&/[]>%=#'
        '*+\\•~@£·_{}©^®`<→°€™›'
        '♥←×§″′Â█½à…'
        '“★”–●â►−¢²¬░¶↑±¿▾═¦║―'
        '¥▓—‹─'
        '▒：¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲è¸'
        '¾Ã⋅‘∞'
        '∙）↓、│（»«，♪╩╚³・╦╣╔╗▬❤ïØ'
        '¹≤‡√'
    )

    # Single-pass translation table: symbol -> " "
    to_space = str.maketrans(dict.fromkeys(symbols, " "))

    return str(x).translate(to_space).lower()

In [None]:
# First cleaning pass: blank out symbols and lowercase every raw text.
df["cleanText"] = df["text"].apply(clean_data)

In [None]:
def preprocess(text):
    """Strip leftover symbols from ``text``, squeeze spaces, trim and lowercase.

    Any character that is not a word character, whitespace or an apostrophe
    becomes a space; consecutive space characters are then merged into one.
    Only the space character is squeezed, so tabs and newlines inside the text
    are kept as they are (the final ``strip()`` only trims the ends).

    Returns:
        The normalized, lowercased string.
    """
    symbol_pattern = re.compile(r'[^\w\s\']')
    space_run_pattern = re.compile(' +')

    without_symbols = symbol_pattern.sub(' ', text)
    single_spaced = space_run_pattern.sub(' ', without_symbols)
    return single_spaced.strip().lower()

In [None]:
# Second cleaning pass: drop remaining symbols and squeeze repeated spaces.
df['cleanText'] = df['cleanText'].map(preprocess)

In [None]:
# Spot-check the cleaned text of the row labelled 1.
df["cleanText"][1]

'δάνεια με επιτόκιο 0 35 για μικρές επιχειρήσεις'

In [None]:
# Parse every cleaned text with spaCy. nlp.pipe() processes the texts as a
# batched stream, which is considerably faster than calling nlp() once per row,
# and yields the Doc objects in input order, so they line up with the rows.
df["cleanText"] = list(nlp.pipe(df["cleanText"]))

In [None]:
# Inspect the frame: cleanText now holds the spaCy Doc objects produced above.
df

In [None]:
def preprocess_nlp(nlp_text):
    """Return the lemmas of a parsed spaCy document as one space-joined string.

    Tokens whose coarse part-of-speech tag is "SPACE", "PUNCT" or "X" are
    skipped; every other token contributes its ``lemma_``. No additional
    lowercasing is applied here.

    Raises:
        AssertionError: if ``nlp_text`` is not exactly a ``spacy.tokens.doc.Doc``.
    """
    # Only parsed spaCy documents are accepted
    assert type(nlp_text) == spacy.tokens.doc.Doc

    skipped_pos = ("SPACE", "PUNCT", "X")
    return " ".join(tok.lemma_ for tok in nlp_text if tok.pos_ not in skipped_pos)

In [None]:
# Replace each spaCy Doc with its space-joined lemmas (see preprocess_nlp).
df["cleanText"] = df["cleanText"].apply(preprocess_nlp)

In [None]:
# Inspect the frame: cleanText is a plain string of lemmas again.
df

Unnamed: 0,text,label,cleanText
0,Μέσα σε 27 μήνες εκδόθηκαν 570 χιλιάδες εκκρεμ...,0,μέσα σε 27 μήνας εκδόθηκαν 570 χιλιάδα εκκρεμή...
1,"«Δάνεια με επιτόκιο 0,35% για μικρές επιχειρήσ...",0,δάνεια με επιτόκιος 0 35 για μικρός επιχείρηση
2,ΟΙ 234 ΝΕΟΙ ΥΠΟΨΗΦΙΟΙ»,0,ο 234 νεοι υποψηφιος
3,«ΤΑ ΠΑΙΔΙΑ ΤΗΣ ΜΕΓΑΛΗΣ ΦΥΓΗΣ»,0,ο παιδια ο μεγαλη φυγη
4,««ΜΑΣΤΡΟΠΟΣ Ο ΑΝΗΨΙΟΣ Πορνείο ανηλίκων το Μαξί...,0,μαστροπος ο ανηψιος πορνείο ανηλίκων ο μαξίμου
...,...,...,...
9998,"η μονη Δευτερα που αγαπησα στη ζωη μου,ηταν η ...",1,ο μονη δευτερα που αγαπησα σε ο ζωη μου ηταν ο...
9999,Εχω μαύρο μάτι στο καράτε.,1,εχω μαύρος μάτι σε ο καράτε
10000,με τσίμπησε μέλισσα και μου κεντρισε το ενδιαφ...,1,με τσίμπησε μέλισσα και μου κεντρισε ο ενδιαφέρον
10001,Που θα μείνουμε; -Στα λόγια.,1,που θα μείνω σε ο λόγος


# **Pipeline**

In [None]:
from sklearn.pipeline import Pipeline

# **TF-IDF Vectorizer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# **10-fold cross-validation technique**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# **TF-IDF Feature with BernoulliNB**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB

# TF-IDF features feeding a Bernoulli Naive Bayes classifier.
# NOTE: BernoulliNB binarizes its input (default binarize=0.0), so the TF-IDF
# weights only act as term presence/absence for this model.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('Bernoulli NB', BernoulliNB())
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.906093906093906
Accuracy for this fold is:  0.8901098901098901
Accuracy for this fold is:  0.916083916083916
Accuracy for this fold is:  0.898
Accuracy for this fold is:  0.92
Accuracy for this fold is:  0.915
Accuracy for this fold is:  0.893
Accuracy for this fold is:  0.899
Accuracy for this fold is:  0.882
Accuracy for this fold is:  0.896
 Mean accuracy over all folds is:  0.9015287712287712

F1 for this fold is:  0.8991416309012876
F1 for this fold is:  0.8834745762711864
F1 for this fold is:  0.9115789473684212
F1 for this fold is:  0.8903225806451613
F1 for this fold is:  0.9164926931106472
F1 for this fold is:  0.911550468262227
F1 for this fold is:  0.8895768833849329
F1 for this fold is:  0.8997020854021847
F1 for this fold is:  0.8773388773388774
F1 for this fold is:  0.8943089430894309
 Mean F1 over all folds is:  0.8973487685774357

Recall for this fold is:  0.8729166666666667
Recall for this fold is:  0.8475609756097561
Recall for this fold 

# **TF-IDF Feature with MultinomialNB**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features feeding a Multinomial Naive Bayes classifier.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('Multi NB', MultinomialNB())
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8661338661338661
Accuracy for this fold is:  0.8641358641358642
Accuracy for this fold is:  0.8631368631368631
Accuracy for this fold is:  0.855
Accuracy for this fold is:  0.876
Accuracy for this fold is:  0.871
Accuracy for this fold is:  0.872
Accuracy for this fold is:  0.893
Accuracy for this fold is:  0.859
Accuracy for this fold is:  0.862
 Mean accuracy over all folds is:  0.8681406593406594

F1 for this fold is:  0.8738229755178908
F1 for this fold is:  0.8731343283582088
F1 for this fold is:  0.872794800371402
F1 for this fold is:  0.8638497652582158
F1 for this fold is:  0.8862385321100918
F1 for this fold is:  0.8817598533455546
F1 for this fold is:  0.882998171846435
F1 for this fold is:  0.9050576752440107
F1 for this fold is:  0.8733153638814016
F1 for this fold is:  0.8758992805755397
 Mean F1 over all folds is:  0.8788870746508751

Recall for this fold is:  0.9666666666666667
Recall for this fold is:  0.9512195121951219
Recall for this fol

# **TF-IDF Feature with Random Forest (RF)**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# TF-IDF features feeding a random forest of 100 depth-limited trees
# (seeded, so repeated runs build the same forest).
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('Random Forest RF', RandomForestClassifier(n_estimators=100, max_depth=10,
                                                 random_state=0))
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8421578421578422
Accuracy for this fold is:  0.8131868131868132
Accuracy for this fold is:  0.8331668331668332
Accuracy for this fold is:  0.825
Accuracy for this fold is:  0.832
Accuracy for this fold is:  0.803
Accuracy for this fold is:  0.797
Accuracy for this fold is:  0.793
Accuracy for this fold is:  0.801
Accuracy for this fold is:  0.8
 Mean accuracy over all folds is:  0.8139511488511489

F1 for this fold is:  0.8244444444444445
F1 for this fold is:  0.7969598262757872
F1 for this fold is:  0.8213903743315508
F1 for this fold is:  0.8116254036598493
F1 for this fold is:  0.8197424892703863
F1 for this fold is:  0.7897545357524012
F1 for this fold is:  0.786540483701367
F1 for this fold is:  0.7823343848580442
F1 for this fold is:  0.7848648648648647
F1 for this fold is:  0.7858672376873662
 Mean F1 over all folds is:  0.8003524044846062

Recall for this fold is:  0.7729166666666667
Recall for this fold is:  0.7459349593495935
Recall for this fold

# **TF-IDF Feature with SVM with rbf Kernel**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

# TF-IDF features feeding a support vector classifier with SVC's default
# (RBF) kernel.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('SVM rbf', SVC())
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.9000999000999002
Accuracy for this fold is:  0.8961038961038961
Accuracy for this fold is:  0.8821178821178821
Accuracy for this fold is:  0.88
Accuracy for this fold is:  0.903
Accuracy for this fold is:  0.893
Accuracy for this fold is:  0.866
Accuracy for this fold is:  0.899
Accuracy for this fold is:  0.88
Accuracy for this fold is:  0.88
 Mean accuracy over all folds is:  0.8879321678321679

F1 for this fold is:  0.8977505112474438
F1 for this fold is:  0.8953722334004024
F1 for this fold is:  0.8817635270541081
F1 for this fold is:  0.8778004073319756
F1 for this fold is:  0.9046214355948868
F1 for this fold is:  0.8945812807881773
F1 for this fold is:  0.8701550387596899
F1 for this fold is:  0.9049858889934148
F1 for this fold is:  0.8828125000000001
F1 for this fold is:  0.8832684824902723
 Mean F1 over all folds is:  0.8893111305660373

Recall for this fold is:  0.9145833333333333
Recall for this fold is:  0.9044715447154471
Recall for this fold

# **TF-IDF Feature with SVM with linear Kernel**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

# TF-IDF features feeding a support vector classifier with a linear kernel.
# The step is labelled 'SVM linear' to match the kernel actually used, and no
# gamma is passed because the linear kernel does not use it.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('SVM linear', SVC(kernel='linear'))
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8971028971028971
Accuracy for this fold is:  0.8921078921078921
Accuracy for this fold is:  0.8901098901098901
Accuracy for this fold is:  0.89
Accuracy for this fold is:  0.907
Accuracy for this fold is:  0.895
Accuracy for this fold is:  0.882
Accuracy for this fold is:  0.907
Accuracy for this fold is:  0.887
Accuracy for this fold is:  0.886
 Mean accuracy over all folds is:  0.893332067932068

F1 for this fold is:  0.8950050968399592
F1 for this fold is:  0.8922155688622755
F1 for this fold is:  0.8906560636182902
F1 for this fold is:  0.8884381338742394
F1 for this fold is:  0.9097963142580019
F1 for this fold is:  0.8973607038123166
F1 for this fold is:  0.8854368932038836
F1 for this fold is:  0.9130028063610851
F1 for this fold is:  0.8918660287081339
F1 for this fold is:  0.8901734104046243
 Mean F1 over all folds is:  0.895395101994281

Recall for this fold is:  0.9145833333333333
Recall for this fold is:  0.9085365853658537
Recall for this fold

# **TF-IDF Feature with K-Nearest Neighbors (KNN) Classifier**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier

# TF-IDF features feeding a 20-nearest-neighbours classifier that compares
# documents by Euclidean distance.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('KNeighbors', KNeighborsClassifier(n_neighbors=20, metric='euclidean'))
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8481518481518482
Accuracy for this fold is:  0.8111888111888111
Accuracy for this fold is:  0.8231768231768232
Accuracy for this fold is:  0.839
Accuracy for this fold is:  0.844
Accuracy for this fold is:  0.829
Accuracy for this fold is:  0.835
Accuracy for this fold is:  0.822
Accuracy for this fold is:  0.825
Accuracy for this fold is:  0.814
 Mean accuracy over all folds is:  0.8290517482517481

F1 for this fold is:  0.8396624472573839
F1 for this fold is:  0.799575821845175
F1 for this fold is:  0.8184615384615385
F1 for this fold is:  0.8310598111227702
F1 for this fold is:  0.8430583501006036
F1 for this fold is:  0.8246153846153846
F1 for this fold is:  0.8348348348348348
F1 for this fold is:  0.8278529980657641
F1 for this fold is:  0.8251748251748252
F1 for this fold is:  0.8143712574850299
 Mean F1 over all folds is:  0.8258667268963309

Recall for this fold is:  0.8291666666666667
Recall for this fold is:  0.766260162601626
Recall for this fol

# **TF-IDF Feature with Gradient Boosted Decision Trees (GBDT)**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

# TF-IDF features feeding gradient-boosted decision stumps (max_depth=1),
# seeded so repeated runs build the same ensemble.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('GBDT', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                         max_depth=1, random_state=0))
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8371628371628371
Accuracy for this fold is:  0.8131868131868132
Accuracy for this fold is:  0.8401598401598401
Accuracy for this fold is:  0.806
Accuracy for this fold is:  0.833
Accuracy for this fold is:  0.811
Accuracy for this fold is:  0.819
Accuracy for this fold is:  0.804
Accuracy for this fold is:  0.82
Accuracy for this fold is:  0.816
 Mean accuracy over all folds is:  0.8199509490509491

F1 for this fold is:  0.8249194414607949
F1 for this fold is:  0.802534318901795
F1 for this fold is:  0.8322851153039832
F1 for this fold is:  0.7863436123348017
F1 for this fold is:  0.8247639034627493
F1 for this fold is:  0.804953560371517
F1 for this fold is:  0.8169868554095046
F1 for this fold is:  0.804
F1 for this fold is:  0.812889812889813
F1 for this fold is:  0.8163672654690619
 Mean F1 over all folds is:  0.812604388560402

Recall for this fold is:  0.8
Recall for this fold is:  0.7723577235772358
Recall for this fold is:  0.8151950718685832
Recal

# **TF-IDF Feature with Stochastic Gradient Descent (SGD)**

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate

# TF-IDF features feeding a linear model trained with stochastic gradient
# descent (hinge loss, L2 penalty). random_state is fixed because SGD shuffles
# the training data; without a seed every run reports different scores.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('SGD', SGDClassifier(loss="hinge", penalty="l2", max_iter=100,
                           random_state=0))
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8981018981018981
Accuracy for this fold is:  0.8941058941058941
Accuracy for this fold is:  0.8861138861138861
Accuracy for this fold is:  0.89
Accuracy for this fold is:  0.907
Accuracy for this fold is:  0.897
Accuracy for this fold is:  0.884
Accuracy for this fold is:  0.908
Accuracy for this fold is:  0.884
Accuracy for this fold is:  0.887
 Mean accuracy over all folds is:  0.8935321678321678

F1 for this fold is:  0.8952187182095626
F1 for this fold is:  0.8919999999999999
F1 for this fold is:  0.886025768087215
F1 for this fold is:  0.8895643363728469
F1 for this fold is:  0.9059165858389913
F1 for this fold is:  0.9009708737864079
F1 for this fold is:  0.8869395711500975
F1 for this fold is:  0.9130028063610851
F1 for this fold is:  0.8908045977011494
F1 for this fold is:  0.8903846153846156
 Mean F1 over all folds is:  0.8950827872891971

Recall for this fold is:  0.9145833333333333
Recall for this fold is:  0.9065040650406504
Recall for this fol

# **TF-IDF Feature with Decision Trees (DT)**

In [None]:
from sklearn import tree
from sklearn.model_selection import cross_validate

# TF-IDF features feeding a single decision tree capped at depth 30 (seeded).
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('Decision Trees', tree.DecisionTreeClassifier(max_depth=30, random_state=0))
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.7932067932067932
Accuracy for this fold is:  0.7532467532467533
Accuracy for this fold is:  0.8051948051948052
Accuracy for this fold is:  0.789
Accuracy for this fold is:  0.796
Accuracy for this fold is:  0.782
Accuracy for this fold is:  0.758
Accuracy for this fold is:  0.767
Accuracy for this fold is:  0.785
Accuracy for this fold is:  0.768
 Mean accuracy over all folds is:  0.7796648351648352

F1 for this fold is:  0.7722772277227723
F1 for this fold is:  0.7341227125941873
F1 for this fold is:  0.7887323943661972
F1 for this fold is:  0.7713976164680392
F1 for this fold is:  0.7811158798283262
F1 for this fold is:  0.7635574837310195
F1 for this fold is:  0.7479166666666667
F1 for this fold is:  0.7695351137487636
F1 for this fold is:  0.7720042417815483
F1 for this fold is:  0.7531914893617021
 Mean F1 over all folds is:  0.7653850826269223

Recall for this fold is:  0.73125
Recall for this fold is:  0.693089430894309
Recall for this fold is:  0.7

# **TF-IDF Feature with Logistic Regression (LR)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# TF-IDF features feeding a liblinear logistic regression (seeded).
# multi_class is left at its default: the explicit 'ovr' setting is deprecated
# in recent scikit-learn releases and makes no difference for this two-class
# target.
clf = Pipeline([
     ('vectorizer_tfidf', TfidfVectorizer()),
     ('Logistic Regression', LogisticRegression(solver='liblinear',
                                                random_state=0))
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8791208791208791
Accuracy for this fold is:  0.8801198801198801
Accuracy for this fold is:  0.8591408591408591
Accuracy for this fold is:  0.859
Accuracy for this fold is:  0.88
Accuracy for this fold is:  0.88
Accuracy for this fold is:  0.839
Accuracy for this fold is:  0.877
Accuracy for this fold is:  0.852
Accuracy for this fold is:  0.865
 Mean accuracy over all folds is:  0.8670381618381618

F1 for this fold is:  0.8764044943820225
F1 for this fold is:  0.8790322580645161
F1 for this fold is:  0.8594217347956131
F1 for this fold is:  0.8568527918781725
F1 for this fold is:  0.8825831702544031
F1 for this fold is:  0.8821218074656189
F1 for this fold is:  0.8462273161413563
F1 for this fold is:  0.8834123222748815
F1 for this fold is:  0.8560311284046693
F1 for this fold is:  0.8693126815101645
 Mean F1 over all folds is:  0.8691399705171419

Recall for this fold is:  0.89375
Recall for this fold is:  0.8861788617886179
Recall for this fold is:  0.88

# **Count Vectorizer for BOW and n-grams**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# **Bag-Of-Words Feature with BernoulliNB**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB

# Bag-of-words (unigram counts) feeding a Bernoulli Naive Bayes classifier.
clf = Pipeline([
     # ngram_range=(1, 1) keeps single words only
     ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),
     ('Bernoulli NB', BernoulliNB())
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.906093906093906
Accuracy for this fold is:  0.8901098901098901
Accuracy for this fold is:  0.916083916083916
Accuracy for this fold is:  0.898
Accuracy for this fold is:  0.92
Accuracy for this fold is:  0.915
Accuracy for this fold is:  0.893
Accuracy for this fold is:  0.899
Accuracy for this fold is:  0.882
Accuracy for this fold is:  0.896
 Mean accuracy over all folds is:  0.9015287712287712

F1 for this fold is:  0.8991416309012876
F1 for this fold is:  0.8834745762711864
F1 for this fold is:  0.9115789473684212
F1 for this fold is:  0.8903225806451613
F1 for this fold is:  0.9164926931106472
F1 for this fold is:  0.911550468262227
F1 for this fold is:  0.8895768833849329
F1 for this fold is:  0.8997020854021847
F1 for this fold is:  0.8773388773388774
F1 for this fold is:  0.8943089430894309
 Mean F1 over all folds is:  0.8973487685774357

Recall for this fold is:  0.8729166666666667
Recall for this fold is:  0.8475609756097561
Recall for this fold 

# **Bag-Of-Words Feature with MultinomialNB**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words (unigram counts) feeding a Multinomial Naive Bayes classifier.
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range=(1, 1))),
     ('Multi NB', MultinomialNB())
])

# 10 shuffled folds with a fixed seed so the split is reproducible
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Evaluate all four metrics in one cross-validation run: every fold's pipeline
# is fitted once and scored on each metric, rather than refitting per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=['accuracy', 'f1', 'recall', 'precision'],
                            cv=folds, n_jobs=-1)

# (label used in the per-fold line, label used in the mean line, scorer name)
metrics = [('Accuracy', 'accuracy', 'accuracy'),
           ('F1', 'F1', 'f1'),
           ('Recall', 'recall', 'recall'),
           ('Precision', 'precision', 'precision')]

for i, (fold_label, mean_label, scorer) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank line between metric sections (none after the last one)
    if i < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8641358641358642
Accuracy for this fold is:  0.8501498501498501
Accuracy for this fold is:  0.8641358641358642
Accuracy for this fold is:  0.859
Accuracy for this fold is:  0.879
Accuracy for this fold is:  0.87
Accuracy for this fold is:  0.867
Accuracy for this fold is:  0.89
Accuracy for this fold is:  0.857
Accuracy for this fold is:  0.861
 Mean accuracy over all folds is:  0.866142157842158

F1 for this fold is:  0.871939736346516
F1 for this fold is:  0.8592870544090057
F1 for this fold is:  0.8726591760299626
F1 for this fold is:  0.8676056338028169
F1 for this fold is:  0.8882733148661126
F1 for this fold is:  0.8809523809523809
F1 for this fold is:  0.8780934922089827
F1 for this fold is:  0.9023090586145649
F1 for this fold is:  0.8708220415537489
F1 for this fold is:  0.8751123090745732
 Mean F1 over all folds is:  0.8767054197858665

Recall for this fold is:  0.9645833333333333
Recall for this fold is:  0.9308943089430894
Recall for this fold 

# **Bag-Of-Words Feature with Random Forest (RF)**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Bag-of-words (unigram counts) -> random forest of 100 trees, each capped at
# depth 10; random_state is fixed so the forest is reproducible.
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),
     ('Random Forest RF', RandomForestClassifier(n_estimators=100, max_depth=10,
                                                 random_state=0))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.8371628371628371
Accuracy for this fold is:  0.8131868131868132
Accuracy for this fold is:  0.8351648351648352
Accuracy for this fold is:  0.829
Accuracy for this fold is:  0.837
Accuracy for this fold is:  0.805
Accuracy for this fold is:  0.806
Accuracy for this fold is:  0.795
Accuracy for this fold is:  0.804
Accuracy for this fold is:  0.806
 Mean accuracy over all folds is:  0.8167514485514484

F1 for this fold is:  0.8174692049272115
F1 for this fold is:  0.8008519701810436
F1 for this fold is:  0.8216216216216217
F1 for this fold is:  0.8178913738019169
F1 for this fold is:  0.8256684491978611
F1 for this fold is:  0.7932131495227996
F1 for this fold is:  0.7979166666666666
F1 for this fold is:  0.786235662148071
F1 for this fold is:  0.7896995708154506
F1 for this fold is:  0.7957894736842105
 Mean F1 over all folds is:  0.8046357142566853

Recall for this fold is:  0.7604166666666666
Recall for this fold is:  0.7642276422764228
Recall for this fo

# **Bag-Of-Words Feature with SVM with rbf Kernel**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

# Bag-of-words (unigram counts) -> SVC with its default settings
# (the default kernel is RBF).
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),
     ('SVM rbf', SVC())
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.8861138861138861
Accuracy for this fold is:  0.8751248751248751
Accuracy for this fold is:  0.8771228771228772
Accuracy for this fold is:  0.878
Accuracy for this fold is:  0.89
Accuracy for this fold is:  0.875
Accuracy for this fold is:  0.866
Accuracy for this fold is:  0.877
Accuracy for this fold is:  0.864
Accuracy for this fold is:  0.865
 Mean accuracy over all folds is:  0.8753361638361637

F1 for this fold is:  0.8774193548387097
F1 for this fold is:  0.869109947643979
F1 for this fold is:  0.8717413972888426
F1 for this fold is:  0.8690987124463521
F1 for this fold is:  0.8861283643892339
F1 for this fold is:  0.8701973001038422
F1 for this fold is:  0.8657314629258516
F1 for this fold is:  0.8802336903602727
F1 for this fold is:  0.8606557377049181
F1 for this fold is:  0.8651348651348651
 Mean F1 over all folds is:  0.8715450832836866

Recall for this fold is:  0.85
Recall for this fold is:  0.8434959349593496
Recall for this fold is:  0.85831

# **Bag-Of-Words Feature with SVM with linear Kernel**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

# Bag-of-words (unigram counts) -> SVC with a linear kernel.
# The step is labelled 'SVM linear' to match the kernel actually used, and no
# gamma is passed because gamma only applies to the rbf/poly/sigmoid kernels.
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),
     ('SVM linear', SVC(kernel='linear'))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.9010989010989011
Accuracy for this fold is:  0.8781218781218781
Accuracy for this fold is:  0.8911088911088911
Accuracy for this fold is:  0.892
Accuracy for this fold is:  0.892
Accuracy for this fold is:  0.901
Accuracy for this fold is:  0.883
Accuracy for this fold is:  0.894
Accuracy for this fold is:  0.889
Accuracy for this fold is:  0.879
 Mean accuracy over all folds is:  0.890032967032967

F1 for this fold is:  0.8943436499466382
F1 for this fold is:  0.8718487394957983
F1 for this fold is:  0.886576482830385
F1 for this fold is:  0.8836206896551725
F1 for this fold is:  0.8891170431211498
F1 for this fold is:  0.898876404494382
F1 for this fold is:  0.8792569659442723
F1 for this fold is:  0.8964843749999999
F1 for this fold is:  0.8873096446700508
F1 for this fold is:  0.878147029204431
 Mean F1 over all folds is:  0.886558102436228

Recall for this fold is:  0.8729166666666667
Recall for this fold is:  0.8434959349593496
Recall for this fold i

# **Bag-Of-Words Feature with KNeighbors Classifier**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier

# Bag-of-words (unigram counts) -> 5-nearest-neighbours with Euclidean distance.
# NOTE(review): the recorded output for this cell is close to chance
# (accuracy ~0.53, F1 ~0.12). Euclidean distance on raw term counts may be a
# poor fit here; a cosine metric or TF-IDF features could be worth trying -
# to be confirmed experimentally.
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),
     ('KNeighbors', KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean'))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.5524475524475524
Accuracy for this fold is:  0.5304695304695305
Accuracy for this fold is:  0.5554445554445554
Accuracy for this fold is:  0.546
Accuracy for this fold is:  0.536
Accuracy for this fold is:  0.531
Accuracy for this fold is:  0.525
Accuracy for this fold is:  0.504
Accuracy for this fold is:  0.525
Accuracy for this fold is:  0.513
 Mean accuracy over all folds is:  0.5318361638361638

F1 for this fold is:  0.12840466926070038
F1 for this fold is:  0.09266409266409267
F1 for this fold is:  0.16822429906542055
F1 for this fold is:  0.11673151750972764
F1 for this fold is:  0.12452830188679245
F1 for this fold is:  0.12336448598130842
F1 for this fold is:  0.12199630314232904
F1 for this fold is:  0.1298245614035088
F1 for this fold is:  0.1154562383612663
F1 for this fold is:  0.11934900542495479
 Mean F1 over all folds is:  0.12405434747001012

Recall for this fold is:  0.06875
Recall for this fold is:  0.04878048780487805
Recall for this fo

# **Bag-Of-Words Feature with Gradient Boosted Decision Trees (GBDT)**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

# Bag-of-words (unigram counts) -> gradient boosting with 100 depth-1 trees
# (decision stumps) and learning rate 1.0; random_state is fixed for
# reproducibility.
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),
     ('GBDT', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                         max_depth=1, random_state=0))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.8361638361638362
Accuracy for this fold is:  0.8341658341658341
Accuracy for this fold is:  0.8351648351648352
Accuracy for this fold is:  0.825
Accuracy for this fold is:  0.834
Accuracy for this fold is:  0.801
Accuracy for this fold is:  0.815
Accuracy for this fold is:  0.818
Accuracy for this fold is:  0.804
Accuracy for this fold is:  0.824
 Mean accuracy over all folds is:  0.8226494505494506

F1 for this fold is:  0.8244111349036403
F1 for this fold is:  0.8245243128964059
F1 for this fold is:  0.8261327713382508
F1 for this fold is:  0.81203007518797
F1 for this fold is:  0.8267223382045928
F1 for this fold is:  0.7924921793534933
F1 for this fold is:  0.8133198789101915
F1 for this fold is:  0.8205128205128206
F1 for this fold is:  0.7971014492753623
F1 for this fold is:  0.8225806451612904
 Mean F1 over all folds is:  0.8159827605744019

Recall for this fold is:  0.8020833333333334
Recall for this fold is:  0.7926829268292683
Recall for this fol

# **Bag-Of-Words Feature with Stochastic Gradient Descent (SGD) Classifier**

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate

# Bag-of-words (unigram counts) -> linear model trained with SGD
# (hinge loss + L2 penalty, at most 100 epochs).
# random_state is fixed because SGD shuffles the training data each epoch;
# without a seed the scores differ from run to run.
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),
     ('SGD', SGDClassifier(loss="hinge", penalty="l2", max_iter=100,
                           random_state=0))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.906093906093906
Accuracy for this fold is:  0.8891108891108891
Accuracy for this fold is:  0.8891108891108891
Accuracy for this fold is:  0.893
Accuracy for this fold is:  0.89
Accuracy for this fold is:  0.896
Accuracy for this fold is:  0.886
Accuracy for this fold is:  0.889
Accuracy for this fold is:  0.885
Accuracy for this fold is:  0.885
 Mean accuracy over all folds is:  0.8908315684315685

F1 for this fold is:  0.8964781216648879
F1 for this fold is:  0.8763102725366877
F1 for this fold is:  0.8916929547844376
F1 for this fold is:  0.8864864864864864
F1 for this fold is:  0.8923076923076922
F1 for this fold is:  0.8948453608247423
F1 for this fold is:  0.8811475409836065
F1 for this fold is:  0.8983543078412392
F1 for this fold is:  0.8818737270875763
F1 for this fold is:  0.8812877263581489
 Mean F1 over all folds is:  0.8880784190875506

Recall for this fold is:  0.8604166666666667
Recall for this fold is:  0.8495934959349594
Recall for this fol

# **Bag-Of-Words Feature with Decision Trees (DT)**

In [None]:
from sklearn import tree
from sklearn.model_selection import cross_validate

# Bag-of-words (unigram counts) -> single decision tree capped at depth 30;
# random_state is fixed so the tree is reproducible.
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),
     ('Decision Trees', tree.DecisionTreeClassifier(max_depth=30, random_state=0))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.8181818181818182
Accuracy for this fold is:  0.8011988011988012
Accuracy for this fold is:  0.8181818181818182
Accuracy for this fold is:  0.788
Accuracy for this fold is:  0.829
Accuracy for this fold is:  0.799
Accuracy for this fold is:  0.768
Accuracy for this fold is:  0.798
Accuracy for this fold is:  0.792
Accuracy for this fold is:  0.791
 Mean accuracy over all folds is:  0.8002562437562437

F1 for this fold is:  0.8108108108108107
F1 for this fold is:  0.7907465825446899
F1 for this fold is:  0.8051391862955032
F1 for this fold is:  0.7690631808278868
F1 for this fold is:  0.8190476190476191
F1 for this fold is:  0.7881981032665965
F1 for this fold is:  0.7547568710359408
F1 for this fold is:  0.8007889546351086
F1 for this fold is:  0.7815126050420168
F1 for this fold is:  0.7838676318510858
 Mean F1 over all folds is:  0.7903931545357258

Recall for this fold is:  0.8125
Recall for this fold is:  0.7642276422764228
Recall for this fold is:  0.7

# **Bag-Of-Words Feature with Logistic Regression (LR)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Bag-of-words (unigram counts) -> logistic regression with the liblinear
# solver. multi_class is not passed: the 'f1'/'recall'/'precision' scorers
# used below require a binary target, for which the option has no effect, and
# the parameter is deprecated in recent scikit-learn releases.
clf = Pipeline([
     ('vectorizer_bow', CountVectorizer(ngram_range = (1, 1))),
     ('Logistic Regression', LogisticRegression(solver='liblinear',
                                                random_state=0))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.9100899100899101
Accuracy for this fold is:  0.8891108891108891
Accuracy for this fold is:  0.8931068931068931
Accuracy for this fold is:  0.891
Accuracy for this fold is:  0.893
Accuracy for this fold is:  0.894
Accuracy for this fold is:  0.882
Accuracy for this fold is:  0.902
Accuracy for this fold is:  0.872
Accuracy for this fold is:  0.879
 Mean accuracy over all folds is:  0.8905307692307692

F1 for this fold is:  0.9038461538461539
F1 for this fold is:  0.8835257082896117
F1 for this fold is:  0.8898043254376932
F1 for this fold is:  0.8824163969795037
F1 for this fold is:  0.8898043254376932
F1 for this fold is:  0.8918367346938776
F1 for this fold is:  0.8812877263581488
F1 for this fold is:  0.904109589041096
F1 for this fold is:  0.8699186991869918
F1 for this fold is:  0.8788788788788789
 Mean F1 over all folds is:  0.8875428538149649

Recall for this fold is:  0.88125
Recall for this fold is:  0.8556910569105691
Recall for this fold is:  0.8

# **Unigram + Bigram Features with BernoulliNB**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB

# Unigram + bigram counts (ngram_range=(1, 2)) -> Bernoulli naive Bayes
# with default settings.
clf = Pipeline([
     ('vectorizer_1_2_gram', CountVectorizer(ngram_range = (1, 2))),
     ('Bernoulli NB', BernoulliNB())
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.8661338661338661
Accuracy for this fold is:  0.8491508491508492
Accuracy for this fold is:  0.8571428571428571
Accuracy for this fold is:  0.868
Accuracy for this fold is:  0.857
Accuracy for this fold is:  0.851
Accuracy for this fold is:  0.849
Accuracy for this fold is:  0.824
Accuracy for this fold is:  0.839
Accuracy for this fold is:  0.841
 Mean accuracy over all folds is:  0.8501427572427571

F1 for this fold is:  0.8423529411764705
F1 for this fold is:  0.8225616921269097
F1 for this fold is:  0.8331388564760793
F1 for this fold is:  0.8450704225352111
F1 for this fold is:  0.8339140534262485
F1 for this fold is:  0.8301026225769669
F1 for this fold is:  0.8293785310734463
F1 for this fold is:  0.8061674008810573
F1 for this fold is:  0.8155784650630011
F1 for this fold is:  0.8215488215488215
 Mean F1 over all folds is:  0.8279813806884212

Recall for this fold is:  0.7458333333333333
Recall for this fold is:  0.7113821138211383
Recall for this f

# **Unigram + Bigram Features with MultinomialNB**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB

# Unigram + bigram counts (ngram_range=(1, 2)) -> multinomial naive Bayes
# with default settings.
clf = Pipeline([
     ('vectorizer_1_2_gram', CountVectorizer(ngram_range = (1, 2))),
     ('Multi NB', MultinomialNB())
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.8681318681318682
Accuracy for this fold is:  0.8611388611388612
Accuracy for this fold is:  0.8601398601398601
Accuracy for this fold is:  0.858
Accuracy for this fold is:  0.876
Accuracy for this fold is:  0.869
Accuracy for this fold is:  0.86
Accuracy for this fold is:  0.885
Accuracy for this fold is:  0.857
Accuracy for this fold is:  0.858
 Mean accuracy over all folds is:  0.865241058941059

F1 for this fold is:  0.8768656716417911
F1 for this fold is:  0.8723599632690542
F1 for this fold is:  0.8696461824953445
F1 for this fold is:  0.8672897196261681
F1 for this fold is:  0.8870673952641166
F1 for this fold is:  0.8808007279344859
F1 for this fold is:  0.8747763864042936
F1 for this fold is:  0.8992112182296231
F1 for this fold is:  0.8719785138764548
F1 for this fold is:  0.8749999999999999
 Mean F1 over all folds is:  0.8774995778741331

Recall for this fold is:  0.9791666666666666
Recall for this fold is:  0.9654471544715447
Recall for this fol

# **Unigram + Bigram Features with Random Forest (RF)**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Unigram + bigram counts (ngram_range=(1, 2)) -> random forest of 100 trees,
# each capped at depth 10; random_state is fixed so the forest is reproducible.
clf = Pipeline([
     ('vectorizer_1_2_gram', CountVectorizer(ngram_range = (1, 2))),
     ('Random Forest RF', RandomForestClassifier(n_estimators=100, max_depth=10,
                                                 random_state=0))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.8451548451548452
Accuracy for this fold is:  0.8091908091908092
Accuracy for this fold is:  0.8161838161838162
Accuracy for this fold is:  0.821
Accuracy for this fold is:  0.813
Accuracy for this fold is:  0.804
Accuracy for this fold is:  0.805
Accuracy for this fold is:  0.785
Accuracy for this fold is:  0.811
Accuracy for this fold is:  0.802
 Mean accuracy over all folds is:  0.811152947052947

F1 for this fold is:  0.8294829482948296
F1 for this fold is:  0.7884828349944629
F1 for this fold is:  0.7964601769911505
F1 for this fold is:  0.8043715846994536
F1 for this fold is:  0.7891770011273956
F1 for this fold is:  0.7864923747276688
F1 for this fold is:  0.7905477980665951
F1 for this fold is:  0.762954796030871
F1 for this fold is:  0.792535675082327
F1 for this fold is:  0.7838427947598253
 Mean F1 over all folds is:  0.7924347984774578

Recall for this fold is:  0.7854166666666667
Recall for this fold is:  0.7235772357723578
Recall for this fold

# **Unigram + Bigram Features with SVM with rbf Kernel**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

# Unigram + bigram counts (ngram_range=(1, 2)) -> SVC with its default
# settings (the default kernel is RBF).
clf = Pipeline([
     ('vectorizer_1_2_gram', CountVectorizer(ngram_range = (1, 2))),
     ('SVM rbf', SVC())
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.8731268731268731
Accuracy for this fold is:  0.8441558441558441
Accuracy for this fold is:  0.8641358641358642
Accuracy for this fold is:  0.853
Accuracy for this fold is:  0.858
Accuracy for this fold is:  0.846
Accuracy for this fold is:  0.836
Accuracy for this fold is:  0.835
Accuracy for this fold is:  0.834
Accuracy for this fold is:  0.836
 Mean accuracy over all folds is:  0.8479418581418582

F1 for this fold is:  0.8571428571428571
F1 for this fold is:  0.8247191011235955
F1 for this fold is:  0.8512035010940918
F1 for this fold is:  0.8327645051194539
F1 for this fold is:  0.8429203539823009
F1 for this fold is:  0.830396475770925
F1 for this fold is:  0.8251599147121536
F1 for this fold is:  0.8279457768508864
F1 for this fold is:  0.8191721132897603
F1 for this fold is:  0.8236559139784946
 Mean F1 over all folds is:  0.8335080513064519

Recall for this fold is:  0.79375
Recall for this fold is:  0.7459349593495935
Recall for this fold is:  0.7

# **Unigram + Bigram Features with SVM with linear Kernel**

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

# Unigram + bigram counts (ngram_range=(1, 2)) -> SVC with a linear kernel.
# The step is labelled 'SVM linear' to match the kernel actually used, and no
# gamma is passed because gamma only applies to the rbf/poly/sigmoid kernels.
clf = Pipeline([
     ('vectorizer_1_2_gram', CountVectorizer(ngram_range = (1, 2))),
     ('SVM linear', SVC(kernel='linear'))
])

# 10-fold cross-validation with a fixed shuffle so the folds are reproducible.
folds  = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit each fold once and score all four metrics on the same fitted model,
# instead of re-running the whole cross-validation once per metric.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=('accuracy', 'f1', 'recall', 'precision'),
                            cv=folds, n_jobs=-1)

# (scorer name, label used on per-fold lines, label used on the mean line)
metrics = [('accuracy', 'Accuracy', 'accuracy'),
           ('f1', 'F1', 'F1'),
           ('recall', 'Recall', 'recall'),
           ('precision', 'Precision', 'precision')]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    if position:
        print()  # blank line between metric sections
    # After the loop `scores` holds the per-fold precision values.
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

Accuracy for this fold is:  0.9040959040959041
Accuracy for this fold is:  0.8901098901098901
Accuracy for this fold is:  0.9000999000999002
Accuracy for this fold is:  0.906
Accuracy for this fold is:  0.902
Accuracy for this fold is:  0.897
Accuracy for this fold is:  0.887
Accuracy for this fold is:  0.893
Accuracy for this fold is:  0.885
Accuracy for this fold is:  0.885
 Mean accuracy over all folds is:  0.8949305694305695

F1 for this fold is:  0.896551724137931
F1 for this fold is:  0.88272921108742
F1 for this fold is:  0.8947368421052632
F1 for this fold is:  0.8969298245614036
F1 for this fold is:  0.8981288981288982
F1 for this fold is:  0.8930425752855659
F1 for this fold is:  0.884102564102564
F1 for this fold is:  0.8947885939036382
F1 for this fold is:  0.8803329864724245
F1 for this fold is:  0.883248730964467
 Mean F1 over all folds is:  0.8904591950749575

Recall for this fold is:  0.8666666666666667
Recall for this fold is:  0.8414634146341463
Recall for this fold i

# **Unigram + Bigram Features with KNeighbors Classifier**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram counts -> 5-nearest-neighbours classifier.
# NOTE(review): the output recorded for this cell shows ~0.50 accuracy and
# F1 around 0.01, i.e. almost nothing is predicted as the positive class.
# Euclidean distance on raw, unnormalised count vectors is presumably the
# cause (cosine distance or TF-IDF weighting may behave better) - confirm.
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('KNeighbors', KNeighborsClassifier(n_neighbors=5, metric='euclidean')),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.5234765234765235
Accuracy for this fold is:  0.5114885114885115
Accuracy for this fold is:  0.5154845154845155
Accuracy for this fold is:  0.519
Accuracy for this fold is:  0.508
Accuracy for this fold is:  0.5
Accuracy for this fold is:  0.496
Accuracy for this fold is:  0.469
Accuracy for this fold is:  0.499
Accuracy for this fold is:  0.483
 Mean accuracy over all folds is:  0.502444955044955

F1 for this fold is:  0.012422360248447204
F1 for this fold is:  0.012121212121212121
F1 for this fold is:  0.012219959266802444
F1 for this fold is:  0.008247422680412371
F1 for this fold is:  0.016
F1 for this fold is:  0.003984063745019919
F1 for this fold is:  0.011764705882352941
F1 for this fold is:  0.0111731843575419
F1 for this fold is:  0.00792079207920792
F1 for this fold is:  0.007677543186180422
 Mean F1 over all folds is:  0.010353124356717725

Recall for this fold is:  0.00625
Recall for this fold is:  0.006097560975609756
Recall for this fold is: 

# **bigrams Feature with Gradient Boosted Decision Trees (GBDT)**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram counts -> gradient-boosted decision stumps
# (100 estimators of depth 1, seeded for reproducibility).
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('GBDT', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                        max_depth=1, random_state=0)),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8361638361638362
Accuracy for this fold is:  0.8331668331668332
Accuracy for this fold is:  0.8371628371628371
Accuracy for this fold is:  0.821
Accuracy for this fold is:  0.834
Accuracy for this fold is:  0.809
Accuracy for this fold is:  0.818
Accuracy for this fold is:  0.821
Accuracy for this fold is:  0.801
Accuracy for this fold is:  0.82
 Mean accuracy over all folds is:  0.8230493506493506

F1 for this fold is:  0.8247863247863247
F1 for this fold is:  0.822529224229543
F1 for this fold is:  0.8289611752360966
F1 for this fold is:  0.8069039913700108
F1 for this fold is:  0.8256302521008403
F1 for this fold is:  0.801661474558671
F1 for this fold is:  0.8154158215010142
F1 for this fold is:  0.8232971372161896
F1 for this fold is:  0.7929240374609782
F1 for this fold is:  0.8185483870967741
 Mean F1 over all folds is:  0.8160657825556443

Recall for this fold is:  0.8041666666666667
Recall for this fold is:  0.7865853658536586
Recall for this fold

# **bigrams Feature with Stochastic Gradient Descent (SGDC)**

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram counts -> linear SVM trained with SGD
# (hinge loss, L2 penalty). random_state is fixed because SGD shuffles the
# training data; without a seed every run produces different scores.
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('SGD', SGDClassifier(loss="hinge", penalty="l2", max_iter=100,
                          random_state=0)),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit, so the
# reported accuracy, F1, recall and precision describe the same models.
# Calling cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8941058941058941
Accuracy for this fold is:  0.8831168831168831
Accuracy for this fold is:  0.8941058941058941
Accuracy for this fold is:  0.893
Accuracy for this fold is:  0.897
Accuracy for this fold is:  0.882
Accuracy for this fold is:  0.87
Accuracy for this fold is:  0.887
Accuracy for this fold is:  0.881
Accuracy for this fold is:  0.868
 Mean accuracy over all folds is:  0.8849328671328672

F1 for this fold is:  0.8862660944206009
F1 for this fold is:  0.8586723768736617
F1 for this fold is:  0.889348500517063
F1 for this fold is:  0.8748639825897715
F1 for this fold is:  0.8985507246376812
F1 for this fold is:  0.8858921161825726
F1 for this fold is:  0.8646464646464646
F1 for this fold is:  0.8912830558276199
F1 for this fold is:  0.8766564729867482
F1 for this fold is:  0.8696537678207739
 Mean F1 over all folds is:  0.8795833556502958

Recall for this fold is:  0.8895833333333333
Recall for this fold is:  0.8353658536585366
Recall for this fol

# **bigrams Feature with Decision Trees (DT)**

In [None]:
from sklearn import tree
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram counts -> decision tree capped at depth 30
# (seeded for reproducibility).
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('Decision Trees', tree.DecisionTreeClassifier(max_depth=30, random_state=0)),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8091908091908092
Accuracy for this fold is:  0.8131868131868132
Accuracy for this fold is:  0.8111888111888111
Accuracy for this fold is:  0.797
Accuracy for this fold is:  0.812
Accuracy for this fold is:  0.803
Accuracy for this fold is:  0.771
Accuracy for this fold is:  0.803
Accuracy for this fold is:  0.792
Accuracy for this fold is:  0.787
 Mean accuracy over all folds is:  0.7998566433566434

F1 for this fold is:  0.7957219251336898
F1 for this fold is:  0.8016967126193001
F1 for this fold is:  0.7982924226254002
F1 for this fold is:  0.7781420765027323
F1 for this fold is:  0.7995735607675907
F1 for this fold is:  0.794577685088634
F1 for this fold is:  0.7556029882604056
F1 for this fold is:  0.805911330049261
F1 for this fold is:  0.7824267782426778
F1 for this fold is:  0.7774294670846394
 Mean F1 over all folds is:  0.788937494637433

Recall for this fold is:  0.775
Recall for this fold is:  0.7682926829268293
Recall for this fold is:  0.76796

# **bigrams Feature with Logistic Regression (LR)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram counts -> logistic regression (liblinear solver).
# multi_class='ovr' is intentionally not passed: the parameter is deprecated
# in recent scikit-learn releases and liblinear already works one-vs-rest,
# so omitting it keeps the same model without the deprecation warning.
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('Logistic Regression', LogisticRegression(solver='liblinear',
                                               random_state=0)),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8991008991008991
Accuracy for this fold is:  0.8961038961038961
Accuracy for this fold is:  0.8861138861138861
Accuracy for this fold is:  0.89
Accuracy for this fold is:  0.892
Accuracy for this fold is:  0.888
Accuracy for this fold is:  0.881
Accuracy for this fold is:  0.89
Accuracy for this fold is:  0.876
Accuracy for this fold is:  0.874
 Mean accuracy over all folds is:  0.8872318681318682

F1 for this fold is:  0.891514500537057
F1 for this fold is:  0.8909853249475892
F1 for this fold is:  0.8807531380753137
F1 for this fold is:  0.8814655172413793
F1 for this fold is:  0.8870292887029289
F1 for this fold is:  0.8840579710144928
F1 for this fold is:  0.8794326241134752
F1 for this fold is:  0.892156862745098
F1 for this fold is:  0.8724279835390946
F1 for this fold is:  0.8722109533468559
 Mean F1 over all folds is:  0.8832034164263286

Recall for this fold is:  0.8645833333333334
Recall for this fold is:  0.8638211382113821
Recall for this fold 


# **trigrams Feature with BernoulliNB**

In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram + trigram counts -> Bernoulli naive Bayes.
# The vectorizer step is named for the (1, 3) n-gram range it actually uses.
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('Bernoulli NB', BernoulliNB()),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.7912087912087912
Accuracy for this fold is:  0.7702297702297702
Accuracy for this fold is:  0.7882117882117882
Accuracy for this fold is:  0.786
Accuracy for this fold is:  0.77
Accuracy for this fold is:  0.757
Accuracy for this fold is:  0.76
Accuracy for this fold is:  0.73
Accuracy for this fold is:  0.768
Accuracy for this fold is:  0.747
 Mean accuracy over all folds is:  0.766765034965035

F1 for this fold is:  0.726797385620915
F1 for this fold is:  0.6973684210526315
F1 for this fold is:  0.7246753246753247
F1 for this fold is:  0.7161803713527851
F1 for this fold is:  0.6981627296587927
F1 for this fold is:  0.6840052015604682
F1 for this fold is:  0.6938775510204082
F1 for this fold is:  0.6641791044776121
F1 for this fold is:  0.7040816326530612
F1 for this fold is:  0.6785260482846253
 Mean F1 over all folds is:  0.6987853770356623

Recall for this fold is:  0.5791666666666667
Recall for this fold is:  0.5386178861788617
Recall for this fold i

# **trigrams Feature with MultinomialNB**

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram + trigram counts -> multinomial naive Bayes.
# The vectorizer step is named for the (1, 3) n-gram range it actually uses.
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('Multi NB', MultinomialNB()),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8691308691308691
Accuracy for this fold is:  0.8551448551448552
Accuracy for this fold is:  0.8591408591408591
Accuracy for this fold is:  0.853
Accuracy for this fold is:  0.872
Accuracy for this fold is:  0.867
Accuracy for this fold is:  0.865
Accuracy for this fold is:  0.886
Accuracy for this fold is:  0.859
Accuracy for this fold is:  0.858
 Mean accuracy over all folds is:  0.8643416583416583

F1 for this fold is:  0.8776844070961718
F1 for this fold is:  0.867579908675799
F1 for this fold is:  0.8688372093023256
F1 for this fold is:  0.8635097493036211
F1 for this fold is:  0.8840579710144928
F1 for this fold is:  0.8794197642792385
F1 for this fold is:  0.8789237668161436
F1 for this fold is:  0.900523560209424
F1 for this fold is:  0.873769024171889
F1 for this fold is:  0.875219683655536
 Mean F1 over all folds is:  0.8769525044524642

Recall for this fold is:  0.9791666666666666
Recall for this fold is:  0.9654471544715447
Recall for this fold 

# **trigrams Feature with Random Forest (RF)**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram + trigram counts -> random forest
# (100 trees, depth capped at 10, seeded for reproducibility).
# The vectorizer step is named for the (1, 3) n-gram range it actually uses.
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('Random Forest RF', RandomForestClassifier(n_estimators=100, max_depth=10,
                                                random_state=0)),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8441558441558441
Accuracy for this fold is:  0.8111888111888111
Accuracy for this fold is:  0.8261738261738262
Accuracy for this fold is:  0.827
Accuracy for this fold is:  0.832
Accuracy for this fold is:  0.804
Accuracy for this fold is:  0.803
Accuracy for this fold is:  0.78
Accuracy for this fold is:  0.808
Accuracy for this fold is:  0.809
 Mean accuracy over all folds is:  0.8144518481518481

F1 for this fold is:  0.8262806236080179
F1 for this fold is:  0.7920792079207921
F1 for this fold is:  0.8075221238938053
F1 for this fold is:  0.8071348940914158
F1 for this fold is:  0.8161925601750548
F1 for this fold is:  0.7831858407079646
F1 for this fold is:  0.7865655471289275
F1 for this fold is:  0.7582417582417583
F1 for this fold is:  0.7926565874730023
F1 for this fold is:  0.7894156560088202
 Mean F1 over all folds is:  0.7959274799249559

Recall for this fold is:  0.7729166666666667
Recall for this fold is:  0.7317073170731707
Recall for this fo

# **trigrams Feature with SVM with rbf Kernel**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram + trigram counts -> SVC with its default
# (RBF) kernel and default hyper-parameters.
# The vectorizer step is named for the (1, 3) n-gram range it actually uses.
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('SVM rbf', SVC()),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8431568431568431
Accuracy for this fold is:  0.8111888111888111
Accuracy for this fold is:  0.8311688311688312
Accuracy for this fold is:  0.818
Accuracy for this fold is:  0.821
Accuracy for this fold is:  0.818
Accuracy for this fold is:  0.799
Accuracy for this fold is:  0.805
Accuracy for this fold is:  0.808
Accuracy for this fold is:  0.803
 Mean accuracy over all folds is:  0.8157514485514487

F1 for this fold is:  0.8128724672228844
F1 for this fold is:  0.7763313609467456
F1 for this fold is:  0.8041714947856315
F1 for this fold is:  0.7780487804878049
F1 for this fold is:  0.7896592244418331
F1 for this fold is:  0.7903225806451613
F1 for this fold is:  0.7728813559322034
F1 for this fold is:  0.7845303867403315
F1 for this fold is:  0.7777777777777777
F1 for this fold is:  0.7779030439684329
 Mean F1 over all folds is:  0.7864498472948807

Recall for this fold is:  0.7104166666666667
Recall for this fold is:  0.6666666666666666
Recall for this f

# **trigrams Feature with SVM with linear Kernel**

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram + trigram counts -> SVC with a linear kernel.
# The steps are named for what they actually contain (a (1, 3) n-gram range
# and a linear kernel). gamma is not passed because it only applies to the
# 'rbf', 'poly' and 'sigmoid' kernels and is ignored by the linear one.
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('SVM linear', SVC(kernel='linear')),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.9020979020979021
Accuracy for this fold is:  0.8831168831168831
Accuracy for this fold is:  0.8901098901098901
Accuracy for this fold is:  0.895
Accuracy for this fold is:  0.892
Accuracy for this fold is:  0.891
Accuracy for this fold is:  0.877
Accuracy for this fold is:  0.886
Accuracy for this fold is:  0.872
Accuracy for this fold is:  0.873
 Mean accuracy over all folds is:  0.8861324675324674

F1 for this fold is:  0.8941684665226782
F1 for this fold is:  0.8740581270182993
F1 for this fold is:  0.8839662447257384
F1 for this fold is:  0.8847420417124039
F1 for this fold is:  0.8863157894736842
F1 for this fold is:  0.886339937434828
F1 for this fold is:  0.874361593462717
F1 for this fold is:  0.8873517786561265
F1 for this fold is:  0.8663883089770356
F1 for this fold is:  0.8697435897435898
 Mean F1 over all folds is:  0.8807435877727101

Recall for this fold is:  0.8625
Recall for this fold is:  0.8252032520325203
Recall for this fold is:  0.860

# **trigrams Feature with KNeighbors Classifier**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram + trigram counts -> 5-nearest-neighbours.
# The vectorizer step is named for the (1, 3) n-gram range it actually uses.
# NOTE(review): the output recorded for this cell shows ~0.50 accuracy with
# F1 and recall at (or near) 0.0, i.e. the positive class is almost never
# predicted. Euclidean distance on raw, unnormalised count vectors is
# presumably the cause (cosine distance or TF-IDF may behave better) - confirm.
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('KNeighbors', KNeighborsClassifier(n_neighbors=5, metric='euclidean')),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.5204795204795205
Accuracy for this fold is:  0.5084915084915085
Accuracy for this fold is:  0.5134865134865135
Accuracy for this fold is:  0.517
Accuracy for this fold is:  0.504
Accuracy for this fold is:  0.499
Accuracy for this fold is:  0.493
Accuracy for this fold is:  0.468
Accuracy for this fold is:  0.498
Accuracy for this fold is:  0.481
 Mean accuracy over all folds is:  0.5002457542457542

F1 for this fold is:  0.0
F1 for this fold is:  0.0
F1 for this fold is:  0.0
F1 for this fold is:  0.0
F1 for this fold is:  0.0
F1 for this fold is:  0.0
F1 for this fold is:  0.0
F1 for this fold is:  0.0037453183520599247
F1 for this fold is:  0.003968253968253968
F1 for this fold is:  0.0
 Mean F1 over all folds is:  0.0007713572320313893

Recall for this fold is:  0.0
Recall for this fold is:  0.0
Recall for this fold is:  0.0
Recall for this fold is:  0.0
Recall for this fold is:  0.0
Recall for this fold is:  0.0
Recall for this fold is:  0.0
Recall fo

# **trigrams Feature with Gradient Boosted Decision Trees (GBDT)**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram + trigram counts -> gradient-boosted decision
# stumps (100 estimators of depth 1, seeded for reproducibility).
# The vectorizer step is named for the (1, 3) n-gram range it actually uses.
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('GBDT', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                        max_depth=1, random_state=0)),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit. Calling
# cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8401598401598401
Accuracy for this fold is:  0.8321678321678322
Accuracy for this fold is:  0.8371628371628371
Accuracy for this fold is:  0.821
Accuracy for this fold is:  0.836
Accuracy for this fold is:  0.804
Accuracy for this fold is:  0.813
Accuracy for this fold is:  0.82
Accuracy for this fold is:  0.799
Accuracy for this fold is:  0.82
 Mean accuracy over all folds is:  0.8222490509490509

F1 for this fold is:  0.8294243070362474
F1 for this fold is:  0.8216560509554142
F1 for this fold is:  0.8289611752360966
F1 for this fold is:  0.8064864864864865
F1 for this fold is:  0.8277310924369748
F1 for this fold is:  0.7962577962577962
F1 for this fold is:  0.8097660223804679
F1 for this fold is:  0.8221343873517787
F1 for this fold is:  0.7904066736183525
F1 for this fold is:  0.8185483870967741
 Mean F1 over all folds is:  0.815137237885639

Recall for this fold is:  0.8104166666666667
Recall for this fold is:  0.7865853658536586
Recall for this fold

# **trigrams Feature with Stochastic Gradient Descent (SGDC)**

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate

# Pipeline: unigram + bigram + trigram counts -> linear SVM trained with SGD
# (hinge loss, L2 penalty). random_state is fixed because SGD shuffles the
# training data; without a seed every run produces different scores.
# The vectorizer step is named for the (1, 3) n-gram range it actually uses.
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('SGD', SGDClassifier(loss="hinge", penalty="l2", max_iter=100,
                          random_state=0)),
])

# 10-fold shuffled cross-validation; the fixed seed keeps the folds reproducible.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# Fit once per fold and compute all four metrics on that same fit, so the
# reported accuracy, F1, recall and precision describe the same models.
# Calling cross_val_score once per metric would refit the pipeline 40 times.
cv_results = cross_validate(
    clf, df.cleanText, df.label,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    cv=folds, n_jobs=-1,
)

# (scorer name, label for the per-fold lines, label for the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]
    for score in scores:
        print(fold_label + " for this fold is: ", score)
    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))
    # Blank separator between metric groups, but not after the last one.
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8881118881118881
Accuracy for this fold is:  0.8721278721278721
Accuracy for this fold is:  0.8781218781218781
Accuracy for this fold is:  0.881
Accuracy for this fold is:  0.885
Accuracy for this fold is:  0.877
Accuracy for this fold is:  0.855
Accuracy for this fold is:  0.872
Accuracy for this fold is:  0.875
Accuracy for this fold is:  0.863
 Mean accuracy over all folds is:  0.8746361638361637

F1 for this fold is:  0.8834224598930481
F1 for this fold is:  0.8673684210526315
F1 for this fold is:  0.8706624605678234
F1 for this fold is:  0.8737864077669903
F1 for this fold is:  0.8824742268041237
F1 for this fold is:  0.8711018711018711
F1 for this fold is:  0.8580060422960725
F1 for this fold is:  0.8710317460317459
F1 for this fold is:  0.8674203494347378
F1 for this fold is:  0.8612244897959183
 Mean F1 over all folds is:  0.8706498474744964

Recall for this fold is:  0.8583333333333333
Recall for this fold is:  0.823170731707317
Recall for this fo

# **trigrams Feature with Decision Trees (DT)**

In [None]:
from sklearn import tree
from sklearn.model_selection import cross_validate

# Bag-of-n-grams features feeding a depth-limited decision tree.
clf = Pipeline([
    # ngram_range=(1, 3) counts unigrams, bigrams and trigrams, so the step is
    # named accordingly (it was previously labelled "1_2_gram").
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    ('Decision Trees', tree.DecisionTreeClassifier(max_depth=30, random_state=0)),
])

# Shuffled 10-fold split; the fixed seed keeps the folds identical between runs.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# (scorer name, label used in the per-fold line, label used in the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

# One cross-validation pass computes every metric, so each fold is fitted once
# instead of once per metric. Results are stored under 'test_<scorer name>'.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=[scorer for scorer, _, _ in metrics],
                            cv=folds, n_jobs=-1)

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]

    for score in scores:
        print(fold_label + " for this fold is: ", score)

    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

    # Blank line between metric sections, but not after the last one
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8161838161838162
Accuracy for this fold is:  0.8171828171828172
Accuracy for this fold is:  0.8091908091908092
Accuracy for this fold is:  0.796
Accuracy for this fold is:  0.823
Accuracy for this fold is:  0.802
Accuracy for this fold is:  0.774
Accuracy for this fold is:  0.796
Accuracy for this fold is:  0.793
Accuracy for this fold is:  0.789
 Mean accuracy over all folds is:  0.8015557442557444

F1 for this fold is:  0.8034188034188033
F1 for this fold is:  0.8042780748663102
F1 for this fold is:  0.7961579509071505
F1 for this fold is:  0.7768052516411378
F1 for this fold is:  0.8123011664899258
F1 for this fold is:  0.7941787941787942
F1 for this fold is:  0.7616033755274263
F1 for this fold is:  0.8
F1 for this fold is:  0.7836990595611284
F1 for this fold is:  0.7813471502590673
 Mean F1 over all folds is:  0.7913789626849743

Recall for this fold is:  0.7833333333333333
Recall for this fold is:  0.7642276422764228
Recall for this fold is:  0.7659

# **Trigram Features (n-grams of length 1–3) with Logistic Regression (LR)**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Bag-of-n-grams features feeding a liblinear logistic regression.
clf = Pipeline([
    # ngram_range=(1, 3) counts unigrams, bigrams and trigrams, so the step is
    # named accordingly (it was previously labelled "1_2_gram").
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range=(1, 3))),
    # `multi_class='ovr'` is omitted: it is deprecated in recent scikit-learn releases,
    # and with the liblinear solver the default already resolves to one-vs-rest.
    ('Logistic Regression', LogisticRegression(solver='liblinear', random_state=0)),
])

# Shuffled 10-fold split; the fixed seed keeps the folds identical between runs.
folds = KFold(n_splits=10, random_state=42, shuffle=True)

# (scorer name, label used in the per-fold line, label used in the mean line)
metrics = [
    ('accuracy', 'Accuracy', 'accuracy'),
    ('f1', 'F1', 'F1'),
    ('recall', 'Recall', 'recall'),
    ('precision', 'Precision', 'precision'),
]

# One cross-validation pass computes every metric, so each fold is fitted once
# instead of once per metric. Results are stored under 'test_<scorer name>'.
cv_results = cross_validate(clf, df.cleanText, df.label,
                            scoring=[scorer for scorer, _, _ in metrics],
                            cv=folds, n_jobs=-1)

for position, (scorer, fold_label, mean_label) in enumerate(metrics):
    scores = cv_results['test_' + scorer]

    for score in scores:
        print(fold_label + " for this fold is: ", score)

    print(' Mean ' + mean_label + ' over all folds is: ', (np.mean(scores)))

    # Blank line between metric sections, but not after the last one
    if position < len(metrics) - 1:
        print()

Accuracy for this fold is:  0.8971028971028971
Accuracy for this fold is:  0.8881118881118881
Accuracy for this fold is:  0.8791208791208791
Accuracy for this fold is:  0.881
Accuracy for this fold is:  0.889
Accuracy for this fold is:  0.88
Accuracy for this fold is:  0.869
Accuracy for this fold is:  0.88
Accuracy for this fold is:  0.869
Accuracy for this fold is:  0.868
 Mean accuracy over all folds is:  0.8800335664335664

F1 for this fold is:  0.888888888888889
F1 for this fold is:  0.8818565400843882
F1 for this fold is:  0.8732984293193718
F1 for this fold is:  0.8707926167209554
F1 for this fold is:  0.8835257082896117
F1 for this fold is:  0.8755186721991702
F1 for this fold is:  0.8675429726996966
F1 for this fold is:  0.8823529411764707
F1 for this fold is:  0.8648090815273477
F1 for this fold is:  0.8653061224489795
 Mean F1 over all folds is:  0.875389197335488

Recall for this fold is:  0.8583333333333333
Recall for this fold is:  0.8495934959349594
Recall for this fold 