In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import numpy as np
from tqdm import tqdm
import string
import scipy

In [7]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/jimbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jimbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jimbo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def preprocess_sentences(sentences, labels=None, mode=-1, lower=False, remove_stopwords=False, lemmatize=False, remove_punctuation=False):
    sentences = sentences.to_list()
    # Remove stopwords and extra spaces
    if mode == 0:
        for i, sentence in enumerate(sentences):
            for word in sentence.split():
                if word in stop_words:
                    sentence = sentence.replace(word, '')
                else:
                    sentence = sentence.replace(word, word.lower())
            sentence = sentence.replace('  ', ' ')
            sentences[i] = sentence
    # Replace (c), (C), © with COPYRIGHT_SYMBOL then use the word_tokenize function instaed of split + previous
    elif mode == 1:
        for i, sentence in enumerate(sentences):
            sentence = sentence.replace('(c)', 'COPYRIGHT_SYMBOL') # (c) -> '(' 'c' ')
            sentence = sentence.replace('(C)', 'COPYRIGHT_SYMBOL')
            sentence = sentence.replace('©', 'COPYRIGHT_SYMBOL')
            tokens = word_tokenize(sentence)
            if lower:
                tokens = [token.lower() for token in tokens]
            if remove_stopwords:
                tokens = [token for token in tokens if token not in stop_words]
            if remove_punctuation:
                tokens = [token for token in tokens if token not in string.punctuation]
            if lemmatize:
                tokens = [lemmatizer.lemmatize(token) for token in tokens]
            sentences[i] = tokens
    return sentences

In [9]:
data_0 = pd.read_csv('../datasets/fossology-master.csv')
X_0 = data_0["copyright"]
y_0 = data_0["falsePositive"]
X_0 = X_0.drop_duplicates()
y_0 = y_0[X_0.index]

data_1 = pd.read_csv('../datasets/kubernetes-master.csv')
X_1 = data_1["copyright"]
y_1 = data_1["falsePositive"]
X_1 = X_1.drop_duplicates()
y_1 = y_1[X_1.index]

data_2 = pd.read_csv('../datasets/tensorflow-master.csv')
X_2 = data_2["copyright"]
y_2 = data_2["falsePositive"]
X_2 = X_2.drop_duplicates()
y_2 = y_2[X_2.index]

data_3 = pd.read_csv('../datasets/Fossology-Provided-Dataset-1.csv')

X_3 = data_3['scanner_content']
y_3 = data_3['falsePositive']
X_3 = X_3.drop_duplicates()
y_3 = y_3[X_3.index]

X = pd.concat([X_0, X_1, X_2, X_3])
y = pd.concat([y_0, y_1, y_2, y_3])

print('Class 0 Percentage: ', len(y[y == 0]) / len(y))
print('Class 1 Percentage: ', len(y[y == 1]) / len(y))

X_train, X_test, y_train, y_test = train_test_split(X_0, y_0, test_size=0.2, random_state=42)

Class 0 Percentage:  0.7385852090032154
Class 1 Percentage:  0.26141479099678455


In [10]:
def aggregate_reports(reports, print_aggregates=True):
    import pandas as pd
    import numpy as np
    dfs = []
    for metric in ['precision', 'recall', 'f1-score']:
        scores = []
        for report in reports:
            scores.append([report['0'][metric], report['1'][metric]])
        scores = np.array(scores)
        scores = scores[:, :2]
        mean_scores = np.mean(scores, axis=0)
        mean_scores = [f"{score:.6f}" for score in mean_scores]
        df = pd.DataFrame(scores, columns=['0', '1'])
        df.loc['Mean'] = mean_scores
        df['Metric'] = metric
        dfs.append(df)
    if print_aggregates:
        print("## Precision")
        print(dfs[0].to_markdown())
        print("## Recall")
        print(dfs[1].to_markdown())
        print("## F1-score")
        print(dfs[2].to_markdown())
    else:
        return dfs[0], dfs[1], dfs[2]

## Bag Of Words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

X_1_bow = vectorizer.transform(X_1)

X_2_bow = vectorizer.transform(X_2)

X_3_bow = vectorizer.transform(X_3)

X_bow = vectorizer.transform(X)

In [14]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.994346 | 0.947368 | precision |
| 1    | 0.986911 | 0.651282 | precision |
| 2    | 1        | 0.751634 | precision |
| 3    | 1        | 0.908451 | precision |
| 4    | 0.996347 | 0.956042 | precision |
| Mean | 0.995521 | 0.842955 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.980488 | 0.984375 | recall   |
| 1    | 0.847191 | 0.962121 | recall   |
| 2    | 0.716418 | 1        | recall   |
| 3    | 0.978671 | 1        | recall   |
| 4    | 0.983892 | 0.989808 | recall   |
| Mean | 0.901332 | 0.987261 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.987368 | 0.965517 | f1-score |
| 1    | 0.911729 | 0.776758 | f1-score |
| 2    | 0.834783 | 0.858209 | f1-score |
| 3    | 0.989221 | 0.95203  | f1-score |
| 4    | 0.99008  | 0.972632 | f1

In [24]:
# Test out mode 0 only
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(preprocess_sentences(X_train, y_train, mode=0))
X_test_bow = vectorizer.transform(preprocess_sentences(X_test, y_test, mode=0))
X_1_bow = vectorizer.transform(preprocess_sentences(X_1, y_1, mode=0))
X_2_bow = vectorizer.transform(preprocess_sentences(X_2, y_2, mode=0))
X_3_bow = vectorizer.transform(preprocess_sentences(X_3, y_3, mode=0))
X_bow = vectorizer.transform(preprocess_sentences(X, y, mode=0))

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.995048 | 0.946579 | precision |
| 1    | 0.981818 | 0.651042 | precision |
| 2    | 0.99     | 0.765101 | precision |
| 3    | 0.999158 | 0.886207 | precision |
| 4    | 0.996469 | 0.953146 | precision |
| Mean | 0.992498 | 0.840415 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.980139 | 0.986328 | recall   |
| 1    | 0.849438 | 0.94697  | recall   |
| 2    | 0.738806 | 0.991304 | recall   |
| 3    | 0.972929 | 0.996124 | recall   |
| 4    | 0.982773 | 0.99016  | recall   |
| Mean | 0.904817 | 0.982177 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.987537 | 0.966045 | f1-score |
| 1    | 0.910843 | 0.771605 | f1-score |
| 2    | 0.846154 | 0.863636 | f1-score |
| 3    | 0.985869 | 0.937956 | f1-score |
| 4    | 0.989573 | 0.971301 | f1

In [28]:
# Test out mode 1, lower=True
vectorizer = CountVectorizer()
X_train_bow = preprocess_sentences(X_train, y_train, mode=1, lower=True)
X_trainB_bow = [' '.join(sentence_words) for sentence_words in X_train_bow]
X_train_bow = vectorizer.fit_transform(X_trainB_bow)
X_test_bow = preprocess_sentences(X_test, y_test, mode=1, lower=True)
X_testB_bow = [' '.join(sentence_words) for sentence_words in X_test_bow]
X_test_bow = vectorizer.transform(X_testB_bow)
X_1_bow = preprocess_sentences(X_1, y_1, mode=1, lower=True)
X_1B_bow = [' '.join(sentence_words) for sentence_words in X_1_bow]
X_1_bow = vectorizer.transform(X_1B_bow)
X_2_bow = preprocess_sentences(X_2, y_2, mode=1, lower=True)
X_2B_bow = [' '.join(sentence_words) for sentence_words in X_2_bow]
X_2_bow = vectorizer.transform(X_2B_bow)
X_3_bow = preprocess_sentences(X_3, y_3, mode=1, lower=True)
X_3B_bow = [' '.join(sentence_words) for sentence_words in X_3_bow]
X_3_bow = vectorizer.transform(X_3B_bow)
X_bow = preprocess_sentences(X, y, mode=1, lower=True)
XB_bow = [' '.join(sentence_words) for sentence_words in X_bow]
X_bow = vectorizer.transform(XB_bow)

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.993651 | 0.949953 | precision |
| 1    | 0.994667 | 0.643564 | precision |
| 2    | 0.988235 | 0.695122 | precision |
| 3    | 0.991756 | 0.939394 | precision |
| 4    | 0.994847 | 0.957821 | precision |
| Mean | 0.992631 | 0.837171 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.981533 | 0.982422 | recall   |
| 1    | 0.838202 | 0.984848 | recall   |
| 2    | 0.626866 | 0.991304 | recall   |
| 3    | 0.986874 | 0.96124  | recall   |
| 4    | 0.984638 | 0.985591 | recall   |
| Mean | 0.883623 | 0.981081 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.987555 | 0.965915 | f1-score |
| 1    | 0.909756 | 0.778443 | f1-score |
| 2    | 0.767123 | 0.817204 | f1-score |
| 3    | 0.989309 | 0.950192 | f1-score |
| 4    | 0.989717 | 0.971508 | f1

In [29]:
# Test out mode 1, lower=True, remove_stopwords=True
vectorizer = CountVectorizer()
X_train_bow = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=True)
X_trainB_bow = [' '.join(sentence_words) for sentence_words in X_train_bow]
X_train_bow = vectorizer.fit_transform(X_trainB_bow)
X_test_bow = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=True)
X_testB_bow = [' '.join(sentence_words) for sentence_words in X_test_bow]
X_test_bow = vectorizer.transform(X_testB_bow)
X_1_bow = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=True)
X_1B_bow = [' '.join(sentence_words) for sentence_words in X_1_bow]
X_1_bow = vectorizer.transform(X_1B_bow)
X_2_bow = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=True)
X_2B_bow = [' '.join(sentence_words) for sentence_words in X_2_bow]
X_2_bow = vectorizer.transform(X_2B_bow)
X_3_bow = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=True)
X_3B_bow = [' '.join(sentence_words) for sentence_words in X_3_bow]
X_3_bow = vectorizer.transform(X_3B_bow)
X_bow = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=True)
XB_bow = [' '.join(sentence_words) for sentence_words in X_bow]
X_bow = vectorizer.transform(XB_bow)

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.99541  | 0.951977 | precision |
| 1    | 0.99455  | 0.619048 | precision |
| 2    | 1        | 0.692771 | precision |
| 3    | 0.993394 | 0.93985  | precision |
| 4    | 0.996031 | 0.954384 | precision |
| Mean | 0.995877 | 0.831606 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.98223  | 0.987305 | recall   |
| 1    | 0.820225 | 0.984848 | recall   |
| 2    | 0.619403 | 1        | recall   |
| 3    | 0.986874 | 0.968992 | recall   |
| 4    | 0.98327  | 0.98893  | recall   |
| Mean | 0.8784   | 0.986015 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988776 | 0.969319 | f1-score |
| 1    | 0.899015 | 0.760234 | f1-score |
| 2    | 0.764977 | 0.818505 | f1-score |
| 3    | 0.990123 | 0.954198 | f1-score |
| 4    | 0.989609 | 0.97135  | f1

In [30]:
# Test out mode 1, lower=False, remove_stopwords=True
vectorizer = CountVectorizer()
X_train_bow = preprocess_sentences(X_train, y_train, mode=1, lower=False, remove_stopwords=True)
X_trainB_bow = [' '.join(sentence_words) for sentence_words in X_train_bow]
X_train_bow = vectorizer.fit_transform(X_trainB_bow)
X_test_bow = preprocess_sentences(X_test, y_test, mode=1, lower=False, remove_stopwords=True)
X_testB_bow = [' '.join(sentence_words) for sentence_words in X_test_bow]
X_test_bow = vectorizer.transform(X_testB_bow)
X_1_bow = preprocess_sentences(X_1, y_1, mode=1, lower=False, remove_stopwords=True)
X_1B_bow = [' '.join(sentence_words) for sentence_words in X_1_bow]
X_1_bow = vectorizer.transform(X_1B_bow)
X_2_bow = preprocess_sentences(X_2, y_2, mode=1, lower=False, remove_stopwords=True)
X_2B_bow = [' '.join(sentence_words) for sentence_words in X_2_bow]
X_2_bow = vectorizer.transform(X_2B_bow)
X_3_bow = preprocess_sentences(X_3, y_3, mode=1, lower=False, remove_stopwords=True)
X_3B_bow = [' '.join(sentence_words) for sentence_words in X_3_bow]
X_3_bow = vectorizer.transform(X_3B_bow)
X_bow = preprocess_sentences(X, y, mode=1, lower=False, remove_stopwords=True)
XB_bow = [' '.join(sentence_words) for sentence_words in X_bow]
X_bow = vectorizer.transform(XB_bow)

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.995405 | 0.949296 | precision |
| 1    | 0.991979 | 0.635468 | precision |
| 2    | 1        | 0.737179 | precision |
| 3    | 0.993394 | 0.93985  | precision |
| 4    | 0.995908 | 0.95599  | precision |
| Mean | 0.995337 | 0.843557 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.981185 | 0.987305 | recall   |
| 1    | 0.833708 | 0.977273 | recall   |
| 2    | 0.69403  | 1        | recall   |
| 3    | 0.986874 | 0.968992 | recall   |
| 4    | 0.983892 | 0.988578 | recall   |
| Mean | 0.895938 | 0.98443  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988244 | 0.967927 | f1-score |
| 1    | 0.905983 | 0.770149 | f1-score |
| 2    | 0.819383 | 0.848708 | f1-score |
| 3    | 0.990123 | 0.954198 | f1-score |
| 4    | 0.989864 | 0.972011 | f1

In [31]:
# Test out mode 1, lower=False, remove_stopwords=False, and remove_punctuation=True
vectorizer = CountVectorizer()
X_train_bow = preprocess_sentences(X_train, y_train, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True)
X_trainB_bow = [' '.join(sentence_words) for sentence_words in X_train_bow]
X_train_bow = vectorizer.fit_transform(X_trainB_bow)
X_test_bow = preprocess_sentences(X_test, y_test, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True)
X_testB_bow = [' '.join(sentence_words) for sentence_words in X_test_bow]
X_test_bow = vectorizer.transform(X_testB_bow)
X_1_bow = preprocess_sentences(X_1, y_1, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True)
X_1B_bow = [' '.join(sentence_words) for sentence_words in X_1_bow]
X_1_bow = vectorizer.transform(X_1B_bow)
X_2_bow = preprocess_sentences(X_2, y_2, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True)
X_2B_bow = [' '.join(sentence_words) for sentence_words in X_2_bow]
X_2_bow = vectorizer.transform(X_2B_bow)
X_3_bow = preprocess_sentences(X_3, y_3, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True)
X_3B_bow = [' '.join(sentence_words) for sentence_words in X_3_bow]
X_3_bow = vectorizer.transform(X_3B_bow)
X_bow = preprocess_sentences(X, y, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True)
XB_bow = [' '.join(sentence_words) for sentence_words in X_bow]
X_bow = vectorizer.transform(XB_bow)

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.993651 | 0.949953 | precision |
| 1    | 0.994667 | 0.643564 | precision |
| 2    | 0.988235 | 0.695122 | precision |
| 3    | 0.991756 | 0.939394 | precision |
| 4    | 0.994847 | 0.957821 | precision |
| Mean | 0.992631 | 0.837171 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.981533 | 0.982422 | recall   |
| 1    | 0.838202 | 0.984848 | recall   |
| 2    | 0.626866 | 0.991304 | recall   |
| 3    | 0.986874 | 0.96124  | recall   |
| 4    | 0.984638 | 0.985591 | recall   |
| Mean | 0.883623 | 0.981081 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.987555 | 0.965915 | f1-score |
| 1    | 0.909756 | 0.778443 | f1-score |
| 2    | 0.767123 | 0.817204 | f1-score |
| 3    | 0.989309 | 0.950192 | f1-score |
| 4    | 0.989717 | 0.971508 | f1

In [34]:
# Test out mode 1, lower=True, remove_stopwords=False, and remove_punctuation=False, lemmatize=True
vectorizer = CountVectorizer()
X_train_bow = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_trainB_bow = [' '.join(sentence_words) for sentence_words in X_train_bow]
X_train_bow = vectorizer.fit_transform(X_trainB_bow)
X_test_bow = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_testB_bow = [' '.join(sentence_words) for sentence_words in X_test_bow]
X_test_bow = vectorizer.transform(X_testB_bow)
X_1_bow = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_1B_bow = [' '.join(sentence_words) for sentence_words in X_1_bow]
X_1_bow = vectorizer.transform(X_1B_bow)
X_2_bow = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_2B_bow = [' '.join(sentence_words) for sentence_words in X_2_bow]
X_2_bow = vectorizer.transform(X_2B_bow)
X_3_bow = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_3B_bow = [' '.join(sentence_words) for sentence_words in X_3_bow]
X_3_bow = vectorizer.transform(X_3B_bow)
X_bow = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
XB_bow = [' '.join(sentence_words) for sentence_words in X_bow]
X_bow = vectorizer.transform(XB_bow)

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.994004 | 0.950897 | precision |
| 1    | 0.99162  | 0.589041 | precision |
| 2    | 1        | 0.684524 | precision |
| 3    | 0.991756 | 0.939394 | precision |
| 4    | 0.994966 | 0.954739 | precision |
| Mean | 0.994469 | 0.823719 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.981882 | 0.983398 | recall   |
| 1    | 0.797753 | 0.977273 | recall   |
| 2    | 0.604478 | 1        | recall   |
| 3    | 0.986874 | 0.96124  | recall   |
| 4    | 0.983457 | 0.985943 | recall   |
| Mean | 0.870889 | 0.981571 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.987905 | 0.966875 | f1-score |
| 1    | 0.884184 | 0.735043 | f1-score |
| 2    | 0.753488 | 0.812721 | f1-score |
| 3    | 0.989309 | 0.950192 | f1-score |
| 4    | 0.989178 | 0.97009  | f1

In [38]:
# Test out mode 1, lower=True, remove_stopwords=True, and remove_punctuation=False, lemmatize=True
vectorizer = CountVectorizer()
X_train_bow = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_trainB_bow = [' '.join(sentence_words) for sentence_words in X_train_bow]
X_train_bow = vectorizer.fit_transform(X_trainB_bow)
X_test_bow = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_testB_bow = [' '.join(sentence_words) for sentence_words in X_test_bow]
X_test_bow = vectorizer.transform(X_testB_bow)
X_1_bow = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_1B_bow = [' '.join(sentence_words) for sentence_words in X_1_bow]
X_1_bow = vectorizer.transform(X_1B_bow)
X_2_bow = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_2B_bow = [' '.join(sentence_words) for sentence_words in X_2_bow]
X_2_bow = vectorizer.transform(X_2B_bow)
X_3_bow = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_3B_bow = [' '.join(sentence_words) for sentence_words in X_3_bow]
X_3_bow = vectorizer.transform(X_3B_bow)
X_bow = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
XB_bow = [' '.join(sentence_words) for sentence_words in X_bow]
X_bow = vectorizer.transform(XB_bow)

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.99541  | 0.951977 | precision |
| 1    | 0.994318 | 0.577778 | precision |
| 2    | 1        | 0.680473 | precision |
| 3    | 0.993388 | 0.93633  | precision |
| 4    | 0.995963 | 0.95131  | precision |
| Mean | 0.995816 | 0.819574 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.98223  | 0.987305 | recall   |
| 1    | 0.786517 | 0.984848 | recall   |
| 2    | 0.597015 | 1        | recall   |
| 3    | 0.986054 | 0.968992 | recall   |
| 4    | 0.982088 | 0.988754 | recall   |
| Mean | 0.866781 | 0.98598  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988776 | 0.969319 | f1-score |
| 1    | 0.878294 | 0.728291 | f1-score |
| 2    | 0.747664 | 0.809859 | f1-score |
| 3    | 0.989708 | 0.952381 | f1-score |
| 4    | 0.988977 | 0.969671 | f1

In [37]:
# Test out mode 1, lower=True, remove_stopwords=True, and remove_punctuation=True, lemmatize=True
vectorizer = CountVectorizer()
X_train_bow = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_trainB_bow = [' '.join(sentence_words) for sentence_words in X_train_bow]
X_train_bow = vectorizer.fit_transform(X_trainB_bow)
X_test_bow = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_testB_bow = [' '.join(sentence_words) for sentence_words in X_test_bow]
X_test_bow = vectorizer.transform(X_testB_bow)
X_1_bow = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_1B_bow = [' '.join(sentence_words) for sentence_words in X_1_bow]
X_1_bow = vectorizer.transform(X_1B_bow)
X_2_bow = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_2B_bow = [' '.join(sentence_words) for sentence_words in X_2_bow]
X_2_bow = vectorizer.transform(X_2B_bow)
X_3_bow = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_3B_bow = [' '.join(sentence_words) for sentence_words in X_3_bow]
X_3_bow = vectorizer.transform(X_3B_bow)
X_bow = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
XB_bow = [' '.join(sentence_words) for sentence_words in X_bow]
X_bow = vectorizer.transform(XB_bow)

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.99541  | 0.951977 | precision |
| 1    | 0.994318 | 0.577778 | precision |
| 2    | 1        | 0.680473 | precision |
| 3    | 0.993388 | 0.93633  | precision |
| 4    | 0.995963 | 0.95131  | precision |
| Mean | 0.995816 | 0.819574 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.98223  | 0.987305 | recall   |
| 1    | 0.786517 | 0.984848 | recall   |
| 2    | 0.597015 | 1        | recall   |
| 3    | 0.986054 | 0.968992 | recall   |
| 4    | 0.982088 | 0.988754 | recall   |
| Mean | 0.866781 | 0.98598  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988776 | 0.969319 | f1-score |
| 1    | 0.878294 | 0.728291 | f1-score |
| 2    | 0.747664 | 0.809859 | f1-score |
| 3    | 0.989708 | 0.952381 | f1-score |
| 4    | 0.988977 | 0.969671 | f1

## TF-IDF

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_1_tfidf = vectorizer.transform(X_1)

X_2_tfidf = vectorizer.transform(X_2)

X_3_tfidf = vectorizer.transform(X_3)

X_tfidf = vectorizer.transform(X)

In [42]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.991262 | 0.967086 | precision |
| 1    | 0.97284  | 0.703488 | precision |
| 2    | 0.945312 | 0.892562 | precision |
| 3    | 0.991701 | 0.911765 | precision |
| 4    | 0.995004 | 0.974809 | precision |
| Mean | 0.979224 | 0.889942 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988153 | 0.975586 | recall   |
| 1    | 0.885393 | 0.916667 | recall   |
| 2    | 0.902985 | 0.93913  | recall   |
| 3    | 0.980312 | 0.96124  | recall   |
| 4    | 0.990982 | 0.985943 | recall   |
| Mean | 0.949565 | 0.955713 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989705 | 0.971317 | f1-score |
| 1    | 0.927059 | 0.796053 | f1-score |
| 2    | 0.923664 | 0.915254 | f1-score |
| 3    | 0.985974 | 0.935849 | f1-score |
| 4    | 0.992989 | 0.980344 | f1

In [43]:
# Test out mode 0 only
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(preprocess_sentences(X_train, y_train, mode=0))
X_test_tfidf = vectorizer.transform(preprocess_sentences(X_test, y_test, mode=0))
X_1_tfidf = vectorizer.transform(preprocess_sentences(X_1, y_1, mode=0))
X_2_tfidf = vectorizer.transform(preprocess_sentences(X_2, y_2, mode=0))
X_3_tfidf = vectorizer.transform(preprocess_sentences(X_3, y_3, mode=0))
X_tfidf = vectorizer.transform(preprocess_sentences(X, y, mode=0))

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.99335  | 0.969142 | precision |
| 1    | 0.972705 | 0.695402 | precision |
| 2    | 0.949153 | 0.832061 | precision |
| 3    | 0.996667 | 0.916968 | precision |
| 4    | 0.995748 | 0.973511 | precision |
| Mean | 0.981524 | 0.877417 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.98885  | 0.981445 | recall   |
| 1    | 0.880899 | 0.916667 | recall   |
| 2    | 0.835821 | 0.947826 | recall   |
| 3    | 0.981132 | 0.984496 | recall   |
| 4    | 0.990484 | 0.988051 | recall   |
| Mean | 0.935437 | 0.963697 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.991095 | 0.975255 | f1-score |
| 1    | 0.924528 | 0.79085  | f1-score |
| 2    | 0.888889 | 0.886179 | f1-score |
| 3    | 0.988838 | 0.949533 | f1-score |
| 4    | 0.993109 | 0.980727 | f1

In [44]:
# Test out mode 1, lower=True, remove_stopwords=False, and remove_punctuation=False, lemmatize=False
vectorizer = TfidfVectorizer()
X_train_tfidf = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=False)
X_trainB_tfidf = [' '.join(sentence_words) for sentence_words in X_train_tfidf]
X_train_tfidf = vectorizer.fit_transform(X_trainB_tfidf)
X_test_tfidf = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=False)
X_testB_tfidf = [' '.join(sentence_words) for sentence_words in X_test_tfidf]
X_test_tfidf = vectorizer.transform(X_testB_tfidf)
X_1_tfidf = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=False)
X_1B_tfidf = [' '.join(sentence_words) for sentence_words in X_1_tfidf]
X_1_tfidf = vectorizer.transform(X_1B_tfidf)
X_2_tfidf = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=False)
X_2B_tfidf = [' '.join(sentence_words) for sentence_words in X_2_tfidf]
X_2_tfidf = vectorizer.transform(X_2B_tfidf)
X_3_tfidf = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=False)
X_3B_tfidf = [' '.join(sentence_words) for sentence_words in X_3_tfidf]
X_3_tfidf = vectorizer.transform(X_3B_tfidf)
X_tfidf = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=False)
XB_tfidf = [' '.join(sentence_words) for sentence_words in X_tfidf]
X_tfidf = vectorizer.transform(XB_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.989878 | 0.966958 | precision |
| 1    | 0.977612 | 0.702857 | precision |
| 2    | 0.933333 | 0.829457 | precision |
| 3    | 0.966909 | 0.911765 | precision |
| 4    | 0.992832 | 0.973634 | precision |
| Mean | 0.972113 | 0.876934 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988153 | 0.97168  | recall   |
| 1    | 0.883146 | 0.931818 | recall   |
| 2    | 0.835821 | 0.930435 | recall   |
| 3    | 0.982773 | 0.841085 | recall   |
| 4    | 0.990609 | 0.979793 | recall   |
| Mean | 0.9361   | 0.930962 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989015 | 0.969313 | f1-score |
| 1    | 0.927981 | 0.801303 | f1-score |
| 2    | 0.88189  | 0.877049 | f1-score |
| 3    | 0.974776 | 0.875    | f1-score |
| 4    | 0.991719 | 0.976703 | f1

In [45]:
# Test out mode 1, lower=True, remove_stopwords=True, and remove_punctuation=False, lemmatize=False
vectorizer = TfidfVectorizer()
X_train_tfidf = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_trainB_tfidf = [' '.join(sentence_words) for sentence_words in X_train_tfidf]
X_train_tfidf = vectorizer.fit_transform(X_trainB_tfidf)
X_test_tfidf = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_testB_tfidf = [' '.join(sentence_words) for sentence_words in X_test_tfidf]
X_test_tfidf = vectorizer.transform(X_testB_tfidf)
X_1_tfidf = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_1B_tfidf = [' '.join(sentence_words) for sentence_words in X_1_tfidf]
X_1_tfidf = vectorizer.transform(X_1B_tfidf)
X_2_tfidf = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_2B_tfidf = [' '.join(sentence_words) for sentence_words in X_2_tfidf]
X_2_tfidf = vectorizer.transform(X_2B_tfidf)
X_3_tfidf = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_3B_tfidf = [' '.join(sentence_words) for sentence_words in X_3_tfidf]
X_3_tfidf = vectorizer.transform(X_3B_tfidf)
X_tfidf = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
XB_tfidf = [' '.join(sentence_words) for sentence_words in X_tfidf]
X_tfidf = vectorizer.transform(XB_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.991969 | 0.971845 | precision |
| 1    | 0.975124 | 0.697143 | precision |
| 2    | 0.870229 | 0.830508 | precision |
| 3    | 0.966184 | 0.919149 | precision |
| 4    | 0.992526 | 0.974974 | precision |
| Mean | 0.959206 | 0.878724 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989895 | 0.977539 | recall   |
| 1    | 0.880899 | 0.924242 | recall   |
| 2    | 0.850746 | 0.852174 | recall   |
| 3    | 0.984413 | 0.837209 | recall   |
| 4    | 0.991106 | 0.978914 | recall   |
| Mean | 0.939412 | 0.914016 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.990931 | 0.974684 | f1-score |
| 1    | 0.92562  | 0.794788 | f1-score |
| 2    | 0.860377 | 0.841202 | f1-score |
| 3    | 0.975213 | 0.876268 | f1-score |
| 4    | 0.991816 | 0.97694  | f1

In [46]:
# Test out mode 1, lower=False, remove_stopwords=False, and remove_punctuation=True, lemmatize=False
vectorizer = TfidfVectorizer()
X_train_tfidf = preprocess_sentences(X_train, y_train, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_trainB_tfidf = [' '.join(sentence_words) for sentence_words in X_train_tfidf]
X_train_tfidf = vectorizer.fit_transform(X_trainB_tfidf)
X_test_tfidf = preprocess_sentences(X_test, y_test, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_testB_tfidf = [' '.join(sentence_words) for sentence_words in X_test_tfidf]
X_test_tfidf = vectorizer.transform(X_testB_tfidf)
X_1_tfidf = preprocess_sentences(X_1, y_1, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_1B_tfidf = [' '.join(sentence_words) for sentence_words in X_1_tfidf]
X_1_tfidf = vectorizer.transform(X_1B_tfidf)
X_2_tfidf = preprocess_sentences(X_2, y_2, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_2B_tfidf = [' '.join(sentence_words) for sentence_words in X_2_tfidf]
X_2_tfidf = vectorizer.transform(X_2B_tfidf)
X_3_tfidf = preprocess_sentences(X_3, y_3, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_3B_tfidf = [' '.join(sentence_words) for sentence_words in X_3_tfidf]
X_3_tfidf = vectorizer.transform(X_3B_tfidf)
X_tfidf = preprocess_sentences(X, y, mode=1, lower=False, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
XB_tfidf = [' '.join(sentence_words) for sentence_words in X_tfidf]
X_tfidf = vectorizer.transform(XB_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.989878 | 0.966958 | precision |
| 1    | 0.977612 | 0.702857 | precision |
| 2    | 0.933333 | 0.829457 | precision |
| 3    | 0.966909 | 0.911765 | precision |
| 4    | 0.992832 | 0.973634 | precision |
| Mean | 0.972113 | 0.876934 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988153 | 0.97168  | recall   |
| 1    | 0.883146 | 0.931818 | recall   |
| 2    | 0.835821 | 0.930435 | recall   |
| 3    | 0.982773 | 0.841085 | recall   |
| 4    | 0.990609 | 0.979793 | recall   |
| Mean | 0.9361   | 0.930962 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989015 | 0.969313 | f1-score |
| 1    | 0.927981 | 0.801303 | f1-score |
| 2    | 0.88189  | 0.877049 | f1-score |
| 3    | 0.974776 | 0.875    | f1-score |
| 4    | 0.991719 | 0.976703 | f1

In [47]:
# Test out mode 1, lower=True, remove_stopwords=False, and remove_punctuation=True, lemmatize=False
vectorizer = TfidfVectorizer()
X_train_tfidf = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_trainB_tfidf = [' '.join(sentence_words) for sentence_words in X_train_tfidf]
X_train_tfidf = vectorizer.fit_transform(X_trainB_tfidf)
X_test_tfidf = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_testB_tfidf = [' '.join(sentence_words) for sentence_words in X_test_tfidf]
X_test_tfidf = vectorizer.transform(X_testB_tfidf)
X_1_tfidf = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_1B_tfidf = [' '.join(sentence_words) for sentence_words in X_1_tfidf]
X_1_tfidf = vectorizer.transform(X_1B_tfidf)
X_2_tfidf = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_2B_tfidf = [' '.join(sentence_words) for sentence_words in X_2_tfidf]
X_2_tfidf = vectorizer.transform(X_2B_tfidf)
X_3_tfidf = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
X_3B_tfidf = [' '.join(sentence_words) for sentence_words in X_3_tfidf]
X_3_tfidf = vectorizer.transform(X_3B_tfidf)
X_tfidf = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=False, remove_punctuation=True, lemmatize=False)
XB_tfidf = [' '.join(sentence_words) for sentence_words in X_tfidf]
X_tfidf = vectorizer.transform(XB_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.989878 | 0.966958 | precision |
| 1    | 0.977612 | 0.702857 | precision |
| 2    | 0.933333 | 0.829457 | precision |
| 3    | 0.966909 | 0.911765 | precision |
| 4    | 0.992832 | 0.973634 | precision |
| Mean | 0.972113 | 0.876934 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988153 | 0.97168  | recall   |
| 1    | 0.883146 | 0.931818 | recall   |
| 2    | 0.835821 | 0.930435 | recall   |
| 3    | 0.982773 | 0.841085 | recall   |
| 4    | 0.990609 | 0.979793 | recall   |
| Mean | 0.9361   | 0.930962 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989015 | 0.969313 | f1-score |
| 1    | 0.927981 | 0.801303 | f1-score |
| 2    | 0.88189  | 0.877049 | f1-score |
| 3    | 0.974776 | 0.875    | f1-score |
| 4    | 0.991719 | 0.976703 | f1

In [51]:
# Test out mode 1, lower=False, remove_stopwords=False, and remove_punctuation=False, lemmatize=True
vectorizer = TfidfVectorizer()
X_train_tfidf = preprocess_sentences(X_train, y_train, mode=1, lower=False, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_trainB_tfidf = [' '.join(sentence_words) for sentence_words in X_train_tfidf]
X_train_tfidf = vectorizer.fit_transform(X_trainB_tfidf)
X_test_tfidf = preprocess_sentences(X_test, y_test, mode=1, lower=False, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_testB_tfidf = [' '.join(sentence_words) for sentence_words in X_test_tfidf]
X_test_tfidf = vectorizer.transform(X_testB_tfidf)
X_1_tfidf = preprocess_sentences(X_1, y_1, mode=1, lower=False, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_1B_tfidf = [' '.join(sentence_words) for sentence_words in X_1_tfidf]
X_1_tfidf = vectorizer.transform(X_1B_tfidf)
X_2_tfidf = preprocess_sentences(X_2, y_2, mode=1, lower=False, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_2B_tfidf = [' '.join(sentence_words) for sentence_words in X_2_tfidf]
X_2_tfidf = vectorizer.transform(X_2B_tfidf)
X_3_tfidf = preprocess_sentences(X_3, y_3, mode=1, lower=False, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_3B_tfidf = [' '.join(sentence_words) for sentence_words in X_3_tfidf]
X_3_tfidf = vectorizer.transform(X_3B_tfidf)
X_tfidf = preprocess_sentences(X, y, mode=1, lower=False, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
XB_tfidf = [' '.join(sentence_words) for sentence_words in X_tfidf]
X_tfidf = vectorizer.transform(XB_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.990227 | 0.96793  | precision |
| 1    | 0.975186 | 0.701149 | precision |
| 2    | 0.933333 | 0.829457 | precision |
| 3    | 0.966909 | 0.911765 | precision |
| 4    | 0.992894 | 0.973638 | precision |
| Mean | 0.97171  | 0.876788 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988502 | 0.972656 | recall   |
| 1    | 0.883146 | 0.924242 | recall   |
| 2    | 0.835821 | 0.930435 | recall   |
| 3    | 0.982773 | 0.841085 | recall   |
| 4    | 0.990609 | 0.979968 | recall   |
| Mean | 0.93617  | 0.929677 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989364 | 0.970287 | f1-score |
| 1    | 0.926887 | 0.797386 | f1-score |
| 2    | 0.88189  | 0.877049 | f1-score |
| 3    | 0.974776 | 0.875    | f1-score |
| 4    | 0.99175  | 0.976793 | f1

In [48]:
# Test out mode 1, lower=True, remove_stopwords=False, and remove_punctuation=False, lemmatize=True
vectorizer = TfidfVectorizer()
X_train_tfidf = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_trainB_tfidf = [' '.join(sentence_words) for sentence_words in X_train_tfidf]
X_train_tfidf = vectorizer.fit_transform(X_trainB_tfidf)
X_test_tfidf = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_testB_tfidf = [' '.join(sentence_words) for sentence_words in X_test_tfidf]
X_test_tfidf = vectorizer.transform(X_testB_tfidf)
X_1_tfidf = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_1B_tfidf = [' '.join(sentence_words) for sentence_words in X_1_tfidf]
X_1_tfidf = vectorizer.transform(X_1B_tfidf)
X_2_tfidf = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_2B_tfidf = [' '.join(sentence_words) for sentence_words in X_2_tfidf]
X_2_tfidf = vectorizer.transform(X_2B_tfidf)
X_3_tfidf = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
X_3B_tfidf = [' '.join(sentence_words) for sentence_words in X_3_tfidf]
X_3_tfidf = vectorizer.transform(X_3B_tfidf)
X_tfidf = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=False, remove_punctuation=False, lemmatize=True)
XB_tfidf = [' '.join(sentence_words) for sentence_words in X_tfidf]
X_tfidf = vectorizer.transform(XB_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.990573 | 0.967961 | precision |
| 1    | 0.982323 | 0.690608 | precision |
| 2    | 0.90678  | 0.793893 | precision |
| 3    | 0.966882 | 0.90795  | precision |
| 4    | 0.992889 | 0.971941 | precision |
| Mean | 0.967889 | 0.866471 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988502 | 0.973633 | recall   |
| 1    | 0.874157 | 0.94697  | recall   |
| 2    | 0.798507 | 0.904348 | recall   |
| 3    | 0.981952 | 0.841085 | recall   |
| 4    | 0.989987 | 0.979968 | recall   |
| Mean | 0.926621 | 0.929201 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989536 | 0.970789 | f1-score |
| 1    | 0.925089 | 0.798722 | f1-score |
| 2    | 0.849206 | 0.845528 | f1-score |
| 3    | 0.974359 | 0.873239 | f1-score |
| 4    | 0.991436 | 0.975938 | f1

In [49]:
# Test out mode 1, lower=True, remove_stopwords=True, and remove_punctuation=False, lemmatize=True
vectorizer = TfidfVectorizer()
X_train_tfidf = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_trainB_tfidf = [' '.join(sentence_words) for sentence_words in X_train_tfidf]
X_train_tfidf = vectorizer.fit_transform(X_trainB_tfidf)
X_test_tfidf = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_testB_tfidf = [' '.join(sentence_words) for sentence_words in X_test_tfidf]
X_test_tfidf = vectorizer.transform(X_testB_tfidf)
X_1_tfidf = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_1B_tfidf = [' '.join(sentence_words) for sentence_words in X_1_tfidf]
X_1_tfidf = vectorizer.transform(X_1B_tfidf)
X_2_tfidf = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_2B_tfidf = [' '.join(sentence_words) for sentence_words in X_2_tfidf]
X_2_tfidf = vectorizer.transform(X_2B_tfidf)
X_3_tfidf = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_3B_tfidf = [' '.join(sentence_words) for sentence_words in X_3_tfidf]
X_3_tfidf = vectorizer.transform(X_3B_tfidf)
X_tfidf = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
XB_tfidf = [' '.join(sentence_words) for sentence_words in X_tfidf]
X_tfidf = vectorizer.transform(XB_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.992318 | 0.972816 | precision |
| 1    | 0.977273 | 0.679558 | precision |
| 2    | 0.837209 | 0.783333 | precision |
| 3    | 0.966989 | 0.923404 | precision |
| 4    | 0.992459 | 0.973096 | precision |
| Mean | 0.95325  | 0.866441 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.990244 | 0.978516 | recall   |
| 1    | 0.869663 | 0.931818 | recall   |
| 2    | 0.80597  | 0.817391 | recall   |
| 3    | 0.985234 | 0.841085 | recall   |
| 4    | 0.990422 | 0.978738 | recall   |
| Mean | 0.928307 | 0.90951  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.99128  | 0.975657 | f1-score |
| 1    | 0.920333 | 0.785942 | f1-score |
| 2    | 0.821293 | 0.8      | f1-score |
| 3    | 0.976026 | 0.880325 | f1-score |
| 4    | 0.99144  | 0.975909 | f1

In [50]:
# Test out mode 1, lower=True, remove_stopwords=True, and remove_punctuation=True, lemmatize=True
vectorizer = TfidfVectorizer()
X_train_tfidf = preprocess_sentences(X_train, y_train, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_trainB_tfidf = [' '.join(sentence_words) for sentence_words in X_train_tfidf]
X_train_tfidf = vectorizer.fit_transform(X_trainB_tfidf)
X_test_tfidf = preprocess_sentences(X_test, y_test, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_testB_tfidf = [' '.join(sentence_words) for sentence_words in X_test_tfidf]
X_test_tfidf = vectorizer.transform(X_testB_tfidf)
X_1_tfidf = preprocess_sentences(X_1, y_1, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_1B_tfidf = [' '.join(sentence_words) for sentence_words in X_1_tfidf]
X_1_tfidf = vectorizer.transform(X_1B_tfidf)
X_2_tfidf = preprocess_sentences(X_2, y_2, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_2B_tfidf = [' '.join(sentence_words) for sentence_words in X_2_tfidf]
X_2_tfidf = vectorizer.transform(X_2B_tfidf)
X_3_tfidf = preprocess_sentences(X_3, y_3, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
X_3B_tfidf = [' '.join(sentence_words) for sentence_words in X_3_tfidf]
X_3_tfidf = vectorizer.transform(X_3B_tfidf)
X_tfidf = preprocess_sentences(X, y, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=True)
XB_tfidf = [' '.join(sentence_words) for sentence_words in X_tfidf]
X_tfidf = vectorizer.transform(XB_tfidf)

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.992318 | 0.972816 | precision |
| 1    | 0.977273 | 0.679558 | precision |
| 2    | 0.837209 | 0.783333 | precision |
| 3    | 0.966989 | 0.923404 | precision |
| 4    | 0.992459 | 0.973096 | precision |
| Mean | 0.95325  | 0.866441 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.990244 | 0.978516 | recall   |
| 1    | 0.869663 | 0.931818 | recall   |
| 2    | 0.80597  | 0.817391 | recall   |
| 3    | 0.985234 | 0.841085 | recall   |
| 4    | 0.990422 | 0.978738 | recall   |
| Mean | 0.928307 | 0.90951  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.99128  | 0.975657 | f1-score |
| 1    | 0.920333 | 0.785942 | f1-score |
| 2    | 0.821293 | 0.8      | f1-score |
| 3    | 0.976026 | 0.880325 | f1-score |
| 4    | 0.99144  | 0.975909 | f1

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_1_tfidf = vectorizer.transform(X_1)
X_2_tfidf = vectorizer.transform(X_2)
X_3_tfidf = vectorizer.transform(X_3)
X_tfidf = vectorizer.transform(X)

from sklearn.svm import SVC

svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.991262 | 0.967086 | precision |
| 1    | 0.97284  | 0.703488 | precision |
| 2    | 0.945312 | 0.892562 | precision |
| 3    | 0.991701 | 0.911765 | precision |
| 4    | 0.995004 | 0.974809 | precision |
| Mean | 0.979224 | 0.889942 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988153 | 0.975586 | recall   |
| 1    | 0.885393 | 0.916667 | recall   |
| 2    | 0.902985 | 0.93913  | recall   |
| 3    | 0.980312 | 0.96124  | recall   |
| 4    | 0.990982 | 0.985943 | recall   |
| Mean | 0.949565 | 0.955713 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989705 | 0.971317 | f1-score |
| 1    | 0.927059 | 0.796053 | f1-score |
| 2    | 0.923664 | 0.915254 | f1-score |
| 3    | 0.985974 | 0.935849 | f1-score |
| 4    | 0.992989 | 0.980344 | f1

In [141]:
# Use a threshold of 0.999 otherwise set to class 0
y_pred_classification = [np.argmax(p) if np.max(p) > 0.999 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.999 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.999 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.999 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.999 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.815647 | 0.992084 | precision |
| 1    | 0.789661 | 0.875    | precision |
| 2    | 0.580087 | 1        | precision |
| 3    | 0.848189 | 0.97561  | precision |
| 4    | 0.797865 | 0.996308 | precision |
| Mean | 0.76629  | 0.9678   | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.998955 | 0.367188 | recall   |
| 1    | 0.995506 | 0.106061 | recall   |
| 2    | 1        | 0.156522 | recall   |
| 3    | 0.99918  | 0.155039 | recall   |
| 4    | 0.999627 | 0.284484 | recall   |
| Mean | 0.998653 | 0.213859 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.898042 | 0.535994 | f1-score |
| 1    | 0.880716 | 0.189189 | f1-score |
| 2    | 0.734247 | 0.270677 | f1-score |
| 3    | 0.917514 | 0.267559 | f1-score |
| 4    | 0.887423 | 0.442592 | f1

In [142]:
# Use a threshold of 0.99 otherwise set to class 0
y_pred_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.905517 | 0.981081 | precision |
| 1    | 0.823091 | 0.925    | precision |
| 2    | 0.653465 | 0.957447 | precision |
| 3    | 0.901189 | 0.954198 | precision |
| 4    | 0.957014 | 0.994597 | precision |
| Mean | 0.848055 | 0.962465 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.995122 | 0.708984 | recall   |
| 1    | 0.993258 | 0.280303 | recall   |
| 2    | 0.985075 | 0.391304 | recall   |
| 3    | 0.995078 | 0.484496 | recall   |
| 4    | 0.998321 | 0.873309 | recall   |
| Mean | 0.993371 | 0.547679 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.948207 | 0.823129 | f1-score |
| 1    | 0.900204 | 0.430233 | f1-score |
| 2    | 0.785714 | 0.555556 | f1-score |
| 3    | 0.945809 | 0.642674 | f1-score |
| 4    | 0.977231 | 0.930015 | f1

In [143]:
# Use a threshold of 0.95 otherwise set to class 0

y_pred_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.956784 | 0.984598 | precision |
| 1    | 0.870259 | 0.881579 | precision |
| 2    | 0.747126 | 0.946667 | precision |
| 3    | 0.932308 | 0.960452 | precision |
| 4    | 0.976438 | 0.992329 | precision |
| Mean | 0.896583 | 0.953125 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.995122 | 0.874023 | recall   |
| 1    | 0.979775 | 0.507576 | recall   |
| 2    | 0.970149 | 0.617391 | recall   |
| 3    | 0.994258 | 0.658915 | recall   |
| 4    | 0.99745  | 0.931998 | recall   |
| Mean | 0.987351 | 0.717981 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.975576 | 0.926022 | f1-score |
| 1    | 0.921776 | 0.644231 | f1-score |
| 2    | 0.844156 | 0.747368 | f1-score |
| 3    | 0.962287 | 0.781609 | f1-score |
| 4    | 0.986832 | 0.961218 | f1

In [144]:
# Use a threshold of 0.90 otherwise set to class 0

y_pred_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.969079 | 0.981073 | precision |
| 1    | 0.891892 | 0.833333 | precision |
| 2    | 0.787879 | 0.952381 | precision |
| 3    | 0.95358  | 0.966019 | precision |
| 4    | 0.982525 | 0.989929 | precision |
| Mean | 0.916991 | 0.944547 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.993728 | 0.911133 | recall   |
| 1    | 0.964045 | 0.606061 | recall   |
| 2    | 0.970149 | 0.695652 | recall   |
| 3    | 0.994258 | 0.771318 | recall   |
| 4    | 0.996579 | 0.949921 | recall   |
| Mean | 0.983752 | 0.786817 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.981249 | 0.94481  | f1-score |
| 1    | 0.926566 | 0.701754 | f1-score |
| 2    | 0.869565 | 0.80402  | f1-score |
| 3    | 0.973494 | 0.857759 | f1-score |
| 4    | 0.989502 | 0.969512 | f1

In [145]:
# Use a threshold of 0.80 otherwise set to class 0

y_pred_classification = [np.argmax(p) if np.max(p) > 0.80 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.80 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.80 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.80 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.80 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.981041 | 0.975831 | precision |
| 1    | 0.909292 | 0.728    | precision |
| 2    | 0.84106  | 0.928571 | precision |
| 3    | 0.964968 | 0.968326 | precision |
| 4    | 0.987771 | 0.984585 | precision |
| Mean | 0.936826 | 0.917063 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.991638 | 0.946289 | recall   |
| 1    | 0.923596 | 0.689394 | recall   |
| 2    | 0.947761 | 0.791304 | recall   |
| 3    | 0.994258 | 0.829457 | recall   |
| 4    | 0.994651 | 0.965208 | recall   |
| Mean | 0.970381 | 0.844331 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.986311 | 0.960833 | f1-score |
| 1    | 0.916388 | 0.708171 | f1-score |
| 2    | 0.891228 | 0.85446  | f1-score |
| 3    | 0.979394 | 0.893528 | f1-score |
| 4    | 0.991199 | 0.9748   | f1

In [146]:
# Use a threshold of 0.70 otherwise set to class 0

y_pred_classification = [np.argmax(p) if np.max(p) > 0.70 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.70 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.70 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.70 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.70 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.985779 | 0.972305 | precision |
| 1    | 0.924658 | 0.71223  | precision |
| 2    | 0.862069 | 0.913462 | precision |
| 3    | 0.968498 | 0.916318 | precision |
| 4    | 0.989831 | 0.979617 | precision |
| Mean | 0.946167 | 0.898786 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.990244 | 0.959961 | recall   |
| 1    | 0.910112 | 0.75     | recall   |
| 2    | 0.932836 | 0.826087 | recall   |
| 3    | 0.983593 | 0.848837 | recall   |
| 4    | 0.992848 | 0.971183 | recall   |
| Mean | 0.961927 | 0.871214 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988006 | 0.966093 | f1-score |
| 1    | 0.917327 | 0.730627 | f1-score |
| 2    | 0.896057 | 0.86758  | f1-score |
| 3    | 0.975987 | 0.881288 | f1-score |
| 4    | 0.991337 | 0.975382 | f1

### Test our different TF-IDF parameters to improve performance

In [5]:
# stip_accents='unicode'
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_1_tfidf = vectorizer.transform(X_1)
X_2_tfidf = vectorizer.transform(X_2)
X_3_tfidf = vectorizer.transform(X_3)
X_tfidf = vectorizer.transform(X)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.991262 | 0.967086 | precision |
| 1    | 0.97284  | 0.703488 | precision |
| 2    | 0.945312 | 0.892562 | precision |
| 3    | 0.990879 | 0.911439 | precision |
| 4    | 0.994942 | 0.974805 | precision |
| Mean | 0.979047 | 0.889876 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988153 | 0.975586 | recall   |
| 1    | 0.885393 | 0.916667 | recall   |
| 2    | 0.902985 | 0.93913  | recall   |
| 3    | 0.980312 | 0.957364 | recall   |
| 4    | 0.990982 | 0.985767 | recall   |
| Mean | 0.949565 | 0.954903 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989705 | 0.971317 | f1-score |
| 1    | 0.927059 | 0.796053 | f1-score |
| 2    | 0.923664 | 0.915254 | f1-score |
| 3    | 0.985567 | 0.933837 | f1-score |
| 4    | 0.992958 | 0.980255 | f1

In [6]:
# analyzer=char
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='char')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_1_tfidf = vectorizer.transform(X_1)
X_2_tfidf = vectorizer.transform(X_2)
X_3_tfidf = vectorizer.transform(X_3)
X_tfidf = vectorizer.transform(X)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.985925 | 0.935361 | precision |
| 1    | 0.981579 | 0.634518 | precision |
| 2    | 0.962617 | 0.78169  | precision |
| 3    | 0.983389 | 0.871795 | precision |
| 4    | 0.988243 | 0.925366 | precision |
| Mean | 0.980351 | 0.829746 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.976307 | 0.960938 | recall   |
| 1    | 0.838202 | 0.94697  | recall   |
| 2    | 0.768657 | 0.965217 | recall   |
| 3    | 0.971288 | 0.922481 | recall   |
| 4    | 0.972386 | 0.967317 | recall   |
| Mean | 0.905368 | 0.952584 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.981092 | 0.947977 | f1-score |
| 1    | 0.904242 | 0.759878 | f1-score |
| 2    | 0.854772 | 0.863813 | f1-score |
| 3    | 0.977301 | 0.896422 | f1-score |
| 4    | 0.980251 | 0.945876 | f1

In [7]:
# analyzer=char_wb
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='char_wb')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_1_tfidf = vectorizer.transform(X_1)
X_2_tfidf = vectorizer.transform(X_2)
X_3_tfidf = vectorizer.transform(X_3)
X_tfidf = vectorizer.transform(X)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.983462 | 0.928707 | precision |
| 1    | 0.980663 | 0.581395 | precision |
| 2    | 0.971429 | 0.777778 | precision |
| 3    | 0.985845 | 0.873188 | precision |
| 4    | 0.986952 | 0.916917 | precision |
| Mean | 0.98167  | 0.815597 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.973868 | 0.954102 | recall   |
| 1    | 0.797753 | 0.94697  | recall   |
| 2    | 0.761194 | 0.973913 | recall   |
| 3    | 0.971288 | 0.934109 | recall   |
| 4    | 0.96909  | 0.963802 | recall   |
| Mean | 0.894638 | 0.954579 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.978641 | 0.941233 | f1-score |
| 1    | 0.879802 | 0.720461 | f1-score |
| 2    | 0.853556 | 0.864865 | f1-score |
| 3    | 0.978512 | 0.902622 | f1-score |
| 4    | 0.97794  | 0.939776 | f1

In [8]:
# ngram_range=(1, 3)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_1_tfidf = vectorizer.transform(X_1)
X_2_tfidf = vectorizer.transform(X_2)
X_3_tfidf = vectorizer.transform(X_3)
X_tfidf = vectorizer.transform(X)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.991608 | 0.967118 | precision |
| 1    | 0.985    | 0.711864 | precision |
| 2    | 0.943089 | 0.857143 | precision |
| 3    | 0.991708 | 0.915129 | precision |
| 4    | 0.995566 | 0.976372 | precision |
| Mean | 0.981394 | 0.885525 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988153 | 0.976562 | recall   |
| 1    | 0.885393 | 0.954545 | recall   |
| 2    | 0.865672 | 0.93913  | recall   |
| 3    | 0.981132 | 0.96124  | recall   |
| 4    | 0.991542 | 0.987524 | recall   |
| Mean | 0.942378 | 0.963801 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989878 | 0.971817 | f1-score |
| 1    | 0.932544 | 0.815534 | f1-score |
| 2    | 0.902724 | 0.896266 | f1-score |
| 3    | 0.986392 | 0.937618 | f1-score |
| 4    | 0.99355  | 0.981917 | f1

In [14]:
# change all 4 digit numbers (dates) to DATE - change (c), (C) & © to COPYRIGHT_SYMBOL
import re

def minor_preprocess(sentences):
    sentences = [re.sub('\d{4}', 'DATE', sentence)  for sentence in sentences]
    sentences = [re.sub('(c)', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    sentences = [re.sub('(C)', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    sentences = [re.sub('©', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    return sentences
    

X_train_temp = minor_preprocess(X_train)
X_test_temp = minor_preprocess(X_test)
X_1_temp = minor_preprocess(X_1)
X_2_temp = minor_preprocess(X_2)
X_3_temp = minor_preprocess(X_3)
X_temp = minor_preprocess(X)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_temp)
X_test_tfidf = vectorizer.transform(X_test_temp)
X_1_tfidf = vectorizer.transform(X_1_temp)
X_2_tfidf = vectorizer.transform(X_2_temp)
X_3_tfidf = vectorizer.transform(X_3_temp)
X_tfidf = vectorizer.transform(X_temp)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.992316 | 0.971872 | precision |
| 1    | 0.980676 | 0.760736 | precision |
| 2    | 0.909091 | 0.880342 | precision |
| 3    | 0.9688   | 0.964758 | precision |
| 4    | 0.993095 | 0.979979 | precision |
| Mean | 0.968796 | 0.911537 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989895 | 0.978516 | recall   |
| 1    | 0.91236  | 0.939394 | recall   |
| 2    | 0.895522 | 0.895652 | recall   |
| 3    | 0.993437 | 0.848837 | recall   |
| 4    | 0.99291  | 0.980496 | recall   |
| Mean | 0.956825 | 0.928579 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.991104 | 0.975182 | f1-score |
| 1    | 0.945285 | 0.840678 | f1-score |
| 2    | 0.902256 | 0.887931 | f1-score |
| 3    | 0.980964 | 0.903093 | f1-score |
| 4    | 0.993003 | 0.980237 | f1

In [15]:
# change all 4 digit numbers (dates) to DATE - change (c), (C) & © to COPYRIGHT_SYMBOL - 0.99 threshold
import re

def minor_preprocess(sentences):
    sentences = [re.sub('\d{4}', 'DATE', sentence)  for sentence in sentences]
    sentences = [re.sub('(c)', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    sentences = [re.sub('(C)', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    sentences = [re.sub('©', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    return sentences
    

X_train_temp = minor_preprocess(X_train)
X_test_temp = minor_preprocess(X_test)
X_1_temp = minor_preprocess(X_1)
X_2_temp = minor_preprocess(X_2)
X_3_temp = minor_preprocess(X_3)
X_temp = minor_preprocess(X)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_temp)
X_test_tfidf = vectorizer.transform(X_test_temp)
X_1_tfidf = vectorizer.transform(X_1_temp)
X_2_tfidf = vectorizer.transform(X_2_temp)
X_3_tfidf = vectorizer.transform(X_3_temp)
X_tfidf = vectorizer.transform(X_temp)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.99 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.848225 | 0.994163 | precision |
| 1    | 0.802536 | 0.92     | precision |
| 2    | 0.590308 | 1        | precision |
| 3    | 0.86814  | 0.986486 | precision |
| 4    | 0.834701 | 0.997613 | precision |
| Mean | 0.788782 | 0.979653 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.998955 | 0.499023 | recall   |
| 1    | 0.995506 | 0.174242 | recall   |
| 2    | 1        | 0.191304 | recall   |
| 3    | 0.99918  | 0.282946 | recall   |
| 4    | 0.999627 | 0.440696 | recall   |
| Mean | 0.998653 | 0.317642 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.91744  | 0.664499 | f1-score |
| 1    | 0.888666 | 0.292994 | f1-score |
| 2    | 0.742382 | 0.321168 | f1-score |
| 3    | 0.929062 | 0.439759 | f1-score |
| 4    | 0.90975  | 0.611335 | f1

In [17]:
# change all 4 digit numbers (dates) to DATE - change (c), (C) & © to COPYRIGHT_SYMBOL - 0.95 threshold
import re

def minor_preprocess(sentences):
    sentences = [re.sub('\d{4}', 'DATE', sentence)  for sentence in sentences]
    sentences = [re.sub('(c)', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    sentences = [re.sub('(C)', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    sentences = [re.sub('©', 'COPYRIGHT_SYMBOL', sentence)  for sentence in sentences]
    return sentences
    

X_train_temp = minor_preprocess(X_train)
X_test_temp = minor_preprocess(X_test)
X_1_temp = minor_preprocess(X_1)
X_2_temp = minor_preprocess(X_2)
X_3_temp = minor_preprocess(X_3)
X_temp = minor_preprocess(X)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_temp)
X_test_tfidf = vectorizer.transform(X_test_temp)
X_1_tfidf = vectorizer.transform(X_1_temp)
X_2_tfidf = vectorizer.transform(X_2_temp)
X_3_tfidf = vectorizer.transform(X_3_temp)
X_tfidf = vectorizer.transform(X_temp)
from sklearn.svm import SVC
svm = SVC(probability=True)
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.95 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.953892 | 0.983352 | precision |
| 1    | 0.837476 | 0.87037  | precision |
| 2    | 0.665    | 0.979592 | precision |
| 3    | 0.915789 | 0.993197 | precision |
| 4    | 0.971965 | 0.994862 | precision |
| Mean | 0.868825 | 0.964275 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.994774 | 0.865234 | recall   |
| 1    | 0.98427  | 0.356061 | recall   |
| 2    | 0.992537 | 0.417391 | recall   |
| 3    | 0.99918  | 0.565891 | recall   |
| 4    | 0.998321 | 0.918643 | recall   |
| Mean | 0.993816 | 0.624644 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.973904 | 0.920519 | f1-score |
| 1    | 0.904959 | 0.505376 | f1-score |
| 2    | 0.796407 | 0.585366 | f1-score |
| 3    | 0.955669 | 0.720988 | f1-score |
| 4    | 0.984967 | 0.955235 | f1

In [16]:
# Apply random search to improve over the default SVM parameters

param_distributions = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': [0.01, 0.1, 1, 10],
    'degree': [2, 3, 4, 5],
    'coef0': [0, 0.1, 0.5, 1],
    'shrinking': [True, False],
    'probability': [True, False],
    'tol': [1e-3, 1e-4, 1e-5],
    'class_weight': [None, 'balanced'],
}

random_search = RandomizedSearchCV(SVC(), param_distributions, n_iter=90, cv=5, verbose=2, n_jobs=-1)

random_search.fit(X_tfidf, y)

print(random_search.best_params_)
print(random_search.best_score_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


KeyboardInterrupt: 

In [153]:
# No threshhold (50% basically) - with best params

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
X_1_tfidf = vectorizer.transform(X_1)
X_2_tfidf = vectorizer.transform(X_2)
X_3_tfidf = vectorizer.transform(X_3)
X_tfidf = vectorizer.transform(X)

svm = SVC(probability=True, C= 100, gamma= 0.1, kernel= 'rbf')
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred_1 = svm.predict_proba(X_1_tfidf)
y_pred_2 = svm.predict_proba(X_2_tfidf)
y_pred_3 = svm.predict_proba(X_3_tfidf)
y_pred_4 = svm.predict_proba(X_tfidf)
y_pred_classification = np.argmax(y_pred, axis=1)
y_pred_1_classification = np.argmax(y_pred_1, axis=1)
y_pred_2_classification = np.argmax(y_pred_2, axis=1)
y_pred_3_classification = np.argmax(y_pred_3, axis=1)
y_pred_4_classification = np.argmax(y_pred_4, axis=1)
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.990206 | 0.962319 | precision |
| 1    | 0.956019 | 0.77931  | precision |
| 2    | 0.871429 | 0.889908 | precision |
| 3    | 0.97284  | 0.858779 | precision |
| 4    | 0.993338 | 0.978105 | precision |
| Mean | 0.956766 | 0.893684 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.986411 | 0.972656 | recall   |
| 1    | 0.92809  | 0.856061 | recall   |
| 2    | 0.910448 | 0.843478 | recall   |
| 3    | 0.969647 | 0.872093 | recall   |
| 4    | 0.992226 | 0.981198 | recall   |
| Mean | 0.957364 | 0.905097 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988305 | 0.96746  | f1-score |
| 1    | 0.941847 | 0.815884 | f1-score |
| 2    | 0.890511 | 0.866071 | f1-score |
| 3    | 0.971241 | 0.865385 | f1-score |
| 4    | 0.992782 | 0.979649 | f1

In [156]:
# Use a threshold of 0.999 otherwise set to class 0 - with best params
y_pred_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred]
y_pred_1_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred_1]
y_pred_2_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred_2]
y_pred_3_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred_3]
y_pred_4_classification = [np.argmax(p) if np.max(p) > 0.90 else 0 for p in y_pred_4]
report = classification_report(y_test, y_pred_classification, output_dict=True)
report_1 = classification_report(y_1, y_pred_1_classification, output_dict=True)
report_2 = classification_report(y_2, y_pred_2_classification, output_dict=True)
report_3 = classification_report(y_3, y_pred_3_classification, output_dict=True)
report_4 = classification_report(y, y_pred_4_classification, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))
print('Number of missclassifications in class 0: ', report_4['0']['support'] - round(report_4['0']['recall'] * report_4['0']['support']), 'out of a total sample of: ', report_4['0']['support'], ' - about ', round((1 - report_4['0']['recall']) * 100, 2), '% of the class was missclassified')
print('Number of missclassifications in class 1: ', report_4['1']['support'] - round(report_4['1']['recall'] * report_4['1']['support']), 'out of a total sample of: ', report_4['1']['support'], ' - about ', round((1 - report_4['1']['recall']) * 100, 2), '% of the class was missclassified')

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.96704  | 0.974763 | precision |
| 1    | 0.873727 | 0.813953 | precision |
| 2    | 0.775148 | 0.9625   | precision |
| 3    | 0.956696 | 0.886957 | precision |
| 4    | 0.983776 | 0.987086 | precision |
| Mean | 0.911277 | 0.925052 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.991638 | 0.905273 | recall   |
| 1    | 0.964045 | 0.530303 | recall   |
| 2    | 0.977612 | 0.669565 | recall   |
| 3    | 0.978671 | 0.790698 | recall   |
| 4    | 0.995584 | 0.953611 | recall   |
| Mean | 0.98151  | 0.76989  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.979185 | 0.938734 | f1-score |
| 1    | 0.916667 | 0.642202 | f1-score |
| 2    | 0.864686 | 0.789744 | f1-score |
| 3    | 0.967559 | 0.836066 | f1-score |
| 4    | 0.989645 | 0.97006  | f1

: 

## GloVe

In [53]:
# Load GloVe embeddings
import numpy as np
def load_glove(file):
    """Load GloVe embeddings from a text file.
    Args:
        file (str): path to the glove file.
    Returns:
        dict: a dictionary mapping words to their vector representations.
    """
    embeddings = {}
    with open(file) as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove50 = load_glove('../glove.6B/glove.6B.50d.txt')
glove100 = load_glove('../glove.6B/glove.6B.100d.txt')
glove200 = load_glove('../glove.6B/glove.6B.200d.txt')
glove300 = load_glove('../glove.6B/glove.6B.300d.txt')

In [75]:
def sentences_to_embeddings(sentences, embeddings, preprocess=False, weighted_avg=None):
    """
        Convert a list of sentences into a matrix of embeddings. 
        
        Args:
            sentences (list): a list of strings, each representing a sentence.
            embeddings (dict): a dictionary mapping words to their vector representations.

        Returns: 
            np.array: a 2D array of shape (len(sentences), len(embeddings[word])), where each
                      row is the average of the word vectors in the sentence.
    """

    

    matrix = []
    if not preprocess:
        for sentence in sentences:
            words = sentence.split()
            vectors = [embeddings.get(word.lower(), np.zeros(len(embeddings['the']))) for word in words] 
            if weighted_avg is None:
                mean = np.mean(vectors, axis=0)
            else:
                weights = [weighted_avg.get(word.lower(), 0) for word in words]
                weighted_sum = np.sum([v * w for v, w in zip(vectors, weights)], axis=0)
                mean = weighted_sum / np.sum(weights)
            matrix.append(mean)
    else:
        for sentence in sentences:
            vectors = [embeddings.get(token, np.zeros(len(embeddings['the']))) for token in sentence] 
            if weighted_avg is None:
                mean = np.mean(vectors, axis=0)
            else:
                weights = [weighted_avg.get(token.lower(), 0) for token in sentence]
                weighted_sum = np.sum([v * w for v, w in zip(vectors, weights)], axis=0)
                mean = weighted_sum / np.sum(weights)
            matrix.append(mean)
    return np.array(matrix)

In [57]:
X_train_glove50 = sentences_to_embeddings(X_train, glove50) 
X_test_glove50 = sentences_to_embeddings(X_test, glove50)

X_1_glove50 = sentences_to_embeddings(X_1, glove50)
X_2_glove50 = sentences_to_embeddings(X_2, glove50)
X_3_glove50 = sentences_to_embeddings(X_3, glove50)
X_glove50 = sentences_to_embeddings(X, glove50)

In [31]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_glove50, y_train)
y_pred = svm.predict(X_test_glove50)
y_pred_1 = svm.predict(X_1_glove50)
y_pred_2 = svm.predict(X_2_glove50)
y_pred_3 = svm.predict(X_3_glove50)
y_pred_4 = svm.predict(X_glove50)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.971258 | 0.904899 | precision |
| 1    | 0.986339 | 0.601896 | precision |
| 2    | 0.915254 | 0.801527 | precision |
| 3    | 0.995847 | 0.911765 | precision |
| 4    | 0.979161 | 0.902747 | precision |
| Mean | 0.969572 | 0.824567 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.965505 | 0.919922 | recall   |
| 1    | 0.811236 | 0.962121 | recall   |
| 2    | 0.80597  | 0.913043 | recall   |
| 3    | 0.980376 | 0.980237 | recall   |
| 4    | 0.964124 | 0.941963 | recall   |
| Mean | 0.905442 | 0.943457 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.968373 | 0.912349 | f1-score |
| 1    | 0.890259 | 0.740525 | f1-score |
| 2    | 0.857143 | 0.853659 | f1-score |
| 3    | 0.988051 | 0.944762 | f1-score |
| 4    | 0.971584 | 0.921938 | f1

In [32]:
X_train_glove100 = sentences_to_embeddings(X_train, glove100) 
X_test_glove100 = sentences_to_embeddings(X_test, glove100)

X_1_glove100 = sentences_to_embeddings(X_1, glove100)
X_2_glove100 = sentences_to_embeddings(X_2, glove100)
X_3_glove100 = sentences_to_embeddings(X_3, glove100)
X_glove100 = sentences_to_embeddings(X, glove100)

In [33]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_glove100, y_train)
y_pred = svm.predict(X_test_glove100)
y_pred_1 = svm.predict(X_1_glove100)
y_pred_2 = svm.predict(X_2_glove100)
y_pred_3 = svm.predict(X_3_glove100)
y_pred_4 = svm.predict(X_glove100)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.977528 | 0.917782 | precision |
| 1    | 0.982005 | 0.664894 | precision |
| 2    | 1        | 0.771812 | precision |
| 3    | 0.9975   | 0.905797 | precision |
| 4    | 0.983936 | 0.921289 | precision |
| Mean | 0.988194 | 0.836315 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.970035 | 0.9375   | recall   |
| 1    | 0.858427 | 0.94697  | recall   |
| 2    | 0.746269 | 1        | recall   |
| 3    | 0.978741 | 0.988142 | recall   |
| 4    | 0.97115  | 0.955153 | recall   |
| Mean | 0.904924 | 0.965553 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.973767 | 0.927536 | f1-score |
| 1    | 0.916067 | 0.78125  | f1-score |
| 2    | 0.854701 | 0.871212 | f1-score |
| 3    | 0.988031 | 0.94518  | f1-score |
| 4    | 0.977501 | 0.937916 | f1

In [34]:
X_train_glove200 = sentences_to_embeddings(X_train, glove200) 
X_test_glove200 = sentences_to_embeddings(X_test, glove200)

X_1_glove200 = sentences_to_embeddings(X_1, glove200)
X_2_glove200 = sentences_to_embeddings(X_2, glove200)
X_3_glove200 = sentences_to_embeddings(X_3, glove200)
X_glove200 = sentences_to_embeddings(X, glove200)

In [35]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_glove200, y_train)
y_pred = svm.predict(X_test_glove200)
y_pred_1 = svm.predict(X_1_glove200)
y_pred_2 = svm.predict(X_2_glove200)
y_pred_3 = svm.predict(X_3_glove200)
y_pred_4 = svm.predict(X_glove200)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.98208  | 0.928435 | precision |
| 1    | 0.984772 | 0.688525 | precision |
| 2    | 0.924528 | 0.748252 | precision |
| 3    | 0.9975   | 0.905797 | precision |
| 4    | 0.985766 | 0.926838 | precision |
| Mean | 0.974929 | 0.839569 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.973868 | 0.950195 | recall   |
| 1    | 0.87191  | 0.954545 | recall   |
| 2    | 0.731343 | 0.930435 | recall   |
| 3    | 0.978741 | 0.988142 | recall   |
| 4    | 0.973202 | 0.960253 | recall   |
| Mean | 0.905813 | 0.956714 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.977957 | 0.939189 | f1-score |
| 1    | 0.924911 | 0.8      | f1-score |
| 2    | 0.816667 | 0.829457 | f1-score |
| 3    | 0.988031 | 0.94518  | f1-score |
| 4    | 0.979444 | 0.94325  | f1

In [55]:
X_train_glove300 = sentences_to_embeddings(X_train, glove300) 
X_test_glove300 = sentences_to_embeddings(X_test, glove300)

X_1_glove300 = sentences_to_embeddings(X_1, glove300)
X_2_glove300 = sentences_to_embeddings(X_2, glove300)
X_3_glove300 = sentences_to_embeddings(X_3, glove300)
X_glove300 = sentences_to_embeddings(X, glove300)

In [37]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_glove300, y_train)
y_pred = svm.predict(X_test_glove300)
y_pred_1 = svm.predict(X_1_glove300)
y_pred_2 = svm.predict(X_2_glove300)
y_pred_3 = svm.predict(X_3_glove300)
y_pred_4 = svm.predict(X_glove300)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.981754 | 0.931034 | precision |
| 1    | 0.989691 | 0.677249 | precision |
| 2    | 0.888889 | 0.730496 | precision |
| 3    | 0.99584  | 0.905109 | precision |
| 4    | 0.986601 | 0.93205  | precision |
| Mean | 0.968555 | 0.835188 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.974913 | 0.949219 | recall   |
| 1    | 0.862921 | 0.969697 | recall   |
| 2    | 0.716418 | 0.895652 | recall   |
| 3    | 0.978741 | 0.980237 | recall   |
| 4    | 0.975191 | 0.96254  | recall   |
| Mean | 0.901637 | 0.951469 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.978322 | 0.940039 | f1-score |
| 1    | 0.921969 | 0.797508 | f1-score |
| 2    | 0.793388 | 0.804688 | f1-score |
| 3    | 0.987216 | 0.941176 | f1-score |
| 4    | 0.980863 | 0.94705  | f1

In [67]:
# Preprocess with mode = 1 & lower = True
X_train_glove300 = sentences_to_embeddings(preprocess_sentences(X_train, mode=1, labels=None, lower=True), glove300, preprocess=True) 
X_test_glove300 = sentences_to_embeddings(preprocess_sentences(X_test, mode=1, labels=None, lower=True), glove300, preprocess=True)

X_1_glove300 = sentences_to_embeddings(preprocess_sentences(X_1, mode=1, labels=None, lower=True), glove300, preprocess=True)
X_2_glove300 = sentences_to_embeddings(preprocess_sentences(X_2, mode=1, labels=None, lower=True), glove300, preprocess=True)
X_3_glove300 = sentences_to_embeddings(preprocess_sentences(X_3, mode=1, labels=None, lower=True), glove300, preprocess=True)
X_glove300 = sentences_to_embeddings(preprocess_sentences(X, mode=1, labels=None, lower=True), glove300, preprocess=True)

svm = SVC()
svm.fit(X_train_glove300, y_train)
y_pred = svm.predict(X_test_glove300)
y_pred_1 = svm.predict(X_1_glove300)
y_pred_2 = svm.predict(X_2_glove300)
y_pred_3 = svm.predict(X_3_glove300)
y_pred_4 = svm.predict(X_glove300)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.986301 | 0.940783 | precision |
| 1    | 0.992386 | 0.704918 | precision |
| 2    | 0.970588 | 0.761905 | precision |
| 3    | 0.998337 | 0.934307 | precision |
| 4    | 0.990933 | 0.942086 | precision |
| Mean | 0.987709 | 0.8568   | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.978397 | 0.961914 | recall   |
| 1    | 0.878652 | 0.977273 | recall   |
| 2    | 0.738806 | 0.973913 | recall   |
| 3    | 0.985234 | 0.992248 | recall   |
| 4    | 0.978792 | 0.974697 | recall   |
| Mean | 0.911976 | 0.976009 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.982333 | 0.951231 | f1-score |
| 1    | 0.932062 | 0.819048 | f1-score |
| 2    | 0.838983 | 0.854962 | f1-score |
| 3    | 0.991742 | 0.962406 | f1-score |
| 4    | 0.984825 | 0.958114 | f1

In [68]:
# Preprocess with mode = 1 & lower = True, remove stopwords = True
X_train_glove300 = sentences_to_embeddings(preprocess_sentences(X_train, mode=1, labels=None, lower=True, remove_stopwords=True), glove300, preprocess=True) 
X_test_glove300 = sentences_to_embeddings(preprocess_sentences(X_test, mode=1, labels=None, lower=True, remove_stopwords=True), glove300, preprocess=True)

X_1_glove300 = sentences_to_embeddings(preprocess_sentences(X_1, mode=1, labels=None, lower=True, remove_stopwords=True), glove300, preprocess=True)
X_2_glove300 = sentences_to_embeddings(preprocess_sentences(X_2, mode=1, labels=None, lower=True, remove_stopwords=True), glove300, preprocess=True)
X_3_glove300 = sentences_to_embeddings(preprocess_sentences(X_3, mode=1, labels=None, lower=True, remove_stopwords=True), glove300, preprocess=True)
X_glove300 = sentences_to_embeddings(preprocess_sentences(X, mode=1, labels=None, lower=True, remove_stopwords=True), glove300, preprocess=True)

svm = SVC()
svm.fit(X_train_glove300, y_train)
y_pred = svm.predict(X_test_glove300)
y_pred_1 = svm.predict(X_1_glove300)
y_pred_2 = svm.predict(X_2_glove300)
y_pred_3 = svm.predict(X_3_glove300)
y_pred_4 = svm.predict(X_glove300)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.987013 | 0.944498 | precision |
| 1    | 0.994872 | 0.695187 | precision |
| 2    | 0.961905 | 0.770833 | precision |
| 3    | 0.995854 | 0.933579 | precision |
| 4    | 0.991318 | 0.945191 | precision |
| Mean | 0.986192 | 0.857858 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.979791 | 0.963867 | recall   |
| 1    | 0.87191  | 0.984848 | recall   |
| 2    | 0.753731 | 0.965217 | recall   |
| 3    | 0.985234 | 0.98062  | recall   |
| 4    | 0.979974 | 0.975751 | recall   |
| Mean | 0.914128 | 0.974061 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.983389 | 0.954084 | f1-score |
| 1    | 0.929341 | 0.815047 | f1-score |
| 2    | 0.845188 | 0.857143 | f1-score |
| 3    | 0.990515 | 0.956522 | f1-score |
| 4    | 0.985613 | 0.960228 | f1

In [70]:
# Preprocess with mode = 1 & lower = True, remove stopwords = False, remove punctuation = True
X_train_glove300 = sentences_to_embeddings(preprocess_sentences(X_train, mode=1, labels=None, lower=True, remove_stopwords=False, remove_punctuation=True), glove300, preprocess=True) 
X_test_glove300 = sentences_to_embeddings(preprocess_sentences(X_test, mode=1, labels=None, lower=True, remove_stopwords=False, remove_punctuation=True), glove300, preprocess=True)

X_1_glove300 = sentences_to_embeddings(preprocess_sentences(X_1, mode=1, labels=None, lower=True, remove_stopwords=False, remove_punctuation=True), glove300, preprocess=True)
X_2_glove300 = sentences_to_embeddings(preprocess_sentences(X_2, mode=1, labels=None, lower=True, remove_stopwords=False, remove_punctuation=True), glove300, preprocess=True)
X_3_glove300 = sentences_to_embeddings(preprocess_sentences(X_3, mode=1, labels=None, lower=True, remove_stopwords=False, remove_punctuation=True), glove300, preprocess=True)
X_glove300 = sentences_to_embeddings(preprocess_sentences(X, mode=1, labels=None, lower=True, remove_stopwords=False, remove_punctuation=True), glove300, preprocess=True)

svm = SVC()
svm.fit(X_train_glove300, y_train)
y_pred = svm.predict(X_test_glove300)
y_pred_1 = svm.predict(X_1_glove300)
y_pred_2 = svm.predict(X_2_glove300)
y_pred_3 = svm.predict(X_3_glove300)
y_pred_4 = svm.predict(X_glove300)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.987328 | 0.938272 | precision |
| 1    | 0.989848 | 0.699454 | precision |
| 2    | 0.989899 | 0.76     | precision |
| 3    | 0.999168 | 0.934545 | precision |
| 4    | 0.991233 | 0.938631 | precision |
| Mean | 0.991495 | 0.85418  | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.977352 | 0.964844 | recall   |
| 1    | 0.876404 | 0.969697 | recall   |
| 2    | 0.731343 | 0.991304 | recall   |
| 3    | 0.985234 | 0.996124 | recall   |
| 4    | 0.977424 | 0.975575 | recall   |
| Mean | 0.909551 | 0.979509 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.982315 | 0.951372 | f1-score |
| 1    | 0.929678 | 0.812698 | f1-score |
| 2    | 0.841202 | 0.860377 | f1-score |
| 3    | 0.992152 | 0.964353 | f1-score |
| 4    | 0.98428  | 0.956747 | f1

In [71]:
# Preprocess with mode = 1 & lower = True, remove stopwords = True, remove punctuation = True
X_train_glove300 = sentences_to_embeddings(preprocess_sentences(X_train, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True), glove300, preprocess=True) 
X_test_glove300 = sentences_to_embeddings(preprocess_sentences(X_test, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True), glove300, preprocess=True)

X_1_glove300 = sentences_to_embeddings(preprocess_sentences(X_1, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True), glove300, preprocess=True)
X_2_glove300 = sentences_to_embeddings(preprocess_sentences(X_2, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True), glove300, preprocess=True)
X_3_glove300 = sentences_to_embeddings(preprocess_sentences(X_3, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True), glove300, preprocess=True)
X_glove300 = sentences_to_embeddings(preprocess_sentences(X, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True), glove300, preprocess=True)

svm = SVC()
svm.fit(X_train_glove300, y_train)
y_pred = svm.predict(X_test_glove300)
y_pred_1 = svm.predict(X_1_glove300)
y_pred_2 = svm.predict(X_2_glove300)
y_pred_3 = svm.predict(X_3_glove300)
y_pred_4 = svm.predict(X_glove300)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.988032 | 0.940171 | precision |
| 1    | 0.997429 | 0.696809 | precision |
| 2    | 0.98     | 0.758389 | precision |
| 3    | 0.997508 | 0.934066 | precision |
| 4    | 0.991685 | 0.943003 | precision |
| Mean | 0.990931 | 0.854487 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.978049 | 0.966797 | recall   |
| 1    | 0.87191  | 0.992424 | recall   |
| 2    | 0.731343 | 0.982609 | recall   |
| 3    | 0.985234 | 0.988372 | recall   |
| 4    | 0.979103 | 0.976805 | recall   |
| Mean | 0.909128 | 0.981401 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.983015 | 0.953298 | f1-score |
| 1    | 0.930456 | 0.81875  | f1-score |
| 2    | 0.837607 | 0.856061 | f1-score |
| 3    | 0.991333 | 0.960452 | f1-score |
| 4    | 0.985354 | 0.959606 | f1

In [72]:
# Preprocess with mode = 1 & lower = True, remove stopwords = True, remove punctuation = True
X_train_glove300 = sentences_to_embeddings(preprocess_sentences(X_train, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True), glove300, preprocess=True) 
X_test_glove300 = sentences_to_embeddings(preprocess_sentences(X_test, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True), glove300, preprocess=True)

X_1_glove300 = sentences_to_embeddings(preprocess_sentences(X_1, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True), glove300, preprocess=True)
X_2_glove300 = sentences_to_embeddings(preprocess_sentences(X_2, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True), glove300, preprocess=True)
X_3_glove300 = sentences_to_embeddings(preprocess_sentences(X_3, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True), glove300, preprocess=True)
X_glove300 = sentences_to_embeddings(preprocess_sentences(X, mode=1, labels=None, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True), glove300, preprocess=True)

svm = SVC()
svm.fit(X_train_glove300, y_train)
y_pred = svm.predict(X_test_glove300)
y_pred_1 = svm.predict(X_1_glove300)
y_pred_2 = svm.predict(X_2_glove300)
y_pred_3 = svm.predict(X_3_glove300)
y_pred_4 = svm.predict(X_glove300)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.987672 | 0.937441 | precision |
| 1    | 0.997396 | 0.678756 | precision |
| 2    | 0.98     | 0.758389 | precision |
| 3    | 0.997508 | 0.934066 | precision |
| 4    | 0.992243 | 0.941654 | precision |
| Mean | 0.990964 | 0.850061 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.977003 | 0.96582  | recall   |
| 1    | 0.860674 | 0.992424 | recall   |
| 2    | 0.731343 | 0.982609 | recall   |
| 3    | 0.985234 | 0.988372 | recall   |
| 4    | 0.978543 | 0.978387 | recall   |
| Mean | 0.90656  | 0.981522 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.982309 | 0.951419 | f1-score |
| 1    | 0.924005 | 0.806154 | f1-score |
| 2    | 0.837607 | 0.856061 | f1-score |
| 3    | 0.991333 | 0.960452 | f1-score |
| 4    | 0.985346 | 0.959669 | f1

## FastText

In [None]:
from gensim.models.fasttext import FastText, load_facebook_model

model = FastText(vector_size=500, window=5, min_count=10, workers=6)

model.build_vocab(X_train)

model.train(X_train, total_examples=len(X_train), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_train]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_test]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_1]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_2]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_3]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in X]

In [100]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_ft, y_train)
y_pred = svm.predict(X_test_ft)
y_pred_1 = svm.predict(X_1_ft)
y_pred_2 = svm.predict(X_2_ft)
y_pred_3 = svm.predict(X_3_ft)
y_pred_4 = svm.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.981645 | 0.916117 | precision |
| 1    | 0.984576 | 0.670213 | precision |
| 2    | 0.972477 | 0.8      | precision |
| 3    | 0.979339 | 0.872659 | precision |
| 4    | 0.985067 | 0.914348 | precision |
| Mean | 0.980621 | 0.834667 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.96899  | 0.949219 | recall   |
| 1    | 0.860674 | 0.954545 | recall   |
| 2    | 0.791045 | 0.973913 | recall   |
| 3    | 0.972108 | 0.903101 | recall   |
| 4    | 0.968219 | 0.958531 | recall   |
| Mean | 0.912207 | 0.947862 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.975276 | 0.932374 | f1-score |
| 1    | 0.918465 | 0.7875   | f1-score |
| 2    | 0.872428 | 0.878431 | f1-score |
| 3    | 0.97571  | 0.887619 | f1-score |
| 4    | 0.976571 | 0.935918 | f1

In [102]:
# Preprocess mode = 1, lower = true & remove_stopwords = true
model = FastText(vector_size=500, window=5, min_count=10, workers=6)

X_train_temp = preprocess_sentences(X_train, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_train_temp = [' '.join(token for token in sentence) for sentence in X_train_temp]
X_test_temp = preprocess_sentences(X_test, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_test_temp = [' '.join(token for token in sentence) for sentence in X_test_temp]
X_1_temp = preprocess_sentences(X_1, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_1_temp = [' '.join(token for token in sentence) for sentence in X_1_temp]
X_2_temp = preprocess_sentences(X_2, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_2_temp = [' '.join(token for token in sentence) for sentence in X_2_temp]
X_3_temp = preprocess_sentences(X_3, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_3_temp = [' '.join(token for token in sentence) for sentence in X_3_temp]
X_temp = preprocess_sentences(X, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=False, lemmatize=False)
X_temp = [' '.join(token for token in sentence) for sentence in X_temp]

model.build_vocab(X_train_temp)

model.train(X_train_temp, total_examples=len(X_train_temp), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_train_temp]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_test_temp]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_1_temp]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_2_temp]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_3_temp]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_temp]

svm = SVC()
svm.fit(X_train_ft, y_train)
y_pred = svm.predict(X_test_ft)
y_pred_1 = svm.predict(X_1_ft)
y_pred_2 = svm.predict(X_2_ft)
y_pred_3 = svm.predict(X_3_ft)
y_pred_4 = svm.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.980641 | 0.920228 | precision |
| 1    | 0.971939 | 0.654054 | precision |
| 2    | 0.925926 | 0.758865 | precision |
| 3    | 0.961912 | 0.868313 | precision |
| 4    | 0.981727 | 0.915424 | precision |
| Mean | 0.964429 | 0.823377 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.970732 | 0.946289 | recall   |
| 1    | 0.85618  | 0.916667 | recall   |
| 2    | 0.746269 | 0.930435 | recall   |
| 3    | 0.973749 | 0.817829 | recall   |
| 4    | 0.968966 | 0.949042 | recall   |
| Mean | 0.903179 | 0.912052 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.975661 | 0.933077 | f1-score |
| 1    | 0.910394 | 0.763407 | f1-score |
| 2    | 0.826446 | 0.835938 | f1-score |
| 3    | 0.967795 | 0.842315 | f1-score |
| 4    | 0.975304 | 0.93193  | f1

In [103]:
# Preprocess mode = 1, lower = true & remove_stopwords = true, remove_punctuation = True
model = FastText(vector_size=500, window=5, min_count=10, workers=6)

X_train_temp = preprocess_sentences(X_train, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=False)
X_train_temp = [' '.join(token for token in sentence) for sentence in X_train_temp]
X_test_temp = preprocess_sentences(X_test, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=False)
X_test_temp = [' '.join(token for token in sentence) for sentence in X_test_temp]
X_1_temp = preprocess_sentences(X_1, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=False)
X_1_temp = [' '.join(token for token in sentence) for sentence in X_1_temp]
X_2_temp = preprocess_sentences(X_2, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=False)
X_2_temp = [' '.join(token for token in sentence) for sentence in X_2_temp]
X_3_temp = preprocess_sentences(X_3, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=False)
X_3_temp = [' '.join(token for token in sentence) for sentence in X_3_temp]
X_temp = preprocess_sentences(X, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=False)
X_temp = [' '.join(token for token in sentence) for sentence in X_temp]

model.build_vocab(X_train_temp)

model.train(X_train_temp, total_examples=len(X_train_temp), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_train_temp]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_test_temp]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_1_temp]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_2_temp]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_3_temp]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_temp]

svm = SVC()
svm.fit(X_train_ft, y_train)
y_pred = svm.predict(X_test_ft)
y_pred_1 = svm.predict(X_1_ft)
y_pred_2 = svm.predict(X_2_ft)
y_pred_3 = svm.predict(X_3_ft)
y_pred_4 = svm.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.97567  | 0.902647 | precision |
| 1    | 0.972678 | 0.578199 | precision |
| 2    | 0.896226 | 0.727273 | precision |
| 3    | 0.961601 | 0.833992 | precision |
| 4    | 0.977993 | 0.896928 | precision |
| Mean | 0.956834 | 0.787808 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.964111 | 0.932617 | recall   |
| 1    | 0.8      | 0.924242 | recall   |
| 2    | 0.708955 | 0.904348 | recall   |
| 3    | 0.965546 | 0.817829 | recall   |
| 4    | 0.961814 | 0.938851 | recall   |
| Mean | 0.880085 | 0.903578 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.969856 | 0.917387 | f1-score |
| 1    | 0.877928 | 0.71137  | f1-score |
| 2    | 0.791667 | 0.806202 | f1-score |
| 3    | 0.963569 | 0.825832 | f1-score |
| 4    | 0.969836 | 0.917411 | f1

In [104]:
# Preprocess mode = 1, lower = true & remove_stopwords = true, remove_punctuation = True, lemmatize = True
model = FastText(vector_size=500, window=5, min_count=10, workers=6)

X_train_temp = preprocess_sentences(X_train, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_train_temp = [' '.join(token for token in sentence) for sentence in X_train_temp]
X_test_temp = preprocess_sentences(X_test, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_test_temp = [' '.join(token for token in sentence) for sentence in X_test_temp]
X_1_temp = preprocess_sentences(X_1, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_1_temp = [' '.join(token for token in sentence) for sentence in X_1_temp]
X_2_temp = preprocess_sentences(X_2, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_2_temp = [' '.join(token for token in sentence) for sentence in X_2_temp]
X_3_temp = preprocess_sentences(X_3, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_3_temp = [' '.join(token for token in sentence) for sentence in X_3_temp]
X_temp = preprocess_sentences(X, labels=None, mode=1, lower=True, remove_stopwords=True, remove_punctuation=True, lemmatize=True)
X_temp = [' '.join(token for token in sentence) for sentence in X_temp]

model.build_vocab(X_train_temp)

model.train(X_train_temp, total_examples=len(X_train_temp), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_train_temp]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_test_temp]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_1_temp]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_2_temp]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_3_temp]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_temp]

svm = SVC()
svm.fit(X_train_ft, y_train)
y_pred = svm.predict(X_test_ft)
y_pred_1 = svm.predict(X_1_ft)
y_pred_2 = svm.predict(X_2_ft)
y_pred_3 = svm.predict(X_3_ft)
y_pred_4 = svm.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.977385 | 0.902256 | precision |
| 1    | 0.970027 | 0.57619  | precision |
| 2    | 0.899083 | 0.742857 | precision |
| 3    | 0.961601 | 0.833992 | precision |
| 4    | 0.979412 | 0.896725 | precision |
| Mean | 0.957502 | 0.790404 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.963763 | 0.9375   | recall   |
| 1    | 0.8      | 0.916667 | recall   |
| 2    | 0.731343 | 0.904348 | recall   |
| 3    | 0.965546 | 0.817829 | recall   |
| 4    | 0.961565 | 0.942892 | recall   |
| Mean | 0.884443 | 0.903847 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.970526 | 0.91954  | f1-score |
| 1    | 0.876847 | 0.707602 | f1-score |
| 2    | 0.806584 | 0.815686 | f1-score |
| 3    | 0.963569 | 0.825832 | f1-score |
| 4    | 0.970406 | 0.919229 | f1

In [40]:
X_train_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_train)]
X_test_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_test)]

X_1_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_1)]
X_2_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_2)]
X_3_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_3)]
X_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X)]

100%|██████████| 15573/15573 [00:00<00:00, 90872.90it/s]
100%|██████████| 3894/3894 [00:00<00:00, 101318.35it/s]
100%|██████████| 577/577 [00:00<00:00, 102586.30it/s]
100%|██████████| 249/249 [00:00<00:00, 52109.65it/s]
100%|██████████| 1476/1476 [00:00<00:00, 88529.69it/s]
100%|██████████| 21769/21769 [00:00<00:00, 105235.51it/s]


In [41]:
from gensim.models.fasttext import FastText

model = FastText(vector_size=500, window=5, min_count=10, workers=6)

model.build_vocab(X_train_temp)

model.train(X_train_temp, total_examples=len(X_train_temp), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_train_temp)]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_test_temp)]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_1_temp)]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_2_temp)]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_3_temp)]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_temp)]

100%|██████████| 15573/15573 [00:13<00:00, 1159.99it/s]
100%|██████████| 3894/3894 [00:03<00:00, 1202.50it/s]
100%|██████████| 577/577 [00:00<00:00, 1194.95it/s]
100%|██████████| 249/249 [00:00<00:00, 826.28it/s] 
100%|██████████| 1476/1476 [00:00<00:00, 1583.30it/s]
100%|██████████| 21769/21769 [00:18<00:00, 1204.99it/s]


In [42]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_ft, y_train)
y_pred = svm.predict(X_test_ft)
y_pred_1 = svm.predict(X_1_ft)
y_pred_2 = svm.predict(X_2_ft)
y_pred_3 = svm.predict(X_3_ft)
y_pred_4 = svm.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.979809 | 0.902894 | precision |
| 1    | 0.984    | 0.623762 | precision |
| 2    | 0.962264 | 0.776224 | precision |
| 3    | 0.969547 | 0.827586 | precision |
| 4    | 0.981653 | 0.896959 | precision |
| Mean | 0.975455 | 0.805485 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.963763 | 0.944336 | recall   |
| 1    | 0.829213 | 0.954545 | recall   |
| 2    | 0.761194 | 0.965217 | recall   |
| 3    | 0.963205 | 0.853755 | recall   |
| 4    | 0.96145  | 0.949173 | recall   |
| Mean | 0.895765 | 0.933405 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.97172  | 0.92315  | f1-score |
| 1    | 0.9      | 0.754491 | f1-score |
| 2    | 0.85     | 0.860465 | f1-score |
| 3    | 0.966366 | 0.840467 | f1-score |
| 4    | 0.971447 | 0.922328 | f1

In [None]:
# Apply grid search to FastText parameters to get the best embedding parameters.
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from gensim.sklearn_api.ftmodel import FastText

model = FastText(vector_size=300, window=5, min_count=10, workers=4)
pipeline = Pipeline([
    ('ft', model),
    ('lr', LogisticRegression())
])


param_grid = {
    'ft__vector_size': [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'ft__window': [1, 3, 5, 7, 9, 11, 13, 15],
    'ft__min_count': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
}

grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train,y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.cv_results_)

## BERT (Sentence Transformer)

In [5]:
import sentence_transformers as st

class BertSentenceEmbedder():
    '''
    An interface for converting given text into sentence BERT embeddings.
    '''
    def __init__(self):
        '''
        Load the pre-trained model and tokenizer
        '''
        self.model = st.SentenceTransformer('all-MiniLM-L6-v2')

    def embed(self, sentences):
        '''
        Convert the given sentences into BERT embeddings.
        :param sentences: A list of sentences to convert into BERT embeddings.
        :return: A list of BERT embeddings for the given sentences.
        '''
        return self.model.encode(sentences, convert_to_tensor=True)

embedder = BertSentenceEmbedder()

In [16]:
X_train_st = [embedder.embed(sentence) for sentence in tqdm(X_train)]
X_test_st = [embedder.embed(sentence) for sentence in tqdm(X_test)]

X_1_st = [embedder.embed(sentence) for sentence in tqdm(X_1)]
X_2_st = [embedder.embed(sentence) for sentence in tqdm(X_2)]
X_3_st = [embedder.embed(sentence) for sentence in tqdm(X_3)]
X_st = [embedder.embed(sentence) for sentence in tqdm(X)]

100%|██████████| 15573/15573 [02:01<00:00, 128.12it/s]
100%|██████████| 3894/3894 [00:29<00:00, 130.88it/s]
100%|██████████| 577/577 [00:04<00:00, 133.46it/s]
100%|██████████| 249/249 [00:01<00:00, 128.29it/s]
100%|██████████| 1476/1476 [00:11<00:00, 132.90it/s]
100%|██████████| 21769/21769 [02:46<00:00, 131.10it/s]


In [19]:
# go back to cpu
X_train_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_train_st])
X_train_st = X_train_st.reshape(-1, 1)
X_test_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_test_st])
X_test_st = X_test_st.reshape(-1, 1)
X_1_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_1_st])
X_1_st = X_1_st.reshape(-1, 1)
X_2_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_2_st])
X_2_st = X_2_st.reshape(-1, 1)
X_3_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_3_st])
X_3_st = X_3_st.reshape(-1, 1)
X_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_st])
X_st = X_st.reshape(-1, 1)

In [20]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_st, y_train)
y_pred = svm.predict(X_test_st)
y_pred_1 = svm.predict(X_1_st)
y_pred_2 = svm.predict(X_2_st)
y_pred_3 = svm.predict(X_3_st)
y_pred_4 = svm.predict(X_st)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.758685 | 0.926829 | precision |
| 1    | 0.771777 | 0.333333 | precision |
| 2    | 0.536585 | 0.333333 | precision |
| 3    | 0.83871  | 0.947368 | precision |
| 4    | 0.758919 | 0.930709 | precision |
| Mean | 0.73     | 0.69     | precision |
## Recall
|      |        0 |          1 | Metric   |
|:-----|---------:|-----------:|:---------|
| 0    | 0.996864 | 0.111328   | recall   |
| 1    | 0.995506 | 0.00757576 | recall   |
| 2    | 0.985075 | 0.00869565 | recall   |
| 3    | 0.999182 | 0.0711462  | recall   |
| 4    | 0.997264 | 0.10394    | recall   |
| Mean | 0.99     | 0.06       | recall   |
## F1-score
|      |        0 |         1 | Metric   |
|:-----|---------:|----------:|:---------|
| 0    | 0.861617 | 0.198779  | f1-score |
| 1    | 0.86948  | 0.0148148 | f1-score |
| 2    | 0.694737 | 0.0169492 | f1-score |
| 3    | 0.91194  | 0.132353  | f1-score |
| 4    | 0.

## Word2Vec

In [43]:
from gensim.models import Word2Vec
import numpy as np

model = Word2Vec(vector_size=500, window=5, min_count=1, workers=6)
model.build_vocab(X_train)
model.train(X_train, total_examples=len(X_train), epochs=10)

(4669100, 15551690)

In [44]:
def get_word2vec_sentence_embeddings(sentences, model):
    sentence_embeddings = []
    for sentence in tqdm(sentences):
        word_vectors = []
        for word in sentence.split():
            try:
                word_vectors.append(model.wv.get_vector(word.lower()))
            except KeyError:
                word_vectors.append(np.zeros(500))
        if word_vectors:
            sentence_embeddings.append(np.mean(np.mean(word_vectors, axis=0), axis=0))
    sentence_embeddings = np.array(sentence_embeddings)
    sentence_embeddings = sentence_embeddings.reshape(-1, 1)
    return sentence_embeddings

In [45]:
X_train_word2vec = get_word2vec_sentence_embeddings(X_train, model) 
X_test_word2vec = get_word2vec_sentence_embeddings(X_test, model)

X_1_word2vec = get_word2vec_sentence_embeddings(X_1, model)
X_2_word2vec = get_word2vec_sentence_embeddings(X_2, model)
X_3_word2vec = get_word2vec_sentence_embeddings(X_3, model)
X_word2vec = get_word2vec_sentence_embeddings(X, model)

100%|██████████| 15573/15573 [00:01<00:00, 15093.93it/s]
100%|██████████| 3894/3894 [00:00<00:00, 14176.28it/s]
100%|██████████| 577/577 [00:00<00:00, 13687.34it/s]
100%|██████████| 249/249 [00:00<00:00, 12501.28it/s]
100%|██████████| 1476/1476 [00:00<00:00, 16481.57it/s]
100%|██████████| 21769/21769 [00:01<00:00, 15875.40it/s]


In [46]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_word2vec, y_train)
y_pred = svm.predict(X_test_word2vec)
y_pred_1 = svm.predict(X_1_word2vec)
y_pred_2 = svm.predict(X_2_word2vec)
y_pred_3 = svm.predict(X_3_word2vec)
y_pred_4 = svm.predict(X_word2vec)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.765232 | 0.781095 | precision |
| 1    | 0.766478 | 0.173913 | precision |
| 2    | 0.59276  | 0.892857 | precision |
| 3    | 0.854093 | 0.676056 | precision |
| 4    | 0.768065 | 0.767227 | precision |
| Mean | 0.749325 | 0.65823  | precision |
## Recall
|      |        0 |         1 | Metric   |
|:-----|---------:|----------:|:---------|
| 0    | 0.984669 | 0.15332   | recall   |
| 1    | 0.914607 | 0.0606061 | recall   |
| 2    | 0.977612 | 0.217391  | recall   |
| 3    | 0.981194 | 0.189723  | recall   |
| 4    | 0.982777 | 0.16057   | recall   |
| Mean | 0.968172 | 0.156322  | recall   |
## F1-score
|      |        0 |         1 | Metric   |
|:-----|---------:|----------:|:---------|
| 0    | 0.861192 | 0.256327  | f1-score |
| 1    | 0.834016 | 0.0898876 | f1-score |
| 2    | 0.738028 | 0.34965   | f1-score |
| 3    | 0.913242 | 0.296296  | f1-score |
| 4    | 0.862255 |