In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np
from tqdm import tqdm
import scipy

In [50]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/jimbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jimbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
def preprocess_sentences(sentences, labels, mode=-1):
    sentences = sentences.to_list()
    labels = labels.to_list()
    # Remove stopwords and extra spaces
    if mode == 0:
        for i, sentence in enumerate(sentences):
            for word in sentence.split():
                if word in stop_words:
                    sentence = sentence.replace(word, '')
                else:
                    sentence = sentence.replace(word, word.lower())
            sentence = sentence.replace('  ', ' ')
            sentences[i] = sentence
    # Replace (c), (C), © with COPYRIGHT_SYMBOL then use the word_tokenize function instaed of split + previous
    elif mode > 0:
        for i, sentence in enumerate(sentences):
            sentence = sentence.replace('(c)', 'COPYRIGHT_SYMBOL') # (c) -> '(' 'c' ')
            sentence = sentence.replace('(C)', 'COPYRIGHT_SYMBOL')
            sentence = sentence.replace('©', 'COPYRIGHT_SYMBOL')
            tokens = word_tokenize(sentence)
            if mode > 1: # Change tokens to lower case
                tokens = [token.lower() for token in tokens]
            if mode > 2: # Apply lemmatization
                tokens = [lemmatizer.lemmatize(token) for token in tokens]
            sentences[i] = tokens
    return sentences, labels

In [51]:
data_0 = pd.read_csv('../cleared_datasets/fossology-master.csv')
X_0 = data_0["copyright"]
y_0 = data_0["falsePositive"]
X_0 = X_0.drop_duplicates()
y_0 = y_0[X_0.index]

data_1 = pd.read_csv('../cleared_datasets/kubernetes-master.csv')
X_1 = data_1["copyright"]
y_1 = data_1["falsePositive"]
X_1 = X_1.drop_duplicates()
y_1 = y_1[X_1.index]

data_2 = pd.read_csv('../cleared_datasets/tensorflow-master.csv')
X_2 = data_2["copyright"]
y_2 = data_2["falsePositive"]
X_2 = X_2.drop_duplicates()
y_2 = y_2[X_2.index]

data_3 = pd.read_csv('../Fossology-Provided-Dataset-1.csv')

X_3 = data_3['scanner_content']
y_3 = data_3['falsePositive']
X_3 = X_3.drop_duplicates()
y_3 = y_3[X_3.index]

X = pd.concat([X_0, X_1, X_2, X_3])
y = pd.concat([y_0, y_1, y_2, y_3])

print('Class 0 Percentage: ', len(y[y == 0]) / len(y))
print('Class 1 Percentage: ', len(y[y == 1]) / len(y))

X_train, X_test, y_train, y_test = train_test_split(X_0, y_0, test_size=0.2, random_state=42)

Class 0 Percentage:  0.738802884836235
Class 1 Percentage:  0.261197115163765


In [22]:
def aggregate_reports(reports):
    import pandas as pd
    import numpy as np
    dfs = []
    for metric in ['precision', 'recall', 'f1-score']:
        scores = []
        for report in reports:
            scores.append([report['0'][metric], report['1'][metric]])
        scores = np.array(scores)
        scores = scores[:, :2]
        mean_scores = np.mean(scores, axis=0)
        mean_scores = [f"{score:.6f}" for score in mean_scores]
        df = pd.DataFrame(scores, columns=['0', '1'])
        df.loc['Mean'] = mean_scores
        df['Metric'] = metric
        dfs.append(df)
    print("## Precision")
    print(dfs[0].to_markdown())
    print("## Recall")
    print(dfs[1].to_markdown())
    print("## F1-score")
    print(dfs[2].to_markdown())

## Bag Of Words

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

X_1_bow = vectorizer.transform(X_1)

X_2_bow = vectorizer.transform(X_2)

X_3_bow = vectorizer.transform(X_3)

X_bow = vectorizer.transform(X)

In [24]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_bow, y_train)
y_pred = svm.predict(X_test_bow)
y_pred_1 = svm.predict(X_1_bow)
y_pred_2 = svm.predict(X_2_bow)
y_pred_3 = svm.predict(X_3_bow)
y_pred_4 = svm.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.994346 | 0.947368 | precision |
| 1    | 0.986911 | 0.651282 | precision |
| 2    | 1        | 0.751634 | precision |
| 3    | 1        | 0.893993 | precision |
| 4    | 0.996347 | 0.955356 | precision |
| Mean | 0.995521 | 0.839927 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.980488 | 0.984375 | recall   |
| 1    | 0.847191 | 0.962121 | recall   |
| 2    | 0.716418 | 1        | recall   |
| 3    | 0.97547  | 1        | recall   |
| 4    | 0.983647 | 0.9898   | recall   |
| Mean | 0.900643 | 0.987259 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.987368 | 0.965517 | f1-score |
| 1    | 0.911729 | 0.776758 | f1-score |
| 2    | 0.834783 | 0.858209 | f1-score |
| 3    | 0.987583 | 0.94403  | f1-score |
| 4    | 0.989957 | 0.972273 | f1

## TF-IDF

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_1_tfidf = vectorizer.transform(X_1)

X_2_tfidf = vectorizer.transform(X_2)

X_3_tfidf = vectorizer.transform(X_3)

X_tfidf = vectorizer.transform(X)

In [None]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict_proba(X_test_tfidf)
y_pred = y_pred

In [16]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_tfidf, y_train)
y_pred = svm.predict(X_test_tfidf)
y_pred_1 = svm.predict(X_1_tfidf)
y_pred_2 = svm.predict(X_2_tfidf)
y_pred_3 = svm.predict(X_3_tfidf)
y_pred_4 = svm.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.991262 | 0.967086 | precision |
| 1    | 0.97284  | 0.703488 | precision |
| 2    | 0.945312 | 0.892562 | precision |
| 3    | 0.991701 | 0.896679 | precision |
| 4    | 0.995004 | 0.974109 | precision |
| Mean | 0.979224 | 0.886785 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.988153 | 0.975586 | recall   |
| 1    | 0.885393 | 0.916667 | recall   |
| 2    | 0.902985 | 0.93913  | recall   |
| 3    | 0.977105 | 0.960474 | recall   |
| 4    | 0.990736 | 0.98593  | recall   |
| Mean | 0.948875 | 0.955558 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.989705 | 0.971317 | f1-score |
| 1    | 0.927059 | 0.796053 | f1-score |
| 2    | 0.923664 | 0.915254 | f1-score |
| 3    | 0.984349 | 0.927481 | f1-score |
| 4    | 0.992865 | 0.979984 | f1

## GloVe

In [25]:
# Load GloVe embeddings
import numpy as np
def load_glove(file):
    """Load GloVe embeddings from a text file.
    Args:
        file (str): path to the glove file.
    Returns:
        dict: a dictionary mapping words to their vector representations.
    """
    embeddings = {}
    with open(file) as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove50 = load_glove('../glove.6B/glove.6B.50d.txt')
glove100 = load_glove('../glove.6B/glove.6B.100d.txt')
glove200 = load_glove('../glove.6B/glove.6B.200d.txt')
glove300 = load_glove('../glove.6B/glove.6B.300d.txt')

In [29]:
def sentences_to_embeddings(sentences, embeddings):
    """
        Convert a list of sentences into a matrix of embeddings. 
        
        Args:
            sentences (list): a list of strings, each representing a sentence.
            embeddings (dict): a dictionary mapping words to their vector representations.

        Returns: 
            np.array: a 2D array of shape (len(sentences), len(embeddings[word])), where each
                      row is the average of the word vectors in the sentence.
    """

    matrix = []
    for sentence in sentences:
        words = sentence.split()
        vectors = [embeddings.get(word.lower(), np.zeros(len(embeddings['the']))) for word in words] 
        mean = np.mean(vectors, axis=0) 
        matrix.append(mean)
    return np.array(matrix)

In [30]:
X_train_glove50 = sentences_to_embeddings(X_train, glove50) 
X_test_glove50 = sentences_to_embeddings(X_test, glove50)

X_1_glove50 = sentences_to_embeddings(X_1, glove50)
X_2_glove50 = sentences_to_embeddings(X_2, glove50)
X_3_glove50 = sentences_to_embeddings(X_3, glove50)
X_glove50 = sentences_to_embeddings(X, glove50)

In [31]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_glove50, y_train)
y_pred = svm.predict(X_test_glove50)
y_pred_1 = svm.predict(X_1_glove50)
y_pred_2 = svm.predict(X_2_glove50)
y_pred_3 = svm.predict(X_3_glove50)
y_pred_4 = svm.predict(X_glove50)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.971258 | 0.904899 | precision |
| 1    | 0.986339 | 0.601896 | precision |
| 2    | 0.915254 | 0.801527 | precision |
| 3    | 0.995847 | 0.911765 | precision |
| 4    | 0.979161 | 0.902747 | precision |
| Mean | 0.969572 | 0.824567 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.965505 | 0.919922 | recall   |
| 1    | 0.811236 | 0.962121 | recall   |
| 2    | 0.80597  | 0.913043 | recall   |
| 3    | 0.980376 | 0.980237 | recall   |
| 4    | 0.964124 | 0.941963 | recall   |
| Mean | 0.905442 | 0.943457 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.968373 | 0.912349 | f1-score |
| 1    | 0.890259 | 0.740525 | f1-score |
| 2    | 0.857143 | 0.853659 | f1-score |
| 3    | 0.988051 | 0.944762 | f1-score |
| 4    | 0.971584 | 0.921938 | f1

In [32]:
X_train_glove100 = sentences_to_embeddings(X_train, glove100) 
X_test_glove100 = sentences_to_embeddings(X_test, glove100)

X_1_glove100 = sentences_to_embeddings(X_1, glove100)
X_2_glove100 = sentences_to_embeddings(X_2, glove100)
X_3_glove100 = sentences_to_embeddings(X_3, glove100)
X_glove100 = sentences_to_embeddings(X, glove100)

In [33]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_glove100, y_train)
y_pred = svm.predict(X_test_glove100)
y_pred_1 = svm.predict(X_1_glove100)
y_pred_2 = svm.predict(X_2_glove100)
y_pred_3 = svm.predict(X_3_glove100)
y_pred_4 = svm.predict(X_glove100)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.977528 | 0.917782 | precision |
| 1    | 0.982005 | 0.664894 | precision |
| 2    | 1        | 0.771812 | precision |
| 3    | 0.9975   | 0.905797 | precision |
| 4    | 0.983936 | 0.921289 | precision |
| Mean | 0.988194 | 0.836315 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.970035 | 0.9375   | recall   |
| 1    | 0.858427 | 0.94697  | recall   |
| 2    | 0.746269 | 1        | recall   |
| 3    | 0.978741 | 0.988142 | recall   |
| 4    | 0.97115  | 0.955153 | recall   |
| Mean | 0.904924 | 0.965553 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.973767 | 0.927536 | f1-score |
| 1    | 0.916067 | 0.78125  | f1-score |
| 2    | 0.854701 | 0.871212 | f1-score |
| 3    | 0.988031 | 0.94518  | f1-score |
| 4    | 0.977501 | 0.937916 | f1

In [34]:
X_train_glove200 = sentences_to_embeddings(X_train, glove200) 
X_test_glove200 = sentences_to_embeddings(X_test, glove200)

X_1_glove200 = sentences_to_embeddings(X_1, glove200)
X_2_glove200 = sentences_to_embeddings(X_2, glove200)
X_3_glove200 = sentences_to_embeddings(X_3, glove200)
X_glove200 = sentences_to_embeddings(X, glove200)

In [35]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_glove200, y_train)
y_pred = svm.predict(X_test_glove200)
y_pred_1 = svm.predict(X_1_glove200)
y_pred_2 = svm.predict(X_2_glove200)
y_pred_3 = svm.predict(X_3_glove200)
y_pred_4 = svm.predict(X_glove200)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.98208  | 0.928435 | precision |
| 1    | 0.984772 | 0.688525 | precision |
| 2    | 0.924528 | 0.748252 | precision |
| 3    | 0.9975   | 0.905797 | precision |
| 4    | 0.985766 | 0.926838 | precision |
| Mean | 0.974929 | 0.839569 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.973868 | 0.950195 | recall   |
| 1    | 0.87191  | 0.954545 | recall   |
| 2    | 0.731343 | 0.930435 | recall   |
| 3    | 0.978741 | 0.988142 | recall   |
| 4    | 0.973202 | 0.960253 | recall   |
| Mean | 0.905813 | 0.956714 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.977957 | 0.939189 | f1-score |
| 1    | 0.924911 | 0.8      | f1-score |
| 2    | 0.816667 | 0.829457 | f1-score |
| 3    | 0.988031 | 0.94518  | f1-score |
| 4    | 0.979444 | 0.94325  | f1

In [36]:
X_train_glove300 = sentences_to_embeddings(X_train, glove300) 
X_test_glove300 = sentences_to_embeddings(X_test, glove300)

X_1_glove300 = sentences_to_embeddings(X_1, glove300)
X_2_glove300 = sentences_to_embeddings(X_2, glove300)
X_3_glove300 = sentences_to_embeddings(X_3, glove300)
X_glove300 = sentences_to_embeddings(X, glove300)

In [37]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_glove300, y_train)
y_pred = svm.predict(X_test_glove300)
y_pred_1 = svm.predict(X_1_glove300)
y_pred_2 = svm.predict(X_2_glove300)
y_pred_3 = svm.predict(X_3_glove300)
y_pred_4 = svm.predict(X_glove300)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.981754 | 0.931034 | precision |
| 1    | 0.989691 | 0.677249 | precision |
| 2    | 0.888889 | 0.730496 | precision |
| 3    | 0.99584  | 0.905109 | precision |
| 4    | 0.986601 | 0.93205  | precision |
| Mean | 0.968555 | 0.835188 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.974913 | 0.949219 | recall   |
| 1    | 0.862921 | 0.969697 | recall   |
| 2    | 0.716418 | 0.895652 | recall   |
| 3    | 0.978741 | 0.980237 | recall   |
| 4    | 0.975191 | 0.96254  | recall   |
| Mean | 0.901637 | 0.951469 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.978322 | 0.940039 | f1-score |
| 1    | 0.921969 | 0.797508 | f1-score |
| 2    | 0.793388 | 0.804688 | f1-score |
| 3    | 0.987216 | 0.941176 | f1-score |
| 4    | 0.980863 | 0.94705  | f1

## FastText

In [38]:
from gensim.models.fasttext import FastText, load_facebook_model

model = FastText(vector_size=500, window=5, min_count=10, workers=6)

model.build_vocab(X_train)

model.train(X_train, total_examples=len(X_train), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_train]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_test]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_1]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_2]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in X_3]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in X]

In [39]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_ft, y_train)
y_pred = svm.predict(X_test_ft)
y_pred_1 = svm.predict(X_1_ft)
y_pred_2 = svm.predict(X_2_ft)
y_pred_3 = svm.predict(X_3_ft)
y_pred_4 = svm.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.980966 | 0.917692 | precision |
| 1    | 0.97954  | 0.666667 | precision |
| 2    | 0.973913 | 0.835821 | precision |
| 3    | 0.978477 | 0.847015 | precision |
| 4    | 0.984705 | 0.91542  | precision |
| Mean | 0.97952  | 0.836523 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.969686 | 0.947266 | recall   |
| 1    | 0.860674 | 0.939394 | recall   |
| 2    | 0.835821 | 0.973913 | recall   |
| 3    | 0.966476 | 0.897233 | recall   |
| 4    | 0.968725 | 0.957439 | recall   |
| Mean | 0.920276 | 0.943049 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.975293 | 0.932244 | f1-score |
| 1    | 0.916268 | 0.779874 | f1-score |
| 2    | 0.899598 | 0.899598 | f1-score |
| 3    | 0.972439 | 0.871401 | f1-score |
| 4    | 0.976649 | 0.935958 | f1

In [40]:
X_train_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_train)]
X_test_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_test)]

X_1_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_1)]
X_2_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_2)]
X_3_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_3)]
X_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X)]

100%|██████████| 15573/15573 [00:00<00:00, 90872.90it/s]
100%|██████████| 3894/3894 [00:00<00:00, 101318.35it/s]
100%|██████████| 577/577 [00:00<00:00, 102586.30it/s]
100%|██████████| 249/249 [00:00<00:00, 52109.65it/s]
100%|██████████| 1476/1476 [00:00<00:00, 88529.69it/s]
100%|██████████| 21769/21769 [00:00<00:00, 105235.51it/s]


In [41]:
from gensim.models.fasttext import FastText

model = FastText(vector_size=500, window=5, min_count=10, workers=6)

model.build_vocab(X_train_temp)

model.train(X_train_temp, total_examples=len(X_train_temp), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_train_temp)]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_test_temp)]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_1_temp)]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_2_temp)]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_3_temp)]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_temp)]

100%|██████████| 15573/15573 [00:13<00:00, 1159.99it/s]
100%|██████████| 3894/3894 [00:03<00:00, 1202.50it/s]
100%|██████████| 577/577 [00:00<00:00, 1194.95it/s]
100%|██████████| 249/249 [00:00<00:00, 826.28it/s] 
100%|██████████| 1476/1476 [00:00<00:00, 1583.30it/s]
100%|██████████| 21769/21769 [00:18<00:00, 1204.99it/s]


In [42]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_ft, y_train)
y_pred = svm.predict(X_test_ft)
y_pred_1 = svm.predict(X_1_ft)
y_pred_2 = svm.predict(X_2_ft)
y_pred_3 = svm.predict(X_3_ft)
y_pred_4 = svm.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.979809 | 0.902894 | precision |
| 1    | 0.984    | 0.623762 | precision |
| 2    | 0.962264 | 0.776224 | precision |
| 3    | 0.969547 | 0.827586 | precision |
| 4    | 0.981653 | 0.896959 | precision |
| Mean | 0.975455 | 0.805485 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.963763 | 0.944336 | recall   |
| 1    | 0.829213 | 0.954545 | recall   |
| 2    | 0.761194 | 0.965217 | recall   |
| 3    | 0.963205 | 0.853755 | recall   |
| 4    | 0.96145  | 0.949173 | recall   |
| Mean | 0.895765 | 0.933405 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.97172  | 0.92315  | f1-score |
| 1    | 0.9      | 0.754491 | f1-score |
| 2    | 0.85     | 0.860465 | f1-score |
| 3    | 0.966366 | 0.840467 | f1-score |
| 4    | 0.971447 | 0.922328 | f1

In [None]:
# Apply grid search to FastText parameters to get the best embedding parameters.
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from gensim.sklearn_api.ftmodel import FastText

model = FastText(vector_size=300, window=5, min_count=10, workers=4)
pipeline = Pipeline([
    ('ft', model),
    ('lr', LogisticRegression())
])


param_grid = {
    'ft__vector_size': [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'ft__window': [1, 3, 5, 7, 9, 11, 13, 15],
    'ft__min_count': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
}

grid_search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train,y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.cv_results_)

## BERT (Sentence Transformer)

In [5]:
import sentence_transformers as st

class BertSentenceEmbedder():
    '''
    An interface for converting given text into sentence BERT embeddings.
    '''
    def __init__(self):
        '''
        Load the pre-trained model and tokenizer
        '''
        self.model = st.SentenceTransformer('all-MiniLM-L6-v2')

    def embed(self, sentences):
        '''
        Convert the given sentences into BERT embeddings.
        :param sentences: A list of sentences to convert into BERT embeddings.
        :return: A list of BERT embeddings for the given sentences.
        '''
        return self.model.encode(sentences, convert_to_tensor=True)

embedder = BertSentenceEmbedder()

In [16]:
X_train_st = [embedder.embed(sentence) for sentence in tqdm(X_train)]
X_test_st = [embedder.embed(sentence) for sentence in tqdm(X_test)]

X_1_st = [embedder.embed(sentence) for sentence in tqdm(X_1)]
X_2_st = [embedder.embed(sentence) for sentence in tqdm(X_2)]
X_3_st = [embedder.embed(sentence) for sentence in tqdm(X_3)]
X_st = [embedder.embed(sentence) for sentence in tqdm(X)]

100%|██████████| 15573/15573 [02:01<00:00, 128.12it/s]
100%|██████████| 3894/3894 [00:29<00:00, 130.88it/s]
100%|██████████| 577/577 [00:04<00:00, 133.46it/s]
100%|██████████| 249/249 [00:01<00:00, 128.29it/s]
100%|██████████| 1476/1476 [00:11<00:00, 132.90it/s]
100%|██████████| 21769/21769 [02:46<00:00, 131.10it/s]


In [19]:
# go back to cpu
X_train_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_train_st])
X_train_st = X_train_st.reshape(-1, 1)
X_test_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_test_st])
X_test_st = X_test_st.reshape(-1, 1)
X_1_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_1_st])
X_1_st = X_1_st.reshape(-1, 1)
X_2_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_2_st])
X_2_st = X_2_st.reshape(-1, 1)
X_3_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_3_st])
X_3_st = X_3_st.reshape(-1, 1)
X_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_st])
X_st = X_st.reshape(-1, 1)

In [20]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_st, y_train)
y_pred = svm.predict(X_test_st)
y_pred_1 = svm.predict(X_1_st)
y_pred_2 = svm.predict(X_2_st)
y_pred_3 = svm.predict(X_3_st)
y_pred_4 = svm.predict(X_st)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.758685 | 0.926829 | precision |
| 1    | 0.771777 | 0.333333 | precision |
| 2    | 0.536585 | 0.333333 | precision |
| 3    | 0.83871  | 0.947368 | precision |
| 4    | 0.758919 | 0.930709 | precision |
| Mean | 0.73     | 0.69     | precision |
## Recall
|      |        0 |          1 | Metric   |
|:-----|---------:|-----------:|:---------|
| 0    | 0.996864 | 0.111328   | recall   |
| 1    | 0.995506 | 0.00757576 | recall   |
| 2    | 0.985075 | 0.00869565 | recall   |
| 3    | 0.999182 | 0.0711462  | recall   |
| 4    | 0.997264 | 0.10394    | recall   |
| Mean | 0.99     | 0.06       | recall   |
## F1-score
|      |        0 |         1 | Metric   |
|:-----|---------:|----------:|:---------|
| 0    | 0.861617 | 0.198779  | f1-score |
| 1    | 0.86948  | 0.0148148 | f1-score |
| 2    | 0.694737 | 0.0169492 | f1-score |
| 3    | 0.91194  | 0.132353  | f1-score |
| 4    | 0.

## Word2Vec

In [43]:
from gensim.models import Word2Vec
import numpy as np

model = Word2Vec(vector_size=500, window=5, min_count=1, workers=6)
model.build_vocab(X_train)
model.train(X_train, total_examples=len(X_train), epochs=10)

(4669100, 15551690)

In [44]:
def get_word2vec_sentence_embeddings(sentences, model):
    sentence_embeddings = []
    for sentence in tqdm(sentences):
        word_vectors = []
        for word in sentence.split():
            try:
                word_vectors.append(model.wv.get_vector(word.lower()))
            except KeyError:
                word_vectors.append(np.zeros(500))
        if word_vectors:
            sentence_embeddings.append(np.mean(np.mean(word_vectors, axis=0), axis=0))
    sentence_embeddings = np.array(sentence_embeddings)
    sentence_embeddings = sentence_embeddings.reshape(-1, 1)
    return sentence_embeddings

In [45]:
X_train_word2vec = get_word2vec_sentence_embeddings(X_train, model) 
X_test_word2vec = get_word2vec_sentence_embeddings(X_test, model)

X_1_word2vec = get_word2vec_sentence_embeddings(X_1, model)
X_2_word2vec = get_word2vec_sentence_embeddings(X_2, model)
X_3_word2vec = get_word2vec_sentence_embeddings(X_3, model)
X_word2vec = get_word2vec_sentence_embeddings(X, model)

100%|██████████| 15573/15573 [00:01<00:00, 15093.93it/s]
100%|██████████| 3894/3894 [00:00<00:00, 14176.28it/s]
100%|██████████| 577/577 [00:00<00:00, 13687.34it/s]
100%|██████████| 249/249 [00:00<00:00, 12501.28it/s]
100%|██████████| 1476/1476 [00:00<00:00, 16481.57it/s]
100%|██████████| 21769/21769 [00:01<00:00, 15875.40it/s]


In [46]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train_word2vec, y_train)
y_pred = svm.predict(X_test_word2vec)
y_pred_1 = svm.predict(X_1_word2vec)
y_pred_2 = svm.predict(X_2_word2vec)
y_pred_3 = svm.predict(X_3_word2vec)
y_pred_4 = svm.predict(X_word2vec)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.765232 | 0.781095 | precision |
| 1    | 0.766478 | 0.173913 | precision |
| 2    | 0.59276  | 0.892857 | precision |
| 3    | 0.854093 | 0.676056 | precision |
| 4    | 0.768065 | 0.767227 | precision |
| Mean | 0.749325 | 0.65823  | precision |
## Recall
|      |        0 |         1 | Metric   |
|:-----|---------:|----------:|:---------|
| 0    | 0.984669 | 0.15332   | recall   |
| 1    | 0.914607 | 0.0606061 | recall   |
| 2    | 0.977612 | 0.217391  | recall   |
| 3    | 0.981194 | 0.189723  | recall   |
| 4    | 0.982777 | 0.16057   | recall   |
| Mean | 0.968172 | 0.156322  | recall   |
## F1-score
|      |        0 |         1 | Metric   |
|:-----|---------:|----------:|:---------|
| 0    | 0.861192 | 0.256327  | f1-score |
| 1    | 0.834016 | 0.0898876 | f1-score |
| 2    | 0.738028 | 0.34965   | f1-score |
| 3    | 0.913242 | 0.296296  | f1-score |
| 4    | 0.862255 |