In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np
from tqdm import tqdm
import scipy

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /home/jimbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jimbo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def preprocess_sentences(sentences, labels, mode=-1):
    sentences = sentences.to_list()
    labels = labels.to_list()
    # Remove stopwords and extra spaces
    if mode == 0:
        for i, sentence in enumerate(sentences):
            for word in sentence.split():
                if word in stop_words:
                    sentence = sentence.replace(word, '')
                else:
                    sentence = sentence.replace(word, word.lower())
            sentence = sentence.replace('  ', ' ')
            sentences[i] = sentence
    # Replace (c), (C), © with COPYRIGHT_SYMBOL then use the word_tokenize function instaed of split + previous
    elif mode > 0:
        for i, sentence in enumerate(sentences):
            sentence = sentence.replace('(c)', 'COPYRIGHT_SYMBOL')
            sentence = sentence.replace('(C)', 'COPYRIGHT_SYMBOL')
            sentence = sentence.replace('©', 'COPYRIGHT_SYMBOL')
            tokens = word_tokenize(sentence)
            if mode > 1: # Change tokens to lower case
                tokens = [token.lower() for token in tokens]
            if mode > 2: # Apply lemmatization
                tokens = [lemmatizer.lemmatize(token) for token in tokens]
            sentences[i] = tokens
    return sentences, labels

In [3]:
data_0 = pd.read_csv('../cleared_datasets/fossology-master.csv')
X_0 = data_0["copyright"]
y_0 = data_0["falsePositive"]
X_0 = X_0.drop_duplicates()
y_0 = y_0[X_0.index]

data_1 = pd.read_csv('../cleared_datasets/kubernetes-master.csv')
X_1 = data_1["copyright"]
y_1 = data_1["falsePositive"]
X_1 = X_1.drop_duplicates()
y_1 = y_1[X_1.index]

data_2 = pd.read_csv('../cleared_datasets/tensorflow-master.csv')
X_2 = data_2["copyright"]
y_2 = data_2["falsePositive"]
X_2 = X_2.drop_duplicates()
y_2 = y_2[X_2.index]

data_3 = pd.read_csv('../Fossology-Provided-Dataset-1.csv')

X_3 = data_3['scanner_content']
y_3 = data_3['falsePositive']
X_3 = X_3.drop_duplicates()
y_3 = y_3[X_3.index]

X = pd.concat([X_0, X_1, X_2, X_3])
y = pd.concat([y_0, y_1, y_2, y_3])

print('Class 0 Percentage: ', len(y[y == 0]) / len(y))
print('Class 1 Percentage: ', len(y[y == 1]) / len(y))

X_train, X_test, y_train, y_test = train_test_split(X_0, y_0, test_size=0.2, random_state=42)

Class 0 Percentage:  0.738802884836235
Class 1 Percentage:  0.261197115163765


In [4]:
def aggregate_reports(reports):
    import pandas as pd
    import numpy as np
    dfs = []
    for metric in ['precision', 'recall', 'f1-score']:
        scores = []
        for report in reports:
            scores.append([report['0'][metric], report['1'][metric]])
        scores = np.array(scores)
        scores = scores[:, :2]
        mean_scores = np.mean(scores, axis=0)
        mean_scores = [f"{score:.6f}" for score in mean_scores]
        df = pd.DataFrame(scores, columns=['0', '1'])
        df.loc['Mean'] = mean_scores
        df['Metric'] = metric
        dfs.append(df)
    print("## Precision")
    print(dfs[0].to_markdown())
    print("## Recall")
    print(dfs[1].to_markdown())
    print("## F1-score")
    print(dfs[2].to_markdown())

## Bag of Worlds

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

X_1_bow = vectorizer.transform(X_1)

X_2_bow = vectorizer.transform(X_2)

X_3_bow = vectorizer.transform(X_3)

X_bow = vectorizer.transform(X)

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
y_pred_1 = rf.predict(X_1_bow)
y_pred_2 = rf.predict(X_2_bow)
y_pred_3 = rf.predict(X_3_bow)
y_pred_4 = rf.predict(X_bow)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.99361  | 0.934076 | precision |
| 1    | 0.991892 | 0.623188 | precision |
| 2    | 1        | 0.793103 | precision |
| 3    | 1        | 0.76435  | precision |
| 4    | 0.998423 | 0.95625  | precision |
| Mean | 0.996785 | 0.814194 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.975261 | 0.982422 | recall   |
| 1    | 0.824719 | 0.977273 | recall   |
| 2    | 0.776119 | 1        | recall   |
| 3    | 0.936222 | 1        | recall   |
| 4    | 0.983896 | 0.995603 | recall   |
| Mean | 0.899244 | 0.99106  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.98435  | 0.957639 | f1-score |
| 1    | 0.900613 | 0.761062 | f1-score |
| 2    | 0.87395  | 0.884615 | f1-score |
| 3    | 0.967061 | 0.866438 | f1-score |
| 4    | 0.991106 | 0.97553  | f1

## TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_1_tfidf = vectorizer.transform(X_1)

X_2_tfidf = vectorizer.transform(X_2)

X_3_tfidf = vectorizer.transform(X_3)

X_tfidf = vectorizer.transform(X)

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)
y_pred_1 = rf.predict(X_1_tfidf)
y_pred_2 = rf.predict(X_2_tfidf)
y_pred_3 = rf.predict(X_3_tfidf)
y_pred_4 = rf.predict(X_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.988792 | 0.954764 | precision |
| 1    | 0.982234 | 0.68306  | precision |
| 2    | 1        | 0.804196 | precision |
| 3    | 0.99916  | 0.881119 | precision |
| 4    | 0.997367 | 0.970593 | precision |
| Mean | 0.99351  | 0.858746 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.983624 | 0.96875  | recall   |
| 1    | 0.869663 | 0.94697  | recall   |
| 2    | 0.791045 | 1        | recall   |
| 3    | 0.9722   | 0.996047 | recall   |
| 4    | 0.989368 | 0.992613 | recall   |
| Mean | 0.92118  | 0.980876 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.986201 | 0.961706 | f1-score |
| 1    | 0.922527 | 0.793651 | f1-score |
| 2    | 0.883333 | 0.891473 | f1-score |
| 3    | 0.985495 | 0.935065 | f1-score |
| 4    | 0.993351 | 0.98148  | f1

## GloVe

In [23]:
# Load GloVe embeddings
import numpy as np
def load_glove(file):
    """Load GloVe embeddings from a text file.
    Args:
        file (str): path to the glove file.
    Returns:
        dict: a dictionary mapping words to their vector representations.
    """
    embeddings = {}
    with open(file) as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove50 = load_glove('../glove.6B/glove.6B.50d.txt')
glove100 = load_glove('../glove.6B/glove.6B.100d.txt')
glove200 = load_glove('../glove.6B/glove.6B.200d.txt')
glove300 = load_glove('../glove.6B/glove.6B.300d.txt')

In [24]:
def sentences_to_embeddings(sentences, embeddings):
    """
        Convert a list of sentences into a matrix of embeddings. 
        
        Args:
            sentences (list): a list of strings, each representing a sentence.
            embeddings (dict): a dictionary mapping words to their vector representations.

        Returns: 
            np.array: a 2D array of shape (len(sentences), len(embeddings[word])), where each
                      row is the average of the word vectors in the sentence.
    """

    matrix = []
    for sentence in sentences:
        words = sentence.split()
        vectors = [embeddings.get(word.lower(), np.zeros(len(embeddings['the']))) for word in words] 
        mean = np.mean(vectors, axis=0) 
        matrix.append(mean)
    return np.array(matrix)

In [25]:
X_train_glove50 = sentences_to_embeddings(X_train, glove50) 
X_test_glove50 = sentences_to_embeddings(X_test, glove50)

X_1_glove50 = sentences_to_embeddings(X_1, glove50)
X_2_glove50 = sentences_to_embeddings(X_2, glove50)
X_3_glove50 = sentences_to_embeddings(X_3, glove50)
X_glove50 = sentences_to_embeddings(X, glove50)

In [26]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_glove50, y_train)
y_pred = rf.predict(X_test_glove50)
y_pred_1 = rf.predict(X_1_glove50)
y_pred_2 = rf.predict(X_2_glove50)
y_pred_3 = rf.predict(X_3_glove50)
y_pred_4 = rf.predict(X_glove50)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.967185 | 0.92993  | precision |
| 1    | 0.979651 | 0.536481 | precision |
| 2    | 0.875    | 0.737226 | precision |
| 3    | 0.995851 | 0.915129 | precision |
| 4    | 0.990792 | 0.954342 | precision |
| Mean | 0.961696 | 0.814622 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.97561  | 0.907227 | recall   |
| 1    | 0.757303 | 0.94697  | recall   |
| 2    | 0.731343 | 0.878261 | recall   |
| 3    | 0.981194 | 0.980237 | recall   |
| 4    | 0.983523 | 0.974147 | recall   |
| Mean | 0.885795 | 0.937368 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.971379 | 0.918438 | f1-score |
| 1    | 0.854246 | 0.684932 | f1-score |
| 2    | 0.796748 | 0.801587 | f1-score |
| 3    | 0.988468 | 0.946565 | f1-score |
| 4    | 0.987144 | 0.964143 | f1

In [27]:
X_train_glove100 = sentences_to_embeddings(X_train, glove100) 
X_test_glove100 = sentences_to_embeddings(X_test, glove100)

X_1_glove100 = sentences_to_embeddings(X_1, glove100)
X_2_glove100 = sentences_to_embeddings(X_2, glove100)
X_3_glove100 = sentences_to_embeddings(X_3, glove100)
X_glove100 = sentences_to_embeddings(X, glove100)

In [28]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_glove100, y_train)
y_pred = rf.predict(X_test_glove100)
y_pred_1 = rf.predict(X_1_glove100)
y_pred_2 = rf.predict(X_2_glove100)
y_pred_3 = rf.predict(X_3_glove100)
y_pred_4 = rf.predict(X_glove100)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.973976 | 0.937747 | precision |
| 1    | 0.968927 | 0.542601 | precision |
| 2    | 1        | 0.761589 | precision |
| 3    | 0.997504 | 0.912409 | precision |
| 4    | 0.992602 | 0.95703  | precision |
| Mean | 0.986602 | 0.822275 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.978049 | 0.926758 | recall   |
| 1    | 0.770787 | 0.916667 | recall   |
| 2    | 0.731343 | 1        | recall   |
| 3    | 0.980376 | 0.988142 | recall   |
| 4    | 0.984456 | 0.979247 | recall   |
| Mean | 0.889002 | 0.962163 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.976008 | 0.93222  | f1-score |
| 1    | 0.858573 | 0.68169  | f1-score |
| 2    | 0.844828 | 0.864662 | f1-score |
| 3    | 0.988866 | 0.948767 | f1-score |
| 4    | 0.988512 | 0.968011 | f1

In [29]:
X_train_glove200 = sentences_to_embeddings(X_train, glove200) 
X_test_glove200 = sentences_to_embeddings(X_test, glove200)

X_1_glove200 = sentences_to_embeddings(X_1, glove200)
X_2_glove200 = sentences_to_embeddings(X_2, glove200)
X_3_glove200 = sentences_to_embeddings(X_3, glove200)
X_glove200 = sentences_to_embeddings(X, glove200)

In [30]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_glove200, y_train)
y_pred = rf.predict(X_test_glove200)
y_pred_1 = rf.predict(X_1_glove200)
y_pred_2 = rf.predict(X_2_glove200)
y_pred_3 = rf.predict(X_3_glove200)
y_pred_4 = rf.predict(X_glove200)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.973282 | 0.935771 | precision |
| 1    | 0.971279 | 0.623711 | precision |
| 2    | 1        | 0.756579 | precision |
| 3    | 0.995021 | 0.911439 | precision |
| 4    | 0.992735 | 0.960179 | precision |
| Mean | 0.986464 | 0.837536 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.977352 | 0.924805 | recall   |
| 1    | 0.835955 | 0.916667 | recall   |
| 2    | 0.723881 | 1        | recall   |
| 3    | 0.980376 | 0.976285 | recall   |
| 4    | 0.985637 | 0.979599 | recall   |
| Mean | 0.90064  | 0.959471 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.975313 | 0.930255 | f1-score |
| 1    | 0.898551 | 0.742331 | f1-score |
| 2    | 0.839827 | 0.861423 | f1-score |
| 3    | 0.987644 | 0.942748 | f1-score |
| 4    | 0.989174 | 0.969792 | f1

In [31]:
X_train_glove300 = sentences_to_embeddings(X_train, glove300) 
X_test_glove300 = sentences_to_embeddings(X_test, glove300)

X_1_glove300 = sentences_to_embeddings(X_1, glove300)
X_2_glove300 = sentences_to_embeddings(X_2, glove300)
X_3_glove300 = sentences_to_embeddings(X_3, glove300)
X_glove300 = sentences_to_embeddings(X, glove300)

In [32]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_glove300, y_train)
y_pred = rf.predict(X_test_glove300)
y_pred_1 = rf.predict(X_1_glove300)
y_pred_2 = rf.predict(X_2_glove300)
y_pred_3 = rf.predict(X_3_glove300)
y_pred_4 = rf.predict(X_glove300)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.970609 | 0.937126 | precision |
| 1    | 0.969152 | 0.638298 | precision |
| 2    | 0.906542 | 0.739437 | precision |
| 3    | 0.996675 | 0.912088 | precision |
| 4    | 0.991683 | 0.961226 | precision |
| Mean | 0.966932 | 0.837635 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.978049 | 0.916992 | recall   |
| 1    | 0.847191 | 0.909091 | recall   |
| 2    | 0.723881 | 0.913043 | recall   |
| 3    | 0.980376 | 0.98419  | recall   |
| 4    | 0.986072 | 0.976609 | recall   |
| Mean | 0.903114 | 0.939985 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.974314 | 0.92695  | f1-score |
| 1    | 0.904077 | 0.75     | f1-score |
| 2    | 0.804979 | 0.817121 | f1-score |
| 3    | 0.988458 | 0.946768 | f1-score |
| 4    | 0.98887  | 0.968856 | f1

## FastText

In [33]:
from gensim.models.fasttext import FastText

model = FastText(vector_size=500, window=5, min_count=10, workers=6)

model.build_vocab(X_train)

model.train(X_train, total_examples=len(X_train), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_train)]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_test)]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_1)]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_2)]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_3)]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X)]

100%|██████████| 15573/15573 [00:14<00:00, 1105.83it/s]
100%|██████████| 3894/3894 [00:03<00:00, 983.33it/s] 
100%|██████████| 577/577 [00:00<00:00, 1004.06it/s]
100%|██████████| 249/249 [00:00<00:00, 808.25it/s] 
100%|██████████| 1476/1476 [00:01<00:00, 1475.24it/s]
100%|██████████| 21769/21769 [00:19<00:00, 1141.69it/s]


In [34]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_ft, y_train)
y_pred = rf.predict(X_test_ft)
y_pred_1 = rf.predict(X_1_ft)
y_pred_2 = rf.predict(X_2_ft)
y_pred_3 = rf.predict(X_3_ft)
y_pred_4 = rf.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.98768  | 0.939221 | precision |
| 1    | 0.987562 | 0.725714 | precision |
| 2    | 0.965812 | 0.840909 | precision |
| 3    | 0.983375 | 0.85348  | precision |
| 4    | 0.995993 | 0.970147 | precision |
| Mean | 0.984085 | 0.865894 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.9777   | 0.96582  | recall   |
| 1    | 0.892135 | 0.962121 | recall   |
| 2    | 0.843284 | 0.965217 | recall   |
| 3    | 0.967294 | 0.920949 | recall   |
| 4    | 0.989243 | 0.988744 | recall   |
| Mean | 0.933931 | 0.96057  | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.982665 | 0.952335 | f1-score |
| 1    | 0.937426 | 0.827362 | f1-score |
| 2    | 0.900398 | 0.898785 | f1-score |
| 3    | 0.975268 | 0.885932 | f1-score |
| 4    | 0.992607 | 0.979357 | f1

In [5]:
X_train_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_train)]
X_test_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_test)]

X_1_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_1)]
X_2_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_2)]
X_3_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X_3)]
X_temp = ["".join(word.lower() for word in sentence) for sentence in tqdm(X)]

100%|██████████| 15573/15573 [00:00<00:00, 92867.90it/s] 
100%|██████████| 3894/3894 [00:00<00:00, 98637.06it/s]
100%|██████████| 577/577 [00:00<00:00, 90685.11it/s]
100%|██████████| 249/249 [00:00<00:00, 62113.82it/s]
100%|██████████| 1476/1476 [00:00<00:00, 99362.69it/s]
100%|██████████| 21769/21769 [00:00<00:00, 86925.29it/s] 


In [42]:
from gensim.models.fasttext import FastText

model = FastText(vector_size=500, window=5, min_count=10, workers=6)

model.build_vocab(X_train_temp)

model.train(X_train_temp, total_examples=len(X_train_temp), epochs=10)

X_train_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_train_temp)]
X_test_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_test_temp)]

X_1_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_1_temp)]
X_2_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_2_temp)]
X_3_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_3_temp)]
X_ft = [model.wv.get_sentence_vector(sentence) for sentence in tqdm(X_temp)]

100%|██████████| 15573/15573 [00:12<00:00, 1201.23it/s]
100%|██████████| 3894/3894 [00:03<00:00, 1115.67it/s]
100%|██████████| 577/577 [00:00<00:00, 1155.23it/s]
100%|██████████| 249/249 [00:00<00:00, 910.85it/s] 
100%|██████████| 1476/1476 [00:00<00:00, 1590.18it/s]
100%|██████████| 21769/21769 [00:18<00:00, 1204.94it/s]


In [43]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_ft, y_train)
y_pred = rf.predict(X_test_ft)
y_pred_1 = rf.predict(X_1_ft)
y_pred_2 = rf.predict(X_2_ft)
y_pred_3 = rf.predict(X_3_ft)
y_pred_4 = rf.predict(X_ft)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.98518  | 0.926415 | precision |
| 1    | 0.981865 | 0.65445  | precision |
| 2    | 0.936937 | 0.782609 | precision |
| 3    | 0.979236 | 0.838235 | precision |
| 4    | 0.99492  | 0.962562 | precision |
| Mean | 0.975628 | 0.832854 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.972822 | 0.958984 | recall   |
| 1    | 0.851685 | 0.94697  | recall   |
| 2    | 0.776119 | 0.93913  | recall   |
| 3    | 0.964023 | 0.901186 | recall   |
| 4    | 0.986445 | 0.985754 | recall   |
| Mean | 0.910219 | 0.946405 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.978962 | 0.942418 | f1-score |
| 1    | 0.912154 | 0.773994 | f1-score |
| 2    | 0.84898  | 0.853755 | f1-score |
| 3    | 0.97157  | 0.868571 | f1-score |
| 4    | 0.990665 | 0.97402  | f1

## BERT (Sentence Transformer)

In [44]:
import sentence_transformers as st

class BertSentenceEmbedder():
    '''
    An interface for converting given text into sentence BERT embeddings.
    '''
    def __init__(self):
        '''
        Load the pre-trained model and tokenizer
        '''
        self.model = st.SentenceTransformer('all-MiniLM-L6-v2')

    def embed(self, sentences):
        '''
        Convert the given sentences into BERT embeddings.
        :param sentences: A list of sentences to convert into BERT embeddings.
        :return: A list of BERT embeddings for the given sentences.
        '''
        return self.model.encode(sentences, convert_to_tensor=True)

embedder = BertSentenceEmbedder()

In [None]:
X_train_st = [embedder.embed(sentence) for sentence in tqdm(X_train)]
X_test_st = [embedder.embed(sentence) for sentence in tqdm(X_test)]

X_1_st = [embedder.embed(sentence) for sentence in tqdm(X_1)]
X_2_st = [embedder.embed(sentence) for sentence in tqdm(X_2)]
X_3_st = [embedder.embed(sentence) for sentence in tqdm(X_3)]
X_st = [embedder.embed(sentence) for sentence in tqdm(X)]

In [None]:
X_train_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_train_st])
X_train_st = X_train_st.reshape(-1, 1)
X_test_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_test_st])
X_test_st = X_test_st.reshape(-1, 1)
X_1_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_1_st])
X_1_st = X_1_st.reshape(-1, 1)
X_2_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_2_st])
X_2_st = X_2_st.reshape(-1, 1)
X_3_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_3_st])
X_3_st = X_3_st.reshape(-1, 1)
X_st = np.array([np.mean(np.array(embedding.cpu()), axis=0) for embedding in X_st])
X_st = X_st.reshape(-1, 1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_st, y_train)
y_pred = rf.predict(X_test_st)
y_pred_1 = rf.predict(X_1_st)
y_pred_2 = rf.predict(X_2_st)
y_pred_3 = rf.predict(X_3_st)
y_pred_4 = rf.predict(X_st)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.764063 | 0.37037  | precision |
| 1    | 0.791932 | 0.320755 | precision |
| 2    | 0.5625   | 0.54386  | precision |
| 3    | 0.872541 | 0.338762 | precision |
| 4    | 0.812331 | 0.586929 | precision |
| Mean | 0.76     | 0.43     | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.828223 | 0.283203 | recall   |
| 1    | 0.838202 | 0.257576 | recall   |
| 2    | 0.80597  | 0.269565 | recall   |
| 3    | 0.834015 | 0.411067 | recall   |
| 4    | 0.897034 | 0.413823 | recall   |
| Mean | 0.84     | 0.33     | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.79485  | 0.320974 | f1-score |
| 1    | 0.81441  | 0.285714 | f1-score |
| 2    | 0.662577 | 0.360465 | f1-score |
| 3    | 0.852843 | 0.371429 | f1-score |
| 4    | 0.852584 | 0.485405 | f1

## Word2Vec

In [10]:
from gensim.models import Word2Vec
import numpy as np

model = Word2Vec(vector_size=500, window=5, min_count=1, workers=6)
model.build_vocab(X_train)
model.train(X_train, total_examples=len(X_train), epochs=10)

(4665133, 15551690)

In [11]:
def get_word2vec_sentence_embeddings(sentences, model):
    sentence_embeddings = []
    for sentence in tqdm(sentences):
        word_vectors = []
        for word in sentence.split():
            try:
                word_vectors.append(model.wv.get_vector(word.lower()))
            except KeyError:
                word_vectors.append(np.zeros(500))
        if word_vectors:
            sentence_embeddings.append(np.mean(np.mean(word_vectors, axis=0), axis=0))
    sentence_embeddings = np.array(sentence_embeddings)
    sentence_embeddings = sentence_embeddings.reshape(-1, 1)
    return sentence_embeddings

In [12]:
X_train_word2vec = get_word2vec_sentence_embeddings(X_train, model) 
X_test_word2vec = get_word2vec_sentence_embeddings(X_test, model)

X_1_word2vec = get_word2vec_sentence_embeddings(X_1, model)
X_2_word2vec = get_word2vec_sentence_embeddings(X_2, model)
X_3_word2vec = get_word2vec_sentence_embeddings(X_3, model)
X_word2vec = get_word2vec_sentence_embeddings(X, model)

100%|██████████| 15573/15573 [00:01<00:00, 14584.76it/s]
100%|██████████| 3894/3894 [00:00<00:00, 13863.01it/s]
100%|██████████| 577/577 [00:00<00:00, 10798.29it/s]
100%|██████████| 249/249 [00:00<00:00, 12864.85it/s]
100%|██████████| 1476/1476 [00:00<00:00, 16889.86it/s]
100%|██████████| 21769/21769 [00:01<00:00, 14976.38it/s]


In [9]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train_word2vec, y_train)
y_pred = rf.predict(X_test_word2vec)
y_pred_1 = rf.predict(X_1_word2vec)
y_pred_2 = rf.predict(X_2_word2vec)
y_pred_3 = rf.predict(X_3_word2vec)
y_pred_4 = rf.predict(X_word2vec)
report = classification_report(y_test, y_pred, output_dict=True)
report_1 = classification_report(y_1, y_pred_1, output_dict=True)
report_2 = classification_report(y_2, y_pred_2, output_dict=True)
report_3 = classification_report(y_3, y_pred_3, output_dict=True)
report_4 = classification_report(y, y_pred_4, output_dict=True)
print(aggregate_reports([report, report_1, report_2, report_3, report_4]))

## Precision
|      |        0 |        1 | Metric    |
|:-----|---------:|---------:|:----------|
| 0    | 0.782076 | 0.800664 | precision |
| 1    | 0.771058 | 0.22807  | precision |
| 2    | 0.565657 | 0.568627 | precision |
| 3    | 0.857445 | 0.864407 | precision |
| 4    | 0.791227 | 0.851351 | precision |
| Mean | 0.753493 | 0.662624 | precision |
## Recall
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.979094 | 0.235352 | recall   |
| 1    | 0.802247 | 0.19697  | recall   |
| 2    | 0.835821 | 0.252174 | recall   |
| 3    | 0.993459 | 0.201581 | recall   |
| 4    | 0.983585 | 0.265916 | recall   |
| Mean | 0.918841 | 0.230398 | recall   |
## F1-score
|      |        0 |        1 | Metric   |
|:-----|---------:|---------:|:---------|
| 0    | 0.869565 | 0.363774 | f1-score |
| 1    | 0.786344 | 0.211382 | f1-score |
| 2    | 0.674699 | 0.349398 | f1-score |
| 3    | 0.920455 | 0.326923 | f1-score |
| 4    | 0.876982 | 0.405253 | f1