<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/Classifiers_with_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Important and helpful sources
https://www.youtube.com/watch?v=P47raNuzAW0
https://github.com/PradipNichite/Youtube-Tutorials/blob/main/Yotutube_WordVectors.ipynb


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import nltk
import gensim
from spacy.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split

In [2]:
np.random.seed(42) #The random seed helps in the reproduction of the results

In [3]:
dataset=pd.read_excel('posts.xlsx')

In [None]:
dataset.shape

(14932, 8)

In [None]:
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,,,,,,
1,I am confused about the architecture of Razor ...,1,,,,,,
2,I have some spring boot microservices and I wa...,1,,,,,,
3,I'm trying to properly design an application a...,1,,,,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,,,,,,


# Tokenization

In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)
print("-----------------------------------------------")

import string
punctuations = string.punctuation
print(punctuations)

{'our', 'first', 'except', 'three', 'always', 'why', "'re", 'must', 'that', 'together', 'however', "n't", 'i', 'four', 'top', 'latterly', 'nevertheless', 'eleven', 'its', 'ca', 'next', 'whereupon', 'seem', 'whether', 'through', 'throughout', 'to', 'himself', 'wherein', 'using', 'here', 'both', 'onto', 'twenty', 'less', '’ve', 'do', 'myself', 'bottom', 'hence', 'across', 'along', 'above', 'became', 'anyway', 'at', 'ours', 'beyond', 'toward', 'back', 'much', 'fifty', 'have', 'fifteen', 'same', '‘ve', 'whereas', "'s", 'but', 'put', 'show', 'whoever', 'ever', 'hereafter', 'we', 'namely', 'again', 'rather', 'their', 'make', 'unless', 'part', 'wherever', 'amongst', 'be', "'m", 'whereafter', 'hundred', 'really', 'whatever', 'when', 'up', 'had', 'call', 'if', 'quite', 'elsewhere', '‘re', 'forty', 'off', 'therefore', 'move', 'in', 'although', 'either', 'keep', 'each', 'another', 'very', 'an', 'n’t', 'various', 'already', 'other', 'after', 'get', 'you', 'noone', 'front', '‘d', 'while', 'thus', '

In [5]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # print(doc)
    # print(type(doc))

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [7]:
dataset['Tokens'] = dataset['Question_body'].apply(spacy_tokenizer)

In [8]:
dataset.head()

Unnamed: 0,Question_body,Label,Tokens
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,"[architecture, ol, li, input, component, recei..."
1,I am confused about the architecture of Razor ...,1,"[confused, architecture, razor, pages, asp.net..."
2,I have some spring boot microservices and I wa...,1,"[spring, boot, microservice, want, use, micros..."
3,I'm trying to properly design an application a...,1,"[try, properly, design, application, accord, c..."
4,I heard that for .NET8 Microsoft gifted us wit...,1,"[hear, .net8, microsoft, gift, totally, quot;f..."


# Feature Vectors Engineering

In [9]:
import gensim.downloader as api #importing word embeddings
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [10]:
pretrained = api.load('word2vec-google-news-300')



In [12]:
def sent_vec(sent):
    vector_size = pretrained.vector_size
    pretrained_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in pretrained:
            ctr += 1
            pretrained_res += pretrained[w]
    pretrained_res = pretrained_res/ctr
    return pretrained_res

In [13]:
dataset['W2V_Vect'] = dataset['Tokens'].apply(sent_vec)

In [14]:
dataset.head()

Unnamed: 0,Question_body,Label,Tokens,W2V_Vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,"[architecture, ol, li, input, component, recei...","[0.042191066099016857, -0.03547005171186469, -..."
1,I am confused about the architecture of Razor ...,1,"[confused, architecture, razor, pages, asp.net...","[0.020880169338650174, 0.029664357503255207, 0..."
2,I have some spring boot microservices and I wa...,1,"[spring, boot, microservice, want, use, micros...","[0.011846774961890244, 0.0009713521817835366, ..."
3,I'm trying to properly design an application a...,1,"[try, properly, design, application, accord, c...","[0.046028193721064814, -0.027165794372558595, ..."
4,I heard that for .NET8 Microsoft gifted us wit...,1,"[hear, .net8, microsoft, gift, totally, quot;f...","[0.06962245343679405, -0.020223778414439005, -..."


In [15]:
x = dataset['W2V_Vect'].to_list()
y = dataset['Label'].to_list()

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x , y, test_size=0.33, random_state=125)

# Naive Bayes with Word2vect

In [17]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [21]:
nb_model = GaussianNB()
nb_model = nb_model.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)


In [22]:
#print(classification_report(y_test, y_pred))

In [25]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Compute metrics for each class
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[0, 1])

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Extract TP, TN, FP, FN for each class
TP_0, FN_0 = conf_matrix[0, 0], conf_matrix[0, 1]
FP_0, TN_0 = conf_matrix[1, 0], conf_matrix[1, 1]

TP_1, FN_1 = conf_matrix[1, 1], conf_matrix[1, 0]
FP_1, TN_1 = conf_matrix[0, 1], conf_matrix[0, 0]

# Calculate accuracy for each class
accuracy_class_0 = (TP_0 + TN_0) / (TP_0 + TN_0 + FP_0 + FN_0)
accuracy_class_1 = (TP_1 + TN_1) / (TP_1 + TN_1 + FP_1 + FN_1)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')


Class 0 - Precision: 0.8277752398355414, Recall: 0.7122641509433962, Accuracy: 0.7749594155844156, F1-score: 0.76568772448764, Support: 2544
Class 1 - Precision: 0.732749178532311, Recall: 0.8418624161073825, Accuracy: 0.7749594155844156, F1-score: 0.7835252781573298, Support: 2384


# kNN with Word2vect

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [48]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

In [None]:
#y_pred = knn_model.predict(X_test)
#print(classification_report(y_test, y_pred))

In [51]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Compute metrics for each class
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[0, 1])

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Extract TP, TN, FP, FN for each class
TP_0, FN_0 = conf_matrix[0, 0], conf_matrix[0, 1]
FP_0, TN_0 = conf_matrix[1, 0], conf_matrix[1, 1]

TP_1, FN_1 = conf_matrix[1, 1], conf_matrix[1, 0]
FP_1, TN_1 = conf_matrix[0, 1], conf_matrix[0, 0]

# Calculate accuracy for each class
accuracy_class_0 = (TP_0 + TN_0) / (TP_0 + TN_0 + FP_0 + FN_0)
accuracy_class_1 = (TP_1 + TN_1) / (TP_1 + TN_1 + FP_1 + FN_1)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')


Class 0 - Precision: 0.8753979081400637, Recall: 0.7566823899371069, Accuracy: 0.8187905844155844, F1-score: 0.8117225384777567, Support: 2544
Class 1 - Precision: 0.7731769879076584, Recall: 0.8850671140939598, Accuracy: 0.8187905844155844, F1-score: 0.8253471543125367, Support: 2384


# Logistic Regression with Word2vect

In [31]:
from sklearn.linear_model import LogisticRegression
lg_model = LogisticRegression(C=5, penalty='l1', solver='liblinear')
lg_model = lg_model.fit(X_train, y_train)

In [33]:
#y_pred = lg_model.predict(X_test)
#print(classification_report(y_test, y_pred))

In [32]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Compute metrics for each class
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[0, 1])

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Extract TP, TN, FP, FN for each class
TP_0, FN_0 = conf_matrix[0, 0], conf_matrix[0, 1]
FP_0, TN_0 = conf_matrix[1, 0], conf_matrix[1, 1]

TP_1, FN_1 = conf_matrix[1, 1], conf_matrix[1, 0]
FP_1, TN_1 = conf_matrix[0, 1], conf_matrix[0, 0]

# Calculate accuracy for each class
accuracy_class_0 = (TP_0 + TN_0) / (TP_0 + TN_0 + FP_0 + FN_0)
accuracy_class_1 = (TP_1 + TN_1) / (TP_1 + TN_1 + FP_1 + FN_1)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')


Class 0 - Precision: 0.9089043906442347, Recall: 0.8706761006289309, Accuracy: 0.888189935064935, F1-score: 0.8893796426420398, Support: 2544
Class 1 - Precision: 0.8679245283018868, Recall: 0.9068791946308725, Accuracy: 0.888189935064935, F1-score: 0.8869743589743591, Support: 2384


# Decision Tree with Word2vect

In [37]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model = dt_model.fit(X_train, y_train)

In [None]:
#y_pred = dt_model.predict(X_test)
#print(classification_report(y_test, y_pred))

In [41]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Compute metrics for each class
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[0, 1])

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Extract TP, TN, FP, FN for each class
TP_0, FN_0 = conf_matrix[0, 0], conf_matrix[0, 1]
FP_0, TN_0 = conf_matrix[1, 0], conf_matrix[1, 1]

TP_1, FN_1 = conf_matrix[1, 1], conf_matrix[1, 0]
FP_1, TN_1 = conf_matrix[0, 1], conf_matrix[0, 0]

# Calculate accuracy for each class
accuracy_class_0 = (TP_0 + TN_0) / (TP_0 + TN_0 + FP_0 + FN_0)
accuracy_class_1 = (TP_1 + TN_1) / (TP_1 + TN_1 + FP_1 + FN_1)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')


Class 0 - Precision: 0.7652979066022544, Recall: 0.747248427672956, Accuracy: 0.7512175324675324, F1-score: 0.7561654733492442, Support: 2544
Class 1 - Precision: 0.7369067103109657, Recall: 0.7554530201342282, Accuracy: 0.7512175324675324, F1-score: 0.7460646230323116, Support: 2384


# SVM with Word2vect

In [44]:
from sklearn.svm import SVC
svm_model = SVC()
svm_model = svm_model.fit(X_train, y_train)

In [46]:
#y_pred = svm_model.predict(X_test)
#print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90      2544
           1       0.87      0.92      0.90      2384

    accuracy                           0.90      4928
   macro avg       0.90      0.90      0.90      4928
weighted avg       0.90      0.90      0.90      4928



In [47]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Compute metrics for each class
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(y_test, y_pred, average=None, labels=[0, 1])

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Extract TP, TN, FP, FN for each class
TP_0, FN_0 = conf_matrix[0, 0], conf_matrix[0, 1]
FP_0, TN_0 = conf_matrix[1, 0], conf_matrix[1, 1]

TP_1, FN_1 = conf_matrix[1, 1], conf_matrix[1, 0]
FP_1, TN_1 = conf_matrix[0, 1], conf_matrix[0, 0]

# Calculate accuracy for each class
accuracy_class_0 = (TP_0 + TN_0) / (TP_0 + TN_0 + FP_0 + FN_0)
accuracy_class_1 = (TP_1 + TN_1) / (TP_1 + TN_1 + FP_1 + FN_1)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')


Class 0 - Precision: 0.9198678232135481, Recall: 0.8753930817610063, Accuracy: 0.8963068181818182, F1-score: 0.897079556898288, Support: 2544
Class 1 - Precision: 0.8735540486637415, Recall: 0.9186241610738255, Accuracy: 0.8963068181818182, F1-score: 0.8955223880597014, Support: 2384
