<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/Classifiers_with_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Important and helpful sources
https://www.youtube.com/watch?v=P47raNuzAW0
https://github.com/PradipNichite/Youtube-Tutorials/blob/main/Yotutube_WordVectors.ipynb


In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import nltk
import gensim
from spacy.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split

In [None]:
np.random.seed(42) #The random seed helps in the reproduction of the results

In [4]:
dataset=pd.read_excel('ARPs_and_ProgrammingPosts.xlsx')

In [None]:
dataset.shape

(14932, 8)

In [None]:
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,,,,,,
1,I am confused about the architecture of Razor ...,1,,,,,,
2,I have some spring boot microservices and I wa...,1,,,,,,
3,I'm trying to properly design an application a...,1,,,,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,,,,,,


# Tokenization

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)
print("-----------------------------------------------")

import string
punctuations = string.punctuation
print(punctuations)

{'re', 'towards', 'who', 'mostly', 'so', 'latterly', 'himself', 'some', 'where', 'nothing', 'his', 'amount', 'nor', 'herself', 'behind', 'than', 'whereupon', 'seemed', 'seem', 'most', 'off', 'alone', 'mine', 'seeming', 'empty', 'call', 'third', 'someone', 'wherein', 'wherever', 'quite', 'hence', 'just', 'first', 'becoming', 'only', 'ca', 'six', 'beyond', 'is', 'often', 'front', 'again', 'i', 'serious', 'during', 'per', 'one', 'they', 'about', 'the', 'under', 'others', 'another', 'before', 'our', 'name', 'does', 'why', 'everything', 'no', 'otherwise', 'anywhere', 'within', 'hers', 'hereafter', 'neither', 'whence', 'when', 'had', "'ll", 'unless', 'top', 'below', 'ten', 'whether', 'can', 'both', 'of', 'she', 'yours', 'afterwards', 'further', 'my', 'should', 'get', 'many', 'used', 'go', 'without', 'might', 'in', 'are', '’re', 'but', 'forty', 'would', 'show', 'take', '‘ve', 'themselves', 'meanwhile', 'whereafter', 'hereupon', 'ours', 'well', 'twelve', 'other', 'that', 'become', 'its', 'out'

In [6]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # print(doc)
    # print(type(doc))

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [7]:
dataset['Tokens'] = dataset['Post'].apply(spacy_tokenizer)

In [None]:
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,"[architecture, ol, li, input, component, recei...",,,,,
1,I am confused about the architecture of Razor ...,1,"[confused, architecture, razor, pages, asp.net...",,,,,
2,I have some spring boot microservices and I wa...,1,"[spring, boot, microservice, want, use, micros...",,,,,
3,I'm trying to properly design an application a...,1,"[try, properly, design, application, accord, c...",,,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,"[hear, .net8, microsoft, gift, totally, quot;f...",,,,,


# Feature Vectors Engineering

In [8]:
import gensim.downloader as api
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [9]:
pretrained = api.load('word2vec-google-news-300')



In [None]:
#pretrained.save('/content/drive/MyDrive/Content Creation/Youtube Tutorials/vectors.kv')

In [10]:
def sent_vec(sent):
    vector_size = pretrained.vector_size
    pretrained_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in pretrained:
            ctr += 1
            pretrained_res += pretrained[w]
    pretrained_res = pretrained_res/ctr
    return pretrained_res

In [11]:
dataset['W2V_Vect'] = dataset['Tokens'].apply(sent_vec)

In [None]:
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,"[architecture, ol, li, input, component, recei...","[0.042191066099016857, -0.03547005171186469, -...",,,,
1,I am confused about the architecture of Razor ...,1,"[confused, architecture, razor, pages, asp.net...","[0.020880169338650174, 0.029664357503255207, 0...",,,,
2,I have some spring boot microservices and I wa...,1,"[spring, boot, microservice, want, use, micros...","[0.011846774961890244, 0.0009713521817835366, ...",,,,
3,I'm trying to properly design an application a...,1,"[try, properly, design, application, accord, c...","[0.046028193721064814, -0.027165794372558595, ...",,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,"[hear, .net8, microsoft, gift, totally, quot;f...","[0.06962245343679405, -0.020223778414439005, -...",,,,


In [12]:
x = dataset['W2V_Vect'].to_list()
y = dataset['Class'].to_list()

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x , y, test_size=0.33, random_state=125)

# Naive Bayes with W2vect

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [None]:
nb_model = GaussianNB()
nb_model = nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.71      0.77      2544
           1       0.73      0.84      0.78      2384

    accuracy                           0.77      4928
   macro avg       0.78      0.78      0.77      4928
weighted avg       0.78      0.77      0.77      4928



# kNN with W2vect

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [15]:
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

In [16]:
y_pred = knn_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.76      0.81      2544
           1       0.77      0.89      0.83      2384

    accuracy                           0.82      4928
   macro avg       0.82      0.82      0.82      4928
weighted avg       0.83      0.82      0.82      4928



# Logistic Regression with W2vect

In [None]:
from sklearn.linear_model import LogisticRegression
lg_model = LogisticRegression(C=5, penalty='l1', solver='liblinear')
lg_model = lg_model.fit(X_train, y_train)

In [None]:
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      2544
           1       0.87      0.91      0.89      2384

    accuracy                           0.89      4928
   macro avg       0.89      0.89      0.89      4928
weighted avg       0.89      0.89      0.89      4928



# Decison Tree with W2vect

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
dt_model = dt_model.fit(X_train, y_train)

In [None]:
y_pred = dt_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.75      0.76      2544
           1       0.74      0.75      0.74      2384

    accuracy                           0.75      4928
   macro avg       0.75      0.75      0.75      4928
weighted avg       0.75      0.75      0.75      4928



# SVM with W2vect

In [None]:
from sklearn.svm import SVC
svm_model = SVC()
svm_model = svm_model.fit(X_train, y_train)

In [None]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90      2544
           1       0.87      0.92      0.90      2384

    accuracy                           0.90      4928
   macro avg       0.90      0.90      0.90      4928
weighted avg       0.90      0.90      0.90      4928

