<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/Classifiers_with_Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Important and helpful sources
https://www.youtube.com/watch?v=P47raNuzAW0
https://github.com/PradipNichite/Youtube-Tutorials/blob/main/Yotutube_WordVectors.ipynb


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import nltk
import gensim
from spacy.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split

In [None]:
np.random.seed(42) #The random seed helps in the reproduction of the results

In [None]:
dataset=pd.read_excel('ARPs_and_ProgrammingPosts.xlsx')

In [None]:
dataset.shape

(14932, 8)

In [None]:
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,,,,,,
1,I am confused about the architecture of Razor ...,1,,,,,,
2,I have some spring boot microservices and I wa...,1,,,,,,
3,I'm trying to properly design an application a...,1,,,,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,,,,,,


# Tokenization

In [None]:
# Load spaCy's small English pipeline; `nlp` is what spacy_tokenizer calls to
# parse each post, and its defaults provide the stop-word collection used there.
import spacy
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)
print("-----------------------------------------------")

# string.punctuation is a single str of ASCII punctuation characters, so a later
# `token in punctuations` test is a substring check, not a set-membership check.
import string
punctuations = string.punctuation
print(punctuations)

{'front', 'becomes', 'least', 'against', 'or', 'can', 'across', 'down', 'seems', 'since', '’s', 'yourself', 'where', 'quite', 'onto', 'serious', 'top', 'into', 'really', 'in', 'five', 'beyond', 'whence', 'becoming', 'do', 'hereby', 'every', 'from', 'then', 'among', 'whatever', 'because', '‘d', 'herself', 'three', 'fifteen', 'as', 'few', 'out', 'be', 'show', 'back', 'is', 'by', 'nobody', 'thru', 'i', 'of', '‘m', 'nor', 'should', 'who', 'however', 'while', 'hereafter', 'whereas', 'my', 'around', 'next', 'still', 'his', 'again', 'twelve', 'something', 'make', 'with', 'full', 'except', 'herein', "'m", 'say', 'ours', 'their', 'somewhere', 'now', 'our', 'keep', 'might', 'could', 'much', 'anyhow', 'are', 'would', 'anyone', 'through', 'had', 'due', 'mine', 'sometime', 'everywhere', 'you', 'else', 'but', 'put', 'became', 'whole', 'it', 'call', 'take', 'move', 'perhaps', 'does', 'these', 'above', 'many', 'meanwhile', '’ll', 'using', 'thereby', "'ll", 'ourselves', 'another', '’m', 'when', 'wherei

In [None]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Convert raw text into a list of cleaned, lower-cased lemmas.

    The text is parsed with the notebook-level ``nlp`` pipeline; every token is
    replaced by its lemma, lower-cased and stripped of surrounding whitespace.
    Lemmas found in ``stop_words`` or in ``punctuations`` are then discarded.

    Note: when ``punctuations`` is a plain string (as with
    ``string.punctuation``) the second check is a substring test, so empty
    lemmas are discarded as well.

    Parameters
    ----------
    sentence : str
        Text to tokenise.

    Returns
    -------
    list of str
        The surviving lemmas, in document order.
    """
    normalised = (token.lemma_.lower().strip() for token in nlp(sentence))
    return [
        lemma
        for lemma in normalised
        if not (lemma in stop_words or lemma in punctuations)
    ]

In [None]:
dataset['Tokens'] = dataset['Post'].apply(spacy_tokenizer)

In [None]:
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,"[architecture, ol, li, input, component, recei...",,,,,
1,I am confused about the architecture of Razor ...,1,"[confused, architecture, razor, pages, asp.net...",,,,,
2,I have some spring boot microservices and I wa...,1,"[spring, boot, microservice, want, use, micros...",,,,,
3,I'm trying to properly design an application a...,1,"[try, properly, design, application, accord, c...",,,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,"[hear, .net8, microsoft, gift, totally, quot;f...",,,,,


# Feature Vectors Engineering

In [None]:
import gensim.downloader as api
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
pretrained = api.load('word2vec-google-news-300')

In [None]:
#pretrained.save('/content/drive/MyDrive/Content Creation/Youtube Tutorials/vectors.kv')

In [None]:
def sent_vec(sent, model=None):
    """Average the embedding vectors of a document's in-vocabulary tokens.

    Parameters
    ----------
    sent : iterable of str
        Tokens of a single document.
    model : optional
        Embedding lookup exposing ``vector_size``, ``token in model`` and
        ``model[token]``. Defaults to the notebook-level ``pretrained``
        vectors, so existing ``sent_vec(tokens)`` calls keep working.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``model.vector_size`` holding the mean of the
        vectors of all tokens found in ``model``; all zeros when no token is
        found (or ``sent`` is empty).
    """
    if model is None:
        model = pretrained

    total = np.zeros(model.vector_size)
    hits = 0
    for w in sent:
        if w in model:
            total += model[w]
            hits += 1

    # The counter previously started at 1 to dodge division by zero, which
    # silently divided every sum by (hits + 1) and shrank short documents the
    # most. Guard the empty case explicitly and divide by the true count.
    if hits == 0:
        return total
    return total / hits

In [None]:
dataset['W2V_Vect'] = dataset['Tokens'].apply(sent_vec)

In [None]:
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,"[architecture, ol, li, input, component, recei...","[0.042191066099016857, -0.03547005171186469, -...",,,,
1,I am confused about the architecture of Razor ...,1,"[confused, architecture, razor, pages, asp.net...","[0.020880169338650174, 0.029664357503255207, 0...",,,,
2,I have some spring boot microservices and I wa...,1,"[spring, boot, microservice, want, use, micros...","[0.011846774961890244, 0.0009713521817835366, ...",,,,
3,I'm trying to properly design an application a...,1,"[try, properly, design, application, accord, c...","[0.046028193721064814, -0.027165794372558595, ...",,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,"[hear, .net8, microsoft, gift, totally, quot;f...","[0.06962245343679405, -0.020223778414439005, -...",,,,


In [None]:
# Features: one averaged Word2Vec vector per post. Labels: the Class column.
# Both are handed to scikit-learn as plain Python lists.
x = dataset['W2V_Vect'].tolist()
y = dataset['Class'].tolist()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x , y, test_size=0.33, random_state=125)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [None]:
# Gaussian Naive Bayes on the Word2Vec features; fit() returns the fitted
# estimator, so construction and training are chained.
nb_model = GaussianNB().fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.71      0.77      2544
           1       0.73      0.84      0.78      2384

    accuracy                           0.77      4928
   macro avg       0.78      0.78      0.77      4928
weighted avg       0.78      0.77      0.77      4928



In [None]:
from sklearn.linear_model import LogisticRegression
# L1-regularised logistic regression with the liblinear solver.
# random_state is pinned: liblinear shuffles the data internally, and with
# random_state=None it draws from NumPy's global RNG, so the fitted model would
# depend on how many random draws earlier cells happened to consume.
lg_model = LogisticRegression(C=5, penalty='l1', solver='liblinear', random_state=42)
lg_model = lg_model.fit(X_train, y_train)

In [None]:
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      2544
           1       0.87      0.91      0.89      2384

    accuracy                           0.89      4928
   macro avg       0.89      0.89      0.89      4928
weighted avg       0.89      0.89      0.89      4928



In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree on the Word2Vec features.
# random_state is pinned: the tree builder permutes features at every split, and
# with random_state=None it uses NumPy's global RNG, so re-running this cell (or
# running cells in a different order) could yield a different tree and report.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model = dt_model.fit(X_train, y_train)

In [None]:
y_pred = dt_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.75      0.76      2544
           1       0.74      0.75      0.74      2384

    accuracy                           0.75      4928
   macro avg       0.75      0.75      0.75      4928
weighted avg       0.75      0.75      0.75      4928



In [None]:
from sklearn.svm import SVC
# Support-vector classifier with scikit-learn's default settings; fit() returns
# the fitted estimator, so construction and training are chained.
svm_model = SVC().fit(X_train, y_train)

In [None]:
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90      2544
           1       0.87      0.92      0.90      2384

    accuracy                           0.90      4928
   macro avg       0.90      0.90      0.90      4928
weighted avg       0.90      0.90      0.90      4928



# Classifiers with Bag-of-Words Embedding

In [None]:




# The Bag-of-Words experiments start from the raw post text, so the features are
# re-bound to the Post column and the data is split again with the same
# test fraction and seed as before.
x = dataset['Post']
y = dataset['Class']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=125)

# Report the sizes of the full dataset and of each feature partition.
print('Total number of sample:', len(dataset))
for label, part in (('X Training sample:', X_train), ('X Test sample:', X_test)):
    print(label, len(part))

# Same report for the label partitions.
print("-------------------------")
for label, part in (('Y Training sample:', y_train), ('Y Test sample:', y_test)):
    print(label, len(part))

Total number of sample: 14932
X Training sample: 10004
X Test sample: 4928
-------------------------
Y Training sample: 10004
Y Test sample: 4928


In [None]:
# Feature extraction: turn each raw post into a vector of token counts.
# CountVectorizer handles both tokenization and occurrence counting. The
# vocabulary is learned from the training posts only and then reused to encode
# the test posts.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
# fit_transform learns the vocabulary and encodes the training posts in one call.
X_train_vect = vect.fit_transform(X_train).toarray()
X_test_vect = vect.transform(X_test).toarray()

In [None]:
# Printing the identified Unique words along with their indices
print("Vocabulary: ", vect.vocabulary_)



In [None]:
# Summarizing the Encoded Texts
# X_train_vect already holds the dense count matrix for the training posts;
# print it directly instead of vectorising and densifying the corpus again.
print("Encoded dataset is:")
print(X_train_vect)

Encoded dataset is:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


#  Naive Bayes -- BoW

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [None]:
# Gaussian Naive Bayes on the dense Bag-of-Words count matrix; fit() returns
# the fitted estimator, so construction and training are chained.
nb_model = GaussianNB().fit(X_train_vect, y_train)

y_pred = nb_model.predict(X_test_vect)

In [None]:
print(classification_report(y_test, y_pred))

NameError: name 'classification_report' is not defined

# Naive Bayes -- TFIDF

In [None]:
# min_df=2 keeps only terms that occur in at least two training documents, so
# words seen in a single post are dropped from the vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vector = TfidfVectorizer(min_df=2)
# fit_transform learns vocabulary and IDF weights from the training posts and
# encodes them in one call; the test posts reuse the fitted vectorizer.
X_train_vect_tf_idf = tf_idf_vector.fit_transform(X_train).toarray()
X_test_vect_tf_idf = tf_idf_vector.transform(X_test).toarray()

In [None]:
# Summarizing the Encoded Texts with TFIDF
# X_train_vect_tf_idf already holds the dense TF-IDF matrix for the training
# posts; print it directly instead of transforming and densifying them again.
print("Encoded dataset is:")
print(X_train_vect_tf_idf)

Encoded dataset is:
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.18813346 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
# Gaussian Naive Bayes on the dense TF-IDF matrix; fit() returns the fitted
# estimator, so construction and training are chained.
nb_model = GaussianNB().fit(X_train_vect_tf_idf, y_train)

y_pred_tf_idf = nb_model.predict(X_test_vect_tf_idf)

In [None]:
print(classification_report(y_test, y_pred_tf_idf))

              precision    recall  f1-score   support

           0       0.84      0.80      0.82      2544
           1       0.79      0.84      0.82      2384

    accuracy                           0.82      4928
   macro avg       0.82      0.82      0.82      4928
weighted avg       0.82      0.82      0.82      4928



# Naive Bayes -- Word2Vec

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec

In [None]:
# Downloading package punkt (tokenizer data required by word_tokenize)
nltk.download('punkt')

# Word2Vec expects an iterable of token lists (one list per document). Feeding
# it raw strings would make gensim iterate each post character by character,
# so the posts are tokenised first. Missing posts are skipped.
data = [word_tokenize(str(post)) for post in dataset['Post'].dropna()]

# Create CBOW model.
# Parameters of gensim's Word2Vec (gensim 4 names):
#   sentences   - training corpus: a list of token lists, or a restartable
#                 stream of token lists for corpora too large for memory.
#   vector_size - dimensionality of each word vector. Default 100.
#   window      - maximum distance between the current word and a neighbouring
#                 word; words further away are not treated as context. Default 5.
#   min_count   - minimum frequency for a word to enter the vocabulary.
#                 Default 5; 1 keeps every token.
#   sg          - training algorithm: 0 = CBOW (default), 1 = skip-gram.
#   epochs      - number of passes over the corpus. Default 5.
# Passing the corpus to the constructor builds the vocabulary and trains the
# model immediately, so no separate build_vocab() call is needed.
model1 = gensim.models.Word2Vec(sentences=data, min_count=1, vector_size=100, window=5, sg=0)

# Number of training epochs and number of documents seen while building the vocabulary.
model1.epochs, model1.corpus_count




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'sent_tokenize' is not defined

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.tokenize import sent_tokenize #from nltk import tokenize
from nltk.tokenize import word_tokenize

# dropna(inplace=True) on dataset['Post'] acts on a temporary column selection
# and does not reliably remove anything from what is tokenised below. Drop the
# missing posts into a new Series and tokenise that, leaving `dataset` untouched.
posts_non_null = dataset['Post'].dropna()

# One list of NLTK word tokens per post; the original row index is preserved.
tokens_word = posts_non_null.apply(word_tokenize)

In [None]:
print(tokens_word)

0        [Here, 's, the, architecture, :, <, /p, >, <, ...
1        [I, am, confused, about, the, architecture, of...
2        [I, have, some, spring, boot, microservices, a...
3        [I, 'm, trying, to, properly, design, an, appl...
4        [I, heard, that, for, .NET8, Microsoft, gifted...
                               ...                        
14927    [I, need, help, with, a, pretty, simple, exerc...
14928    [Problem, is, ,, <, /p, >, <, pre, >, <, code,...
14929    [It, 's, not, possible, to, include, both, fil...
14930    [Yes, and, no, ., Yes, ,, redBall, is, a, vari...
14931    [You, do, n't, have, to, declare, variable, .,...
Name: Post, Length: 14932, dtype: object


In [None]:
tokens_sent = dataset['Post'].apply(sent_tokenize)
print(tokens_sent)

0        [Here's the architecture:</p>\n<ol>\n<li>Input...
1        [I am confused about the architecture of Razor...
2        [I have some spring boot microservices and I w...
3        [I'm trying to properly design an application ...
4        [I heard that for .NET8 Microsoft gifted us wi...
                               ...                        
14927    [I need help with a pretty simple exercise I a...
14928    [Problem is,</p>\n\n<pre><code>sed -e 's/$/LIN...
14929    [It's not possible to include both files as is...
14930    [Yes and no., Yes, redBall is a variable to ho...
14931    [You don't have to declare variable., Followin...
Name: Post, Length: 14932, dtype: object


0        [Here's the architecture:</p>\n<ol>\n<li>Input...
1        [I am confused about the architecture of Razor...
2        [I have some spring boot microservices and I w...
3        [I'm trying to properly design an application ...
4        [I heard that for .NET8 Microsoft gifted us wi...
                               ...                        
14927    [I need help with a pretty simple exercise I a...
14928    [Problem is,</p>\n\n<pre><code>sed -e 's/$/LIN...
14929    [It's not possible to include both files as is...
14930    [Yes and no., Yes, redBall is a variable to ho...
14931    [You don't have to declare variable., Followin...
Name: Post, Length: 14932, dtype: object
