<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/Classifiers_with_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# What is the difference between Word2Vec and the TF-IDF vectorizer?
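Word2Vec vs. BoW and TF-IDF: Word2Vec is a neural-network-based technique that learns dense, continuous word embeddings, capturing the semantic relationships between words. BoW and TF-IDF, by contrast, represent each document as a sparse vector of (weighted) term counts, so they reflect how often and how distinctively a term occurs but not what it means or how it relates to other words.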

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import nltk
import gensim
from spacy.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split

In [2]:
# Seed NumPy's global random generator so repeated runs can reproduce the same results.
SEED = 42
np.random.seed(SEED)

In [4]:
# Read the posts workbook (expected in the notebook's working directory) into a DataFrame.
dataset = pd.read_excel(io='posts.xlsx')

In [None]:
# Display the frame's dimensions as (rows, columns).
dataset.shape

(14932, 8)

In [None]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,,,,,,
1,I am confused about the architecture of Razor ...,1,,,,,,
2,I have some spring boot microservices and I wa...,1,,,,,,
3,I'm trying to properly design an application a...,1,,,,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,,,,,,


# Tokenization

In [5]:
import string

import spacy

# Shared text-processing resources used by the tokenizer defined further down.
nlp = spacy.load("en_core_web_sm")        # small English spaCy pipeline
stop_words = nlp.Defaults.stop_words      # the pipeline's default stop-word set
punctuations = string.punctuation         # ASCII punctuation characters as one string

# Show both resources, separated by a rule, for a quick visual check.
print(stop_words)
print("-----------------------------------------------")
print(punctuations)

{'themselves', 'upon', 'which', "'m", 'their', 'ten', 'except', '‘d', "'re", 'various', 'two', 'per', 'to', 'whoever', 'what', 'anywhere', 'eight', 'few', 'thence', 'seemed', 'only', 'seeming', '’d', 'of', 'three', 'well', 'eleven', 'where', 'amongst', 'but', '‘m', 'too', 'whereafter', 'front', 'others', 'though', 'although', "'ve", 'again', 'still', 'call', 'seem', 'besides', 'yourselves', 'everything', '‘ve', 'throughout', 'get', 'on', 'she', 'now', 'thereupon', 'together', 'cannot', 'anyhow', 'in', 'none', 'someone', 'with', 'anyone', 'are', 'why', 'you', 'due', 'is', 'otherwise', 'at', 'becomes', 'just', 'move', 'therefore', 'by', 'namely', "'ll", 'wherever', 'we', 'were', 'sixty', 'latter', 'take', 'towards', 'done', 'hers', 'must', 'whenever', 'show', 'somewhere', 'himself', 'full', 'herself', '‘ll', 'indeed', 'there', 'five', 'nowhere', 'along', '‘re', 'n‘t', 'often', '’s', 'sometimes', 'less', 'not', 'ca', 'amount', 'out', 'fifty', 'also', 'his', 'put', 'first', 'for', 'thus', 

In [6]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Turn one raw post into a list of cleaned, lemmatized tokens.

    Relies on the notebook-level ``nlp`` pipeline, the ``stop_words`` set and
    the ``punctuations`` string.

    Parameters
    ----------
    sentence : str
        Raw text of a single post. Any non-string value (``None``, or the
        float NaN that a missing cell presumably becomes when the sheet is
        loaded) is treated as "no text".

    Returns
    -------
    list of str
        Lower-cased, whitespace-stripped lemmas, excluding stop words, empty
        strings and tokens made up solely of punctuation characters.
    """
    # spaCy only accepts text; a missing value must not crash the whole .apply().
    if not isinstance(sentence, str):
        return []

    # Run the pipeline to obtain tokens with linguistic annotations (lemmas).
    doc = nlp(sentence)

    cleaned = []
    for token in doc:
        lemma = token.lemma_.lower().strip()

        # `lemma in punctuations` would be a *substring* test against the
        # punctuation string: it drops "()" but keeps "..." or "--".
        # Checking every character treats all punctuation-only tokens alike,
        # and an empty lemma (e.g. a whitespace token) is rejected as well
        # because all() over no characters is True.
        if all(char in punctuations for char in lemma):
            continue

        if lemma in stop_words:
            continue

        cleaned.append(lemma)

    # Preprocessed list of tokens
    return cleaned

In [8]:
# Tokenize every post body and keep the resulting token lists alongside the raw text.
dataset['Tokens'] = dataset['Question_body'].map(spacy_tokenizer)

In [9]:
# Preview the first five rows, now including the new Tokens column.
dataset.head(n=5)

Unnamed: 0,Question_body,Label,Tokens
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,"[architecture, ol, li, input, component, recei..."
1,I am confused about the architecture of Razor ...,1,"[confused, architecture, razor, pages, asp.net..."
2,I have some spring boot microservices and I wa...,1,"[spring, boot, microservice, want, use, micros..."
3,I'm trying to properly design an application a...,1,"[try, properly, design, application, accord, c..."
4,I heard that for .NET8 Microsoft gifted us wit...,1,"[hear, .net8, microsoft, gift, totally, quot;f..."


In [11]:
# Features: each token list glued back into one space-separated string, the
# input form the text vectorizer consumes. Target: the Label column.
x = dataset['Tokens'].map(lambda tokens: ' '.join(tokens))
y = dataset['Label']

In [None]:
#dataset.head()

In [13]:
# Hold out 30% of the posts for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=125,
)

# Feature Engineering with TF-IDF

In [14]:
# min_df=2 keeps only terms that occur in at least two training documents
# (a document-frequency threshold, not a raw occurrence count).
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn vocabulary and IDF weights from the training split only, so nothing
# from the test split leaks into the features.
tf_idf_vector = TfidfVectorizer(min_df=2)
tf_idf_vector.fit(X_train)

# Vectorize both splits with the fitted vectorizer and convert the results to
# dense arrays via .toarray().
X_train_vect_tf_idf, X_test_vect_tf_idf = (
    tf_idf_vector.transform(split).toarray() for split in (X_train, X_test)
)

# Naive Bayes -- TFIDF

In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [16]:
# Fit a Gaussian Naive Bayes classifier on the dense TF-IDF training features.
nb_model = GaussianNB().fit(X_train_vect_tf_idf, y_train)

In [21]:
# Naive Bayes predictions for the held-out split. This assignment must run:
# the per-class metrics cell that follows reads `y_pred_tf_idf`, and with the
# line commented out a fresh "Restart & Run All" fails with a NameError.
y_pred_tf_idf = nb_model.predict(X_test_vect_tf_idf)
#print(classification_report(y_test, y_pred_tf_idf))

In [20]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Per-class precision / recall / F1 / support, ordered as in `labels`
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(
    y_test, y_pred_tf_idf, average=None, labels=[0, 1]
)

# 2x2 confusion matrix: rows are true labels, columns are predicted labels
conf_matrix = confusion_matrix(y_test, y_pred_tf_idf, labels=[0, 1])

# With class 0 as the "positive" class, unpack the matrix row by row
(TP_0, FN_0), (FP_0, TN_0) = conf_matrix

# With class 1 as the "positive" class the four roles are mirrored
TP_1, FN_1, FP_1, TN_1 = TN_0, FP_0, FN_0, TP_0

# Share of correct predictions, computed from each class's point of view
total_0 = TP_0 + TN_0 + FP_0 + FN_0
total_1 = TP_1 + TN_1 + FP_1 + FN_1
accuracy_class_0 = (TP_0 + TN_0) / total_0
accuracy_class_1 = (TP_1 + TN_1) / total_1

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')

Class 0 - Precision: 0.829806393516434, Recall: 0.7981810307492421, Accuracy: 0.8116071428571429, F1-score: 0.8136865342163356, Support: 2309
Class 1 - Precision: 0.7937140327578575, Recall: 0.8258866881621373, Accuracy: 0.8116071428571429, F1-score: 0.8094808126410836, Support: 2171


# kNN with TF-IDF

In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [23]:
# 1-nearest-neighbour classifier fitted on the TF-IDF training features.
knn_model = KNeighborsClassifier(n_neighbors=1).fit(X_train_vect_tf_idf, y_train)
knn_model  # last expression: show the fitted estimator as the cell output

In [None]:
# kNN predictions for the held-out split.
y_pred = knn_model.predict(X=X_test_vect_tf_idf)

In [30]:
# Text summary of precision / recall / F1 per class plus the averages.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      2309
           1       0.85      0.82      0.83      2171

    accuracy                           0.84      4480
   macro avg       0.84      0.84      0.84      4480
weighted avg       0.84      0.84      0.84      4480



In [31]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Per-class scores (no averaging), ordered as in `labels`
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(
    y_test,
    y_pred,
    average=None,
    labels=[0, 1],
)

# 2x2 confusion matrix: rows are true labels, columns are predicted labels
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Flatten row-major: [0,0], [0,1], [1,0], [1,1] -> class-0 view of the counts
TP_0, FN_0, FP_0, TN_0 = conf_matrix.ravel()

# The class-1 view swaps the positive and negative roles
TP_1, FN_1 = TN_0, FP_0
FP_1, TN_1 = FN_0, TP_0

# Fraction of correct predictions, from each class's perspective
correct_0, correct_1 = TP_0 + TN_0, TP_1 + TN_1
accuracy_class_0 = correct_0 / (correct_0 + FP_0 + FN_0)
accuracy_class_1 = correct_1 / (correct_1 + FP_1 + FN_1)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')



Class 0 - Precision: 0.838123415046492, Recall: 0.8588133391078389, Accuracy: 0.8417410714285715, F1-score: 0.8483422459893047, Support: 2309
Class 1 - Precision: 0.8457899716177862, Recall: 0.8235836020267158, Accuracy: 0.8417410714285715, F1-score: 0.834539089848308, Support: 2171


# Decision Tree with TF-IDF

In [32]:
from sklearn.tree import DecisionTreeClassifier

# Tree construction involves random choices. Without an explicit random_state
# the fitted tree depends on the state of the global NumPy generator, and so
# on which cells ran before this one. Pinning it makes the cell reproducible
# on its own, regardless of execution order.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model = dt_model.fit(X_train_vect_tf_idf, y_train)

In [34]:
# Decision-tree predictions for the held-out split.
y_pred = dt_model.predict(X=X_test_vect_tf_idf)


In [37]:
#print(classification_report(y_test, y_pred))

In [36]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Per-class precision / recall / F1 / support, ordered as in `labels`
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(
    y_test, y_pred, average=None, labels=[0, 1]
)

# 2x2 confusion matrix: rows are true labels, columns are predicted labels
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# With class 0 as the "positive" class, unpack the matrix row by row
(TP_0, FN_0), (FP_0, TN_0) = conf_matrix

# With class 1 as the "positive" class the four roles are mirrored
TP_1, FN_1, FP_1, TN_1 = TN_0, FP_0, FN_0, TP_0

# Share of correct predictions, computed from each class's point of view
total_0 = TP_0 + TN_0 + FP_0 + FN_0
total_1 = TP_1 + TN_1 + FP_1 + FN_1
accuracy_class_0 = (TP_0 + TN_0) / total_0
accuracy_class_1 = (TP_1 + TN_1) / total_1

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')



Class 0 - Precision: 0.9524647887323944, Recall: 0.9372022520571676, Accuracy: 0.9435267857142857, F1-score: 0.944771883868151, Support: 2309
Class 1 - Precision: 0.9343297101449275, Recall: 0.9502533394748963, Accuracy: 0.9435267857142857, F1-score: 0.9422242521123543, Support: 2171


# Logistic Regression with TF-IDF

In [38]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression with the liblinear solver. That solver
# shuffles the data using random_state, so it is pinned here; otherwise the
# fit depends on the state of the global NumPy generator and on the order in
# which cells were executed.
lg_model = LogisticRegression(C=5, penalty='l1', solver='liblinear', random_state=42)
lg_model = lg_model.fit(X_train_vect_tf_idf, y_train)

In [None]:
# Logistic-regression predictions for the held-out split.
y_pred = lg_model.predict(X=X_test_vect_tf_idf)
# Aggregate report left disabled: print(classification_report(y_test, y_pred))

In [40]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Per-class scores (no averaging), ordered as in `labels`
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(
    y_test,
    y_pred,
    average=None,
    labels=[0, 1],
)

# 2x2 confusion matrix: rows are true labels, columns are predicted labels
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Flatten row-major: [0,0], [0,1], [1,0], [1,1] -> class-0 view of the counts
TP_0, FN_0, FP_0, TN_0 = conf_matrix.ravel()

# The class-1 view swaps the positive and negative roles
TP_1, FN_1 = TN_0, FP_0
FP_1, TN_1 = FN_0, TP_0

# Fraction of correct predictions, from each class's perspective
correct_0, correct_1 = TP_0 + TN_0, TP_1 + TN_1
accuracy_class_0 = correct_0 / (correct_0 + FP_0 + FN_0)
accuracy_class_1 = correct_1 / (correct_1 + FP_1 + FN_1)

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')



Class 0 - Precision: 0.9788764044943821, Recall: 0.9432654828930273, Accuracy: 0.9602678571428571, F1-score: 0.960741067490075, Support: 2309
Class 1 - Precision: 0.9419068736141907, Recall: 0.9783509903270382, Accuracy: 0.9602678571428571, F1-score: 0.9597830998644374, Support: 2171


# SVM with TF-IDF

In [41]:
from sklearn.svm import SVC

# Support-vector classifier with library-default settings, fitted on the
# TF-IDF training features.
svm_model = SVC().fit(X_train_vect_tf_idf, y_train)

In [None]:
# SVM predictions for the held-out split.
y_pred = svm_model.predict(X=X_test_vect_tf_idf)


In [None]:
# Text summary of precision / recall / F1 per class plus the averages.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [43]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# Per-class precision / recall / F1 / support, ordered as in `labels`
precision_class, recall_class, f1_class, support_class = precision_recall_fscore_support(
    y_test, y_pred, average=None, labels=[0, 1]
)

# 2x2 confusion matrix: rows are true labels, columns are predicted labels
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# With class 0 as the "positive" class, unpack the matrix row by row
(TP_0, FN_0), (FP_0, TN_0) = conf_matrix

# With class 1 as the "positive" class the four roles are mirrored
TP_1, FN_1, FP_1, TN_1 = TN_0, FP_0, FN_0, TP_0

# Share of correct predictions, computed from each class's point of view
total_0 = TP_0 + TN_0 + FP_0 + FN_0
total_1 = TP_1 + TN_1 + FP_1 + FN_1
accuracy_class_0 = (TP_0 + TN_0) / total_0
accuracy_class_1 = (TP_1 + TN_1) / total_1

print(f'Class 0 - Precision: {precision_class[0]}, Recall: {recall_class[0]}, Accuracy: {accuracy_class_0}, F1-score: {f1_class[0]}, Support: {support_class[0]}')
print(f'Class 1 - Precision: {precision_class[1]}, Recall: {recall_class[1]}, Accuracy: {accuracy_class_1}, F1-score: {f1_class[1]}, Support: {support_class[1]}')



Class 0 - Precision: 0.9903448275862069, Recall: 0.9328713728886964, Accuracy: 0.9607142857142857, F1-score: 0.9607493309545049, Support: 2309
Class 1 - Precision: 0.9327548806941431, Recall: 0.9903270382312298, Accuracy: 0.9607142857142857, F1-score: 0.9606791778373548, Support: 2171
