<a href="https://colab.research.google.com/github/JeanMusenga/PhD-Thesis_2024_Musenga/blob/main/Classifiers_with_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import nltk
import gensim
from spacy.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split

In [6]:
# Seed NumPy's global random number generator so that results can be reproduced.
np.random.seed(42)

In [32]:
# Read the labelled posts from the Excel workbook (path is relative to the working directory).
dataset = pd.read_excel(io='ARPs_and_ProgrammingPosts.xlsx')

In [33]:
# Dimensions of the loaded frame as (rows, columns).
dataset.shape

(14932, 8)

In [34]:
# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,,,,,,
1,I am confused about the architecture of Razor ...,1,,,,,,
2,I have some spring boot microservices and I wa...,1,,,,,,
3,I'm trying to properly design an application a...,1,,,,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,,,,,,


# Tokenization

In [2]:
import string

import spacy

# Load the small English spaCy pipeline used for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")

# Reference collections consulted by the tokenizer when filtering tokens.
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

# Show both collections, separated by a divider line.
print(stop_words)
print("-----------------------------------------------")
print(punctuations)

{'they', 'behind', 'either', 'no', 'except', 'whither', 'hers', 'hence', 'besides', 'while', 'across', 'give', 'what', 'very', 'ever', 'each', 'therein', 'whereupon', 'side', 'formerly', 'see', 'all', 'every', 'amount', 'whenever', 'elsewhere', 'many', 'using', 'bottom', 'these', 'are', 'whence', 'however', 'the', 'eight', 'otherwise', 'ours', 'through', 'so', 'from', 'show', 'and', 'off', 'has', 'put', 'their', 'before', 'call', 'where', 'of', 'indeed', 'an', 'him', 'latter', 'therefore', 'nine', '’ll', 'after', 'me', 'own', 'herself', 'more', 'meanwhile', 'my', 'per', 'serious', 'perhaps', 'somehow', 'via', 'be', 'yourselves', 'done', 'hereby', 'anyway', 'herein', 'becomes', 'in', 'some', 'latterly', 'amongst', 'just', 'namely', 'also', 'yours', 'afterwards', "'ll", 'whether', 'its', '’re', 'doing', 'well', '‘s', 'were', 'only', 'then', 'around', 'on', 'now', 'anything', "'s", 'why', 'few', 'to', 'whereby', 'whatever', 'again', 'hereafter', 'at', 'nobody', 'made', 'who', 'towards', '

In [3]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Turn one raw post into a list of cleaned, lemmatized tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    token's lemma is lower-cased and stripped, and the token is discarded when
    the result is empty, is listed in ``stop_words``, or consists solely of
    characters from ``punctuations``.

    Parameters
    ----------
    sentence : str
        Raw text of a post. Any non-string value (for example the NaN/None that
        pandas yields for an empty spreadsheet cell) is treated as "no text".

    Returns
    -------
    list of str
        The surviving lemmas in document order; ``[]`` for missing or blank input.
    """
    # Guard against missing values: without it a single empty cell would abort
    # a whole ``.apply`` over the column.
    if not isinstance(sentence, str) or not sentence.strip():
        return []

    # Per-character set membership. Testing ``lemma in punctuations`` against
    # the raw string is a substring test, which lets runs such as "--" or "..."
    # through and only drops empty lemmas by accident.
    punct_chars = set(punctuations)

    mytokens = []
    for word in nlp(sentence):
        # Lemmatize, lower-case and trim surrounding whitespace.
        lemma = word.lemma_.lower().strip()
        if not lemma:
            # Whitespace-only tokens (e.g. newlines) collapse to "".
            continue
        if lemma in stop_words:
            continue
        if all(ch in punct_chars for ch in lemma):
            # Punctuation-only token of any length.
            continue
        mytokens.append(lemma)

    # Preprocessed list of tokens.
    return mytokens

In [7]:
# Run every raw post through the tokenizer and store the token lists in the 'Tokens' column.
dataset["Tokens"] = dataset["Post"].apply(func=spacy_tokenizer)

In [31]:
# Preview the frame again to inspect the 'Tokens' column.
dataset.head()

Unnamed: 0,Post,Class,Tokens,W2V_Vect,Tf-IDF_vect,BoW_vect,fastText_vect,GloVe_vect
0,Here's the architecture:\n<ol>\n<li>Input comp...,1,"[architecture, ol, li, input, component, recei...",,,,,
1,I am confused about the architecture of Razor ...,1,"[confused, architecture, razor, pages, asp.net...",,,,,
2,I have some spring boot microservices and I wa...,1,"[spring, boot, microservice, want, use, micros...",,,,,
3,I'm trying to properly design an application a...,1,"[try, properly, design, application, accord, c...",,,,,
4,I heard that for .NET8 Microsoft gifted us wit...,1,"[hear, .net8, microsoft, gift, totally, quot;f...",,,,,


In [8]:
# Target labels come straight from the 'Class' column.
y = dataset["Class"]
# Model input: each post's token list glued back into one space-separated string.
x = dataset["Tokens"].map(lambda tokens: " ".join(tokens))

In [None]:
#dataset.head()

In [10]:
# Hold out 33% of the samples for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=125,
)

# Feature Engineering with TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=2 keeps only terms that occur in at least two training documents.
# The vectorizer is fitted on the training split alone and then applied
# unchanged to both splits; .toarray() turns each result into a dense array.
# NOTE(review): TfidfVectorizer applies its own default tokenization to the
# joined strings, so the resulting terms may differ from the spaCy tokens.
tf_idf_vector = TfidfVectorizer(min_df=2)
tf_idf_vector.fit(X_train)
X_train_vect_tf_idf, X_test_vect_tf_idf = (
    tf_idf_vector.transform(split).toarray() for split in (X_train, X_test)
)

# Naive Bayes -- TFIDF

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [14]:
# Fit a Gaussian Naive Bayes classifier on the TF-IDF training matrix.
nb_model = GaussianNB().fit(X_train_vect_tf_idf, y_train)

In [15]:
# Predict the held-out split with the Naive Bayes model and print per-class metrics.
y_pred_tf_idf = nb_model.predict(X_test_vect_tf_idf)
print(
    classification_report(y_true=y_test, y_pred=y_pred_tf_idf)
)

              precision    recall  f1-score   support

           0       0.83      0.80      0.82      2544
           1       0.79      0.83      0.81      2384

    accuracy                           0.81      4928
   macro avg       0.81      0.81      0.81      4928
weighted avg       0.81      0.81      0.81      4928



# kNN with TF-IDF

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [19]:
# Nearest-neighbour classifier that looks at a single neighbour (n_neighbors=1).
knn_model = KNeighborsClassifier(
    n_neighbors=1,
)
knn_model.fit(X=X_train_vect_tf_idf, y=y_train)

In [21]:
# Predict the held-out split with the kNN model and print per-class metrics.
y_pred = knn_model.predict(X_test_vect_tf_idf)
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      2544
           1       0.85      0.82      0.84      2384

    accuracy                           0.84      4928
   macro avg       0.84      0.84      0.84      4928
weighted avg       0.84      0.84      0.84      4928



# Decision Tree with TF-IDF

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Pin the estimator's own random_state. With the default (None) the tree draws
# from NumPy's global random generator, so the fitted tree -- and the scores
# reported below -- would depend on how much randomness earlier cells consumed
# and on the order in which cells were executed.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model = dt_model.fit(X_train_vect_tf_idf, y_train)

In [24]:
# Predict the held-out split with the decision tree and print per-class metrics.
y_pred = dt_model.predict(X_test_vect_tf_idf)
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           0       0.95      0.94      0.95      2544
           1       0.94      0.95      0.95      2384

    accuracy                           0.95      4928
   macro avg       0.95      0.95      0.95      4928
weighted avg       0.95      0.95      0.95      4928



# Logistic Regression with TF-IDF

In [25]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression (C=5) fitted with the liblinear solver.
# liblinear uses randomness internally, so random_state is pinned to keep the
# fitted coefficients independent of cell execution order.
lg_model = LogisticRegression(
    C=5,
    penalty='l1',
    solver='liblinear',
    random_state=42,
)
lg_model = lg_model.fit(X_train_vect_tf_idf, y_train)

In [26]:
# Predict the held-out split with the logistic regression model and print per-class metrics.
y_pred = lg_model.predict(X_test_vect_tf_idf)
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      2544
           1       0.94      0.98      0.96      2384

    accuracy                           0.96      4928
   macro avg       0.96      0.96      0.96      4928
weighted avg       0.96      0.96      0.96      4928



# SVM with TF-IDF

In [27]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings, trained on the TF-IDF features.
svm_model = SVC().fit(X_train_vect_tf_idf, y_train)

In [28]:
# Predict the held-out split with the SVM and print per-class metrics.
y_pred = svm_model.predict(X_test_vect_tf_idf)
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           0       0.99      0.93      0.96      2544
           1       0.93      0.99      0.96      2384

    accuracy                           0.96      4928
   macro avg       0.96      0.96      0.96      4928
weighted avg       0.96      0.96      0.96      4928

