In [6]:
# Import libraries
import pandas as pd
import numpy as np
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.feature_selection import RFE

In [7]:
# Read the rated messages from CSV into a DataFrame and preview it
DATA_FILE = 'flirting_rated.csv'
dataset = pd.read_csv(DATA_FILE)
dataset

Unnamed: 0,polarity,final_messages
0,0.0,Why do u say that Kat
1,0.0,Hey Lizzie what’s ur most played song on Spoti...
2,0.0,Haha drivers license for me too but most recen...
3,0.0,Frank is always on repeat tho
4,0.0,Aw thank youuu I see u like Frank too what is ...
...,...,...
2881,0.0,"i don't watch reality tv, no clue"
2882,0.0,what kind of air guitars?
2883,0.0,i have an epiphone
2884,0.0,but favourite?


In [8]:
# Drop every row that has a missing value in any column
dataset = dataset.dropna()

In [9]:
# Features are the raw message text; the target is the polarity label
x, y = dataset['final_messages'], dataset['polarity']

# Hold out 33% of the rows for testing; fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=53,
)


In [18]:
# Re-select the text rows by the labels' index so features and targets stay
# row-aligned (this only changes anything if y_train / y_test were filtered
# after the split).
x_train = x_train.loc[y_train.index]
x_test = x_test.loc[y_test.index]


In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is learned from the training text
# only, then reused unchanged to encode the test text.
count_vect = CountVectorizer(stop_words='english')
c_train = count_vect.fit_transform(x_train)
c_test = count_vect.transform(x_test)

# A bare expression in the middle of a cell is never displayed, so the
# vocabulary size has to be printed explicitly to be visible.
print(f"Vocabulary size: {len(count_vect.get_feature_names_out())}")

# Dense view of the training count matrix (last expression -> cell output)
c_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [38]:
def cm_pred(classifier, x_test, y_true=None):
    """Score an already-fitted classifier on held-out data.

    Parameters
    ----------
    classifier
        Fitted estimator exposing ``predict``.
    x_test
        Feature matrix handed to ``classifier.predict``.
    y_true
        True labels for ``x_test``. Optional for backward compatibility:
        when omitted, the notebook-level ``y_test`` is used, which is what
        this function always did before the parameter existed.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, x_test, y_true, cm)`` where
        ``accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report`` and ``cm`` from ``confusion_matrix``.
    """
    from sklearn.metrics import (
        accuracy_score,
        classification_report,
        confusion_matrix,
    )

    if y_true is None:
        # Legacy two-argument call: fall back to the notebook global.
        y_true = y_test

    test_pred = classifier.predict(x_test)

    cm = confusion_matrix(y_true, test_pred)
    accuracy = accuracy_score(y_true, test_pred)
    report = classification_report(y_true, test_pred)
    return classifier, accuracy, report, x_test, y_true, cm


def log(x_train, x_test, y_train, y_test):
    """Fit a logistic regression and evaluate it on ``(x_test, y_test)``.

    Returns the ``cm_pred`` tuple
    ``(classifier, accuracy, report, x_test, y_test, cm)``.
    """
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train, y_train)
    # Pass the labels through explicitly so the caller's y_test is the one scored.
    return cm_pred(classifier, x_test, y_test)


def svm(x_train, x_test, y_train, y_test):
    """Fit a linear-kernel SVC and evaluate it on ``(x_test, y_test)``.

    Returns the ``cm_pred`` tuple
    ``(classifier, accuracy, report, x_test, y_test, cm)``.
    """
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(x_train, y_train)
    return cm_pred(classifier, x_test, y_test)


def dtree(x_train, x_test, y_train, y_test):
    """Fit a Gini decision tree and evaluate it on ``(x_test, y_test)``.

    Returns the ``cm_pred`` tuple
    ``(classifier, accuracy, report, x_test, y_test, cm)``.
    """
    classifier = DecisionTreeClassifier(
        criterion='gini',
        max_features='sqrt',
        splitter='best',
        random_state=0,
    )
    classifier.fit(x_train, y_train)
    return cm_pred(classifier, x_test, y_test)


def random(x_train, x_test, y_train, y_test):
    """Fit a 10-tree entropy random forest and evaluate it on ``(x_test, y_test)``.

    Note: the function name shadows the standard-library ``random`` module
    should that module ever be imported in this notebook; the name is kept
    for existing callers.

    Returns the ``cm_pred`` tuple
    ``(classifier, accuracy, report, x_test, y_test, cm)``.
    """
    classifier = RandomForestClassifier(
        n_estimators=10,
        criterion='entropy',
        random_state=0,
    )
    classifier.fit(x_train, y_train)
    return cm_pred(classifier, x_test, y_test)


def nb(x_train, x_test, y_train, y_test):
    """Fit Gaussian naive Bayes and evaluate it on ``(x_test, y_test)``.

    Both matrices are converted with ``.toarray()`` before use, so the
    ``x_test`` element of the returned tuple is the dense array, not the
    matrix that was passed in.

    Returns the ``cm_pred`` tuple
    ``(classifier, accuracy, report, x_test, y_test, cm)``.
    """
    classifier = GaussianNB()
    classifier.fit(x_train.toarray(), y_train)
    return cm_pred(classifier, x_test.toarray(), y_test)


def knn(x_train, x_test, y_train, y_test):
    """Fit a 5-nearest-neighbours classifier and evaluate it on ``(x_test, y_test)``.

    Distance is Minkowski with ``p=2`` (i.e. Euclidean).

    Returns the ``cm_pred`` tuple
    ``(classifier, accuracy, report, x_test, y_test, cm)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(x_train, y_train)
    return cm_pred(classifier, x_test, y_test)


# Train and score every algorithm on the same count-vectorised split.
# Insertion order of this table is the order the models are fitted in.
trainers = {
    'Logistic Regression': log,
    'Linear SVM': svm,
    'Decision Tree': dtree,
    'Random Forest': random,
    'Naive Bayes': nb,
    'K-Nearest Neighbors': knn,
}
models = {
    label: train(c_train, c_test, y_train, y_test)
    for label, train in trainers.items()
}
# Print results.
# The feature-matrix and label slots of each result tuple are unpacked into
# throwaway names on purpose: using `x_test` / `y_test` as loop targets would
# rebind the notebook-level variables (x_test would become the count matrix
# instead of the text Series) and break re-running earlier cells.
# Plain `_` is avoided because IPython uses it for the last cell output.
for clf_name, (classifier, accuracy, report, _features, _labels, cm) in models.items():
    print(f"Classifier: {clf_name}")
    print(f"Accuracy: {accuracy}")
    print(f"Report: {report}")
    print(f"Confusion Matrix:\n{cm}")
    print("---------------------------")

Classifier: Logistic Regression
Accuracy: 0.8609062170706007
Report:               precision    recall  f1-score   support

         0.0       0.86      0.99      0.92       772
         1.0       0.89      0.29      0.44       177

    accuracy                           0.86       949
   macro avg       0.88      0.64      0.68       949
weighted avg       0.87      0.86      0.83       949

Confusion Matrix:
[[766   6]
 [126  51]]
---------------------------
Classifier: Linear SVM
Accuracy: 0.8535300316122234
Report:               precision    recall  f1-score   support

         0.0       0.87      0.97      0.91       772
         1.0       0.71      0.36      0.48       177

    accuracy                           0.85       949
   macro avg       0.79      0.66      0.70       949
weighted avg       0.84      0.85      0.83       949

Confusion Matrix:
[[746  26]
 [113  64]]
---------------------------
Classifier: Decision Tree
Accuracy: 0.8408851422550052
Report:               pr

In [41]:
# Collect one accuracy per classifier into a summary table.
# Built without tuple-unpacking loop targets so that notebook-level names
# such as `x_test` / `y_test` are not rebound as a side effect.
# (pandas is already imported as `pd` in the first cell.)
classifier_names = list(models)
# Each value in `models` is (classifier, accuracy, report, x_test, y_test, cm);
# position 1 is the accuracy.
accuracies = [result[1] for result in models.values()]

acc_df = pd.DataFrame({
    'Classifier': classifier_names,
    'Accuracy': accuracies,
})

print(acc_df)

            Classifier  Accuracy
0  Logistic Regression  0.860906
1           Linear SVM  0.853530
2        Decision Tree  0.840885
3        Random Forest  0.854584
4          Naive Bayes  0.646997
5  K-Nearest Neighbors  0.831401


In [46]:
import plotly.express as px
import plotly.io as pl

# Bar chart of test accuracy per classifier; bars are coloured and labelled
# by their accuracy value.
bar_options = dict(
    x='Classifier',
    y='Accuracy',
    color='Accuracy',
    labels={'Classifier': 'Classifier', 'Accuracy': 'Accuracy'},
    text='Accuracy',
    title='Accuracies based on Classifier',
)
fig = px.bar(acc_df, **bar_options)

# Put each value above its bar, formatted to 3 decimal places
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')

fig.show()