In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [12]:
# Load the modelling dataset (first row of the CSV is the header)
df_model_old = pd.read_csv('model_data 2.csv', header=0)

# Build the binary-sentiment frame in a single chain:
#   1. keep only rows whose label is not 1 (the neutral class),
#   2. preserve the original coding under 'old_label',
#   3. append a fresh 'label' column: 0 stays 0, every other value becomes 1
#      (i.e. the positive class is retagged from 2 to 1).
df_model = (
    df_model_old.loc[df_model_old['label'] != 1]
    .rename(columns={'label': 'old_label'})
    .assign(label=lambda frame: np.where(frame['old_label'] == 0, 0, 1))
)

# Class balance before and after the retagging
print('Old Label')
print(df_model['old_label'].value_counts())
print('\nNew Label')
print(df_model['label'].value_counts())

df_model.head()

Old Label
2    100403
0     98812
Name: old_label, dtype: int64

New Label
1    100403
0     98812
Name: label, dtype: int64


Unnamed: 0,old_label,final_text,rating,word_tokens,number_of_words,label
0,0,model may ok sedentary type active get alot jo...,1,"['model', 'may', 'ok', 'sedentary', 'type', 'a...",37,0
1,2,fast read filled unexpected humour profound in...,4,"['fast', 'read', 'filled', 'unexpected', 'humo...",14,1
2,0,bought one charger instruction say light stay ...,2,"['bought', 'one', 'charger', 'instruction', 's...",44,0
3,0,excited find book ostensibly feminism volume n...,2,"['excited', 'find', 'book', 'ostensibly', 'fem...",50,0
4,0,big fan not model suspiscious saw several unit...,2,"['big', 'fan', 'not', 'model', 'suspiscious', ...",45,0


In [13]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# shuffle (and therefore the split) repeatable between runs
df_train, df_test = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
)

# Report (rows, columns) of each partition
print("Train set size:", df_train.shape)
print("Test set size:", df_test.shape)

Train set size: (159372, 6)
Test set size: (39843, 6)


In [14]:
# Target vectors: the binary 'label' column of the train and test partitions
y_train, y_test = df_train["label"], df_test["label"]

In [15]:
import pickle

# Load the pickled `tfidf` object that the following cells use for feature
# extraction. The context manager closes the file handle deterministically
# as soon as loading finishes, instead of leaving it to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only load a "tfidf.pkl" that comes from a trusted source.
with open("tfidf.pkl", "rb") as tfidf_file:
    tfidf = pickle.load(tfidf_file)

In [16]:
# Feature (term) names exposed by the loaded tfidf object: print how many
# there are, followed by the first 50 of them
terms_features = tfidf.get_feature_names_out()
print(len(terms_features))
print(terms_features[:50])

1600
['ability' 'able' 'absolutely' 'account' 'accurate' 'act' 'acting'
 'action' 'actor' 'actual' 'actually' 'adapter' 'add' 'added' 'addition'
 'admit' 'adult' 'adventure' 'advertised' 'advice' 'age' 'ago' 'agree'
 'ahead' 'air' 'album' 'album not' 'alien' 'allow' 'almost' 'alone' 'alot'
 'already' 'also' 'also not' 'always' 'amazing' 'american' 'amount'
 'animal' 'annoying' 'another' 'answer' 'anymore' 'anyone' 'anything'
 'anyway' 'anywhere' 'apart' 'apparently']


In [17]:
# Turn the 'final_text' column of each partition into TF-IDF features.
# Only transform() is called, so both partitions are encoded with the
# vocabulary/weights already stored in the loaded tfidf object.
X_train_tfidf_matrix, X_test_tfidf_matrix = (
    tfidf.transform(df_train["final_text"]),
    tfidf.transform(df_test["final_text"]),
)

In [18]:
# Densify the training TF-IDF matrix into a NumPy array and display it.
# NOTE(review): with 159372 training rows x 1600 terms this is a large dense
# allocation (about 2 GB if the values are float64 — confirm); the un-densified
# X_train_tfidf_matrix may be sufficient for downstream estimators.
X_train_tfidf_vector = X_train_tfidf_matrix.toarray()
X_train_tfidf_vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
# Densify the test TF-IDF matrix into a NumPy array and display it.
# NOTE(review): 39843 test rows x 1600 terms is roughly 0.5 GB as a dense
# array if the values are float64 — confirm; the un-densified
# X_test_tfidf_matrix may be sufficient for downstream estimators.
X_test_tfidf_vector = X_test_tfidf_matrix.toarray()
X_test_tfidf_vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# View the training TF-IDF features as a labelled dataframe (one column per term).
# Reuse the dense array already held in X_train_tfidf_vector rather than calling
# .toarray() on the matrix a second time, which would materialise another full
# dense copy of the same data just to preview five rows.
# NOTE(review): depending on the pandas version / copy settings, `result` may
# share memory with X_train_tfidf_vector — treat it as a read-only view.
result = pd.DataFrame(data=X_train_tfidf_vector, columns=terms_features)
result.head(5)

Unnamed: 0,ability,able,absolutely,account,accurate,act,acting,action,actor,actual,...,wrong,wrote,yeah,year,year ago,year old,yes,yet,young,younger
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114305,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.090891,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    f1_score,
)

# Fit a multinomial Naive Bayes classifier on the dense training features
# (fit() returns the estimator itself, so construction and training chain).
# NOTE(review): MultinomialNB is documented to accept sparse input as well, so
# the dense copies may be avoidable here — confirm before switching.
clf_nb = MultinomialNB().fit(X_train_tfidf_vector, y_train)

# Class predictions for both partitions; the training predictions are
# evaluated in the next cell
y_pred_test_nb = clf_nb.predict(X_test_tfidf_vector)
y_pred_train_nb = clf_nb.predict(X_train_tfidf_vector)

# Hold-out performance: overall accuracy first, then the per-class breakdown
print("Accuracy for Naive Bayes model:", accuracy_score(y_test, y_pred_test_nb))
print(classification_report(y_test, y_pred_test_nb))

Accuracy for Naive Bayes model: 0.8177095098260673
              precision    recall  f1-score   support

           0       0.82      0.81      0.82     19830
           1       0.81      0.83      0.82     20013

    accuracy                           0.82     39843
   macro avg       0.82      0.82      0.82     39843
weighted avg       0.82      0.82      0.82     39843



In [22]:
# Training-set performance of the Naive Bayes classifier, shown so it can be
# compared against the test-set figures. The message names the model that
# actually produced y_pred_train_nb (Naive Bayes, not logistic regression) and
# marks the split so it is not confused with the test accuracy line.
print("Accuracy for Naive Bayes model (train):", accuracy_score(y_train, y_pred_train_nb))

print('Classification Report for Train')
print(classification_report(y_train, y_pred_train_nb))

Accuracy for LR model: 0.8178224531285295
Classification Report for Train
              precision    recall  f1-score   support

           0       0.82      0.81      0.81     78982
           1       0.81      0.83      0.82     80390

    accuracy                           0.82    159372
   macro avg       0.82      0.82      0.82    159372
weighted avg       0.82      0.82      0.82    159372

