In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn 
import string
import nltk
import torch.nn.functional as F
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC 
from sklearn import svm
from sklearn.neural_network import MLPClassifier 
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

In [2]:
drive.mount("/content/datasets")

Drive already mounted at /content/datasets; to attempt to forcibly remount, call drive.mount("/content/datasets", force_remount=True).


In [3]:
path = "/content/datasets/MyDrive/goodreads.csv"

In [4]:
df = pd.read_csv(path) 

In [5]:
df = df.dropna()

In [6]:
df.isna().sum()

Sentiment    0
Rating       0
dtype: int64

In [7]:
vectorizer = TfidfVectorizer()

In [8]:
X = df["Sentiment"]
y = df["Rating"]

In [9]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.1, random_state=42)

In [10]:
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

In [11]:
mnb = MultinomialNB(fit_prior=False)

In [12]:
mnb.fit(train_X, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=False)

In [13]:
mnb_pred = mnb.predict(test_X) 

In [14]:
accuracy_score(mnb_pred, test_y) 

0.8175

In [15]:
print(classification_report(mnb_pred, test_y) )

              precision    recall  f1-score   support

    negative       0.82      0.82      0.82      1001
    positive       0.82      0.82      0.82       999

    accuracy                           0.82      2000
   macro avg       0.82      0.82      0.82      2000
weighted avg       0.82      0.82      0.82      2000



In [16]:
sgd = SGDClassifier()

In [17]:
sgd.fit(train_X, train_y) 

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [18]:
sgd_pred = sgd.predict(test_X)

In [19]:
accuracy_score(sgd_pred, test_y) 

0.839

In [20]:
print(classification_report(sgd_pred, test_y) )

              precision    recall  f1-score   support

    negative       0.83      0.84      0.84       990
    positive       0.84      0.84      0.84      1010

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



In [21]:
svc = SVC()

In [22]:
svc.fit(train_X, train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
svc_pred = svc.predict(test_X)

In [24]:
accuracy_score(svc_pred, test_y)

0.8425

In [25]:
print(classification_report(svc_pred, test_y))

              precision    recall  f1-score   support

    negative       0.84      0.84      0.84      1001
    positive       0.84      0.84      0.84       999

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



In [26]:
logr = LogisticRegression()

In [27]:
logr.fit(train_X, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
logr_pred = logr.predict(test_X)

In [29]:
accuracy_score(logr_pred, test_y)

0.8395

In [30]:
print(classification_report(logr_pred, test_y))

              precision    recall  f1-score   support

    negative       0.83      0.84      0.84       991
    positive       0.84      0.84      0.84      1009

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



In [31]:
estimators = [
  ("svm", svc),
  ("lr", logr),
  ("clf", sgd),
  ("mnb", mnb)
]

In [32]:
stack_model = StackingClassifier(estimators=estimators, final_estimator=logr)

In [33]:
stack_model.fit(train_X, train_y) 

StackingClassifier(cv=None,
                   estimators=[('svm',
                                SVC(C=1.0, break_ties=False, cache_size=200,
                                    class_weight=None, coef0=0.0,
                                    decision_function_shape='ovr', degree=3,
                                    gamma='scale', kernel='rbf', max_iter=-1,
                                    probability=False, random_state=None,
                                    shrinking=True, tol=0.001, verbose=False)),
                               ('lr',
                                LogisticRegression(C=1.0, class_weight=None,
                                                   dual=False,
                                                   fit_intercept=True,...
                                MultinomialNB(alpha=1.0, class_prior=None,
                                              fit_prior=False))],
                   final_estimator=LogisticRegression(C=1.0, class_weight=None,
        

In [34]:
stack_pred = stack_model.predict(test_X)

In [35]:
accuracy_score(stack_pred, test_y)

0.841

In [36]:
print(classification_report(stack_pred, test_y))

              precision    recall  f1-score   support

    negative       0.84      0.84      0.84      1000
    positive       0.84      0.84      0.84      1000

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000



In [37]:
nltk.download("stopwords")
nltk.download('punkt')
stop = set(stopwords.words("english"))
ps = SnowballStemmer(language='english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
def stop_words(sentiment):
  filtered_words = [word.lower() for word in sentiment.split() if word.lower() not in stop]
  return " ".join(filtered_words)

In [39]:
translator=str.maketrans('','',string.punctuation)

In [40]:
def punctuations(sentiment):
  clean = sentiment.translate(translator)
  return clean

In [41]:
path2 = "/content/datasets/MyDrive/books (2).csv"

In [42]:
test_df = pd.read_csv(path2).drop(columns = ["Unnamed: 0"])

In [43]:
test_df["Sentiment"] = test_df["Sentiment"].apply(stop_words)

In [44]:
test_df["Sentiment"] = test_df["Sentiment"].apply(punctuations) 

In [45]:
mner = vectorizer.transform(test_df["Sentiment"])

In [48]:
test_dfpred = stack_model.predict(mner) 

In [49]:
accuracy_score(test_dfpred, test_df["Rating"]) 

0.8332

In [51]:
print(classification_report(test_dfpred, test_df["Rating"]))

              precision    recall  f1-score   support

    negative       0.86      0.26      0.40      2136
    positive       0.83      0.99      0.90      7864

    accuracy                           0.83     10000
   macro avg       0.85      0.62      0.65     10000
weighted avg       0.84      0.83      0.80     10000

