In [30]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
df = pd.read_csv('/content/Amazon Review Data Web Scrapping - Amazon Review Data Web Scrapping.csv')
df.head()

Unnamed: 0,Unique_ID,Category,Review_Header,Review_text,Rating,Own_Rating
0,136040,smartTv,Nice one,I liked it,5.0,Positive
1,134236,mobile,Huge battery life with amazing display,I bought the phone on Amazon and been using my...,5.0,Positive
2,113945,books,Four Stars,"Awesome book at reasonable price, must buy ......",4.0,Positive
3,168076,smartTv,Nice quality,good,5.0,Positive
4,157302,books,Nice book,"The book is fine,not bad,contains nice concept...",3.0,Neutral


In [8]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
def clean_text(text):
  text = str(text) # Convert to string to handle non-string types like floats (NaN)
  text = re.sub('[^a-zA-Z]','',text)
  text = text.lower()
  words = text.split()
  words = [stemmer.stem(word) for word in words if word not in stop_words]
  return ''.join(words)

df['clean_text'] = df['Review_text'].apply(clean_text)
df.head()

Unnamed: 0,Unique_ID,Category,Review_Header,Review_text,Rating,Own_Rating,clean_text
0,136040,smartTv,Nice one,I liked it,5.0,Positive,ilikedit
1,134236,mobile,Huge battery life with amazing display,I bought the phone on Amazon and been using my...,5.0,Positive,iboughtthephoneonamazonandbeenusingmysamsungms...
2,113945,books,Four Stars,"Awesome book at reasonable price, must buy ......",4.0,Positive,awesomebookatreasonablepricemustbuy
3,168076,smartTv,Nice quality,good,5.0,Positive,good
4,157302,books,Nice book,"The book is fine,not bad,contains nice concept...",3.0,Neutral,thebookisfinenotbadcontainsniceconceptsandnice...


In [20]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['clean_text']).toarray()
df['Own_Rating_Numerical'] = df['Own_Rating'].map({'Positive': 1, 'Neutral': 0, 'Negative': -1})

# Drop rows where 'Own_Rating_Numerical' is NaN, and apply this to X and y
df_cleaned = df.dropna(subset=['Own_Rating_Numerical'])
y = df_cleaned['Own_Rating_Numerical']
X_indices = df_cleaned.index
X = vectorizer.fit_transform(df_cleaned['clean_text']).toarray()

In [21]:
print(X.shape)
print(y.shape)

(21799, 17611)
(21799,)


In [12]:
print(len(X.shape))

2


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42)

In [22]:
print(X_train)
print(X_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
print(X_train.shape)
print(X_test.shape)

(17440, 17611)
(4360, 17611)


In [16]:
print(y_train.shape)
print(y_test.shape)

(17440,)
(4360,)


In [28]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)
print("Accuracy:",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Accuracy: 0.7926605504587156
              precision    recall  f1-score   support

        -1.0       0.82      0.03      0.05       638
         0.0       0.33      0.00      0.01       282
         1.0       0.79      1.00      0.88      3440

    accuracy                           0.79      4360
   macro avg       0.65      0.34      0.32      4360
weighted avg       0.77      0.79      0.71      4360

