In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
pd.set_option('display.max_columns',50)

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kritanjalijain/amazon-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/amazon-reviews


In [4]:
df=pd.read_csv('/kaggle/input/amazon-reviews/train.csv')

In [5]:
df2=pd.read_csv('/kaggle/input/amazon-reviews/test.csv')

In [6]:
df.head()

Unnamed: 0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
1,2,Amazing!,This soundtrack is my favorite music of all ti...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [7]:
df.columns=['Rating','Title','Review']
df2.columns=['Rating','Title','Review']

In [8]:
df['Rating'].nunique()

2

In [9]:
df['Sentiment']=df['Rating'].map({1:0,2:1})
df2['Sentiment']=df2['Rating'].map({1:0,2:1})

In [10]:
df2.isnull().sum()

Unnamed: 0,0
Rating,0
Title,24
Review,0
Sentiment,0


In [11]:
df2.dropna(inplace=True)

In [12]:
df.dropna(inplace=True)


In [13]:
df2.duplicated().sum()

np.int64(0)

In [14]:
X_train=df['Review']
y_train=df['Sentiment']
X_test=df2['Review']
y_test=df2['Sentiment']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

model = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=50)),
    ('clf', LogisticRegression())
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[143311  56673]
 [ 72749 127242]]
              precision    recall  f1-score   support

           0       0.66      0.72      0.69    199984
           1       0.69      0.64      0.66    199991

    accuracy                           0.68    399975
   macro avg       0.68      0.68      0.68    399975
weighted avg       0.68      0.68      0.68    399975



In [16]:
feature_names=model.named_steps['tfidf'].get_feature_names_out()
coefficients=model.named_steps['clf'].coef_[0]

In [17]:
top_pos = np.argsort(coefficients)[-10:]
top_neg = np.argsort(coefficients)[:10]

print("🔺 Top Positive Words:", feature_names[top_pos])
print("🔻 Top Negative Words:", feature_names[top_neg])

🔺 Top Positive Words: ['album' 'read' 'little' 'years' 'good' 'recommend' 'life' 'best' 'love'
 'great']
🔻 Top Negative Words: ['money' 'bad' 'didn' 'don' 'did' 'buy' 'thing' 'product' 'better' 'just']


In [18]:
df = df[['Review', 'Sentiment']]
df = df.rename(columns={'Review': 'text', 'Sentiment': 'label'})
df2 = df2[['Review', 'Sentiment']]
df2 = df2.rename(columns={'Review': 'text', 'Sentiment': 'label'})



In [19]:
df.to_csv("amazon_reviews_train.csv", index=False)
df2.to_csv("amazon_reviews_test.csv", index=False)




In [20]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Basic parameters
vocab_size = 10000
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_token = "<OOV>"

# Tokenization
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(df['text'])

sequences = tokenizer.texts_to_sequences(df['text'])
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Encode labels
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = encoder.fit_transform(df['label'])


In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


In [22]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(padded, labels, test_size=0.2, random_state=42)

history = model.fit(X_train, y_train, epochs=5, validation_data=(X_val, y_val))


Epoch 1/5
[1m89995/89995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1334s[0m 15ms/step - accuracy: 0.8999 - loss: 0.2435 - val_accuracy: 0.9307 - val_loss: 0.1791
Epoch 2/5
[1m89995/89995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1341s[0m 15ms/step - accuracy: 0.9356 - loss: 0.1675 - val_accuracy: 0.9344 - val_loss: 0.1704
Epoch 3/5
[1m89995/89995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1324s[0m 15ms/step - accuracy: 0.9424 - loss: 0.1523 - val_accuracy: 0.9355 - val_loss: 0.1676
Epoch 4/5
[1m89995/89995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1334s[0m 14ms/step - accuracy: 0.9468 - loss: 0.1425 - val_accuracy: 0.9354 - val_loss: 0.1694
Epoch 5/5
[1m89995/89995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1371s[0m 15ms/step - accuracy: 0.9497 - loss: 0.1359 - val_accuracy: 0.9348 - val_loss: 0.1719


In [34]:



# Predict
sample = ["love hate realtion with this movie"]
seq = tokenizer.texts_to_sequences(sample)
padded_seq = pad_sequences(seq, maxlen=max_length, padding=padding_type)
print("Prediction:", model.predict(padded_seq))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Prediction: [[0.28138313]]


In [37]:
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()




In [38]:
history2 = model2.fit(X_train, y_train, epochs=5, validation_data=(X_val, y_val))


Epoch 1/5
[1m  886/89995[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m21:21[0m 14ms/step - accuracy: 0.6981 - loss: 0.5405

KeyboardInterrupt: 