In [14]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score, average_precision_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential


In [2]:
import pandas as pd
import tarfile

# Step 1: Open the tar.gz file
with tarfile.open('/kaggle/input/amazon-reviews/amazon_review_polarity_csv.tgz', 'r:gz') as tar:
    # Step 2: Extract the CSV file from archive
    csv_file = [m for m in tar.getmembers() if m.name.endswith('.csv')][0]
    
    # Step 3: Read CSV with pandas
    with tar.extractfile(csv_file) as f:
        df = pd.read_csv(f, 
                        encoding='utf-8',
                        header=None,
                        names=['polarity', 'title', 'text'],
                        on_bad_lines='warn')

print(df.head())


   polarity                                              title  \
0         2                     Stuning even for the non-gamer   
1         2              The best soundtrack ever to anything.   
2         2                                           Amazing!   
3         2                               Excellent Soundtrack   
4         2  Remember, Pull Your Jaw Off The Floor After He...   

                                                text  
0  This sound track was beautiful! It paints the ...  
1  I'm reading a lot of reviews saying that this ...  
2  This soundtrack is my favorite music of all ti...  
3  I truly like this soundtrack and I enjoy video...  
4  If you've played the game, you know how divine...  


In [3]:
def clean_text(text):
    text = str(text).lower()  # Lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove special characters
    return text

df['clean_text'] = df['text'].apply(clean_text)


In [4]:
df['label']= df['polarity'].map({1:0, 2:1})

In [5]:
max_words = 20000
max_len =100

tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(df['clean_text'])
sequences = tokenizer.texts_to_sequences(df['clean_text'])
word_index = tokenizer.word_index

X = pad_sequences(sequences, maxlen=max_len, padding= 'post', truncating = 'post')
y = df['label'].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
sentences = [text.split() for text in df['clean_text']]

In [7]:
from gensim.models import Word2Vec
embed_dim=100
wv_model=Word2Vec(sentences, vector_size=embed_dim,window=5, min_count=4, workers =5)

In [8]:
embed_matrix=np.zeros((max_words, embed_dim))
for word,i in tokenizer.word_index.items():
    if i<max_words:
        if word in wv_model.wv:
            embed_matrix[i]= wv_model.wv[word]

In [9]:
model = Sequential()
model.add(Embedding(input_dim=max_words,
                   output_dim = embed_dim,
                   weights=[embed_matrix],
                   trainable = False))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))

2025-06-10 19:25:53.931511: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [10]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics =['accuracy'])
history = model.fit(X_train[:5000], y_train[:5000], epochs=18, batch_size=64,validation_split=0.1)

Epoch 1/18
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 360ms/step - accuracy: 0.6373 - loss: 0.6275 - val_accuracy: 0.8040 - val_loss: 0.4536
Epoch 2/18
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 349ms/step - accuracy: 0.8293 - loss: 0.3956 - val_accuracy: 0.8280 - val_loss: 0.4092
Epoch 3/18
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 350ms/step - accuracy: 0.8680 - loss: 0.3311 - val_accuracy: 0.8260 - val_loss: 0.3800
Epoch 4/18
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 350ms/step - accuracy: 0.9051 - loss: 0.2474 - val_accuracy: 0.8240 - val_loss: 0.3903
Epoch 5/18
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 340ms/step - accuracy: 0.9382 - loss: 0.1737 - val_accuracy: 0.8140 - val_loss: 0.4026
Epoch 6/18
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 345ms/step - accuracy: 0.9609 - loss: 0.1167 - val_accuracy: 0.8260 - val_loss: 0.4882
Epoch 7/18
[1m71/71[

In [11]:
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob>0.5).astype(int).flatten()

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

[1m22500/22500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1894s[0m 84ms/step
Accuracy: 0.8472638888888889
F1 Score: 0.8500704860950158
Confusion Matrix:
 [[298276  61483]
 [ 48487 311754]]


In [12]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.83      0.84    359759
           1       0.84      0.87      0.85    360241

    accuracy                           0.85    720000
   macro avg       0.85      0.85      0.85    720000
weighted avg       0.85      0.85      0.85    720000



In [13]:
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_prob))
print("PR-AUC Score: ", average_precision_score(y_test, y_pred_prob))

ROC-AUC Score: 0.9212353810630568
PR-AUC Score:  0.9166504969532622
