## Amazon Reviews Sentiment Analysis

# Check GPU Availability


In [1]:
import tensorflow as tf
print("GPU Available:", tf.config.list_physical_devices('GPU'))

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Import Required Libraries


In [27]:
import numpy as np
import pandas as pd
import bz2
import re
import tensorflow as tf
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, BatchNormalization, MaxPool1D, LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load and Preprocess Data


In [4]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts

train_labels, train_texts = get_labels_and_texts('amazonreviews/train.ft.txt.bz2')
test_labels, test_texts = get_labels_and_texts('amazonreviews/test.ft.txt.bz2')

In [5]:
len(train_texts)

3600000

In [6]:
len(test_texts)

400000

# Text Cleaning Function


In [7]:
def text_cleaning(text):
    text = text.lower()  
    pattern_punc = r'[^A-Za-z\s]'
    text = re.sub(pattern_punc, '', text).strip()
    return text 

train_texts_clean = [text_cleaning(text) for text in train_texts]
test_texts_clean = [text_cleaning(text) for text in test_texts]


In [8]:
train_texts_clean[0]

'stuning even for the nongamer this sound track was beautiful it paints the senery in your mind so well i would recomend it even to people who hate vid game music i have played the game chrono cross but out of all of the games i have ever played it has the best music it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras it would impress anyone who cares to listen'

# Tokenization and Padding


In [9]:
max_words = 1000  
max_len = 100  

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts_clean)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(train_texts_clean), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(test_texts_clean), maxlen=max_len)


# LSTM Model with Own Embeddings

In [10]:
def build_lstm_model():
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=100, input_length=max_len, trainable=True),
        LSTM(128, return_sequences=True),
        LSTM(64),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [17]:
print("Training LSTM with Own Embedding")
model = build_lstm_model()
model.fit(X_train_seq, train_labels, validation_data=(X_test_seq, test_labels), epochs=5, batch_size=128)
model.evaluate(X_test_seq, test_labels)

Training LSTM with Own Embedding
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.1624835729598999, 0.9370599985122681]

# Load GloVe Embeddings


In [11]:
def load_glove_embedding(glove_path , embedding_dim):
    embedding_index = {}
    with open(glove_path , encoding = 'utf-8') as f :
        for line in f :
            values = line.split()
            word = values[0]
            coef = np.asarray(values[1:] , dtype = 'float64')
            embedding_index[word] = coef
    return embedding_index   

In [12]:
glove_path = 'glove_path/glove.6B.100d.txt'
embedding_dim = 100
embedding_index = load_glove_embedding(glove_path , embedding_dim)

In [18]:
embedding_index['i']

array([-0.046539 ,  0.61966  ,  0.56647  , -0.46584  , -1.189    ,
        0.44599  ,  0.066035 ,  0.3191   ,  0.14679  , -0.22119  ,
        0.79239  ,  0.29905  ,  0.16073  ,  0.025324 ,  0.18678  ,
       -0.31001  , -0.28108  ,  0.60515  , -1.0654   ,  0.52476  ,
        0.064152 ,  1.0358   , -0.40779  , -0.38011  ,  0.30801  ,
        0.59964  , -0.26991  , -0.76035  ,  0.94222  , -0.46919  ,
       -0.18278  ,  0.90652  ,  0.79671  ,  0.24825  ,  0.25713  ,
        0.6232   , -0.44768  ,  0.65357  ,  0.76902  , -0.51229  ,
       -0.44333  , -0.21867  ,  0.3837   , -1.1483   , -0.94398  ,
       -0.15062  ,  0.30012  , -0.57806  ,  0.20175  , -1.6591   ,
       -0.079195 ,  0.026423 ,  0.22051  ,  0.99714  , -0.57539  ,
       -2.7266   ,  0.31448  ,  0.70522  ,  1.4381   ,  0.99126  ,
        0.13976  ,  1.3474   , -1.1753   ,  0.0039503,  1.0298   ,
        0.064637 ,  0.90887  ,  0.82872  , -0.47003  , -0.10575  ,
        0.5916   , -0.4221   ,  0.57331  , -0.54114  ,  0.1076

# Create Embedding Matrix

In [19]:
max_words = 10000
word_index = tokenizer.word_index

embedding_matrix = np.zeros((max_words , embedding_dim))

for word , i in word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None :
            embedding_matrix[i] = embedding_vector
        else :
            embedding_matrix[i] = np.random.rand(embedding_dim)
        

# LSTM Model with GloVe Embeddings

In [21]:
def build_lstm_with_glove():
    model = Sequential([
        Embedding(input_dim=max_words, 
                  output_dim=embedding_dim, 
                  input_length=max_len, 
                  weights=[embedding_matrix],  
                  trainable=False), 
        LSTM(128, return_sequences=True),
        LSTM(64),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [22]:
model = build_lstm_with_glove()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 100, 128)          117248    
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1,166,721
Trainable params: 166,721
Non-trainable params: 1,000,000
_________________________________________________________________


In [23]:
print("Training LSTM with GLOVE Embedding")
model.fit(X_train_seq, train_labels, validation_data=(X_test_seq, test_labels), epochs=5, batch_size=128)
model.evaluate(X_test_seq, test_labels)

Training LSTM with GLOVE Embedding
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.17392566800117493, 0.9326574802398682]

# Hybrid CNN-LSTM Model

In [26]:
def build_hybrid_model():
    model = Sequential([
        Embedding(input_dim=max_words, output_dim=100, input_length=max_len, trainable=True),  
        
        Conv1D(64, 3, activation='relu', padding='same'),  
        BatchNormalization(),
        MaxPool1D(2),

        Conv1D(64, 5, activation='relu', padding='same'),  
        BatchNormalization(),
        MaxPool1D(2),

        LSTM(128, return_sequences=True),  
        LSTM(64),  

        Dropout(0.5),  
        Dense(1, activation='sigmoid')  # طبقة الإخراج للتصنيف الثنائي
    ])

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model    

In [28]:
print("Training hybrid_model" )
model.fit(X_train_seq, train_labels, validation_data=(X_test_seq, test_labels), epochs=5, batch_size=128)
model.evaluate(X_test_seq, test_labels)

Training hybrid_model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.17023691534996033, 0.9340999722480774]