<a href="https://www.kaggle.com/code/humagonen/amazon-reviews-sentiment-analysis-lstm?scriptVersionId=192768471" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Kaggle's Python 3 image (https://github.com/kaggle/docker-python) ships with many
# analytics libraries pre-installed; load two commonly used ones.
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files live in the read-only "../input/" directory.
# Print the full path of every file found under it.
import os

input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Up to 20GB written to the current directory (/kaggle/working/) is preserved as output
# when a version is created with "Save & Run All"; files written to /kaggle/temp/ are
# not saved outside of the current session.

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer    # Tensorflow un Tokenizer fonksiyonu
from tensorflow.keras.preprocessing.sequence import pad_sequences    # modele vereceğimiz squence lerin aynı boyutta olmasını sağlıyor.

## Read Data

The training data from https://www.kaggle.com/datasets/bittlingmayer/amazonreviews/data was converted to CSV with this notebook:

https://www.kaggle.com/code/humagonen/data-conversion-from-bz2-to-csv

This dataset consists of 3.6 million (3,600,000) Amazon customer reviews.

In [None]:
# Load the CSV version of the Amazon reviews training data and preview the first rows.
train_csv = '/kaggle/input/amazon-reviews-csv/amazon_reviews_train.csv'
df = pd.read_csv(train_csv)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Number of rows per label value (class-balance check).
df['label'].value_counts()

## Tokenization

In [None]:
# Review texts are the model input; the label column is the target.
X, y = df['text'].values, df['label'].values

In [None]:
# Vocabulary cap shared by the tokenizer and the Embedding layer's input_dim.
num_words = 15_000
tokenizer = Tokenizer(num_words=num_words)

In [None]:
# Build the tokenizer's vocabulary from every review text in X.
# NOTE(review): this runs before the train/test split further down, so the texts that
# end up in the test set also contribute to the vocabulary.
tokenizer.fit_on_texts(X)

In [None]:
from itertools import islice

# word_index maps each word to its integer index. On a corpus of this size it can hold
# a very large number of entries, so displaying it whole floods the notebook output;
# show its size and only the first 20 entries instead.
print(f"vocabulary size: {len(tokenizer.word_index):,}")
dict(islice(tokenizer.word_index.items(), 20))

In [None]:
# Convert every review text into its sequence of integer word indices.
X_num_tokens = tokenizer.texts_to_sequences(X)

In [None]:
# Raw text of the review at position 100.
X[100]

In [None]:
# The same review (position 100) as its sequence of integer tokens.
print(X_num_tokens[100])

In [None]:
# Invert the word -> index mapping into index -> word so token ids can be decoded.
vocab = tokenizer.word_index
reverse_word_index = dict(zip(vocab.values(), vocab.keys()))

# Example lookup: the word stored under index 2.
word = reverse_word_index[2]
print(word)

In [None]:
# Number of tokens in each tokenized review, collected into a NumPy array.
num_tokens = np.array(list(map(len, X_num_tokens)))
num_tokens

In [None]:
# Average number of tokens per review.
num_tokens.mean()

In [None]:
# Token count of the longest review.
num_tokens.max()

In [None]:
# Position (row index) of the longest review.
num_tokens.argmax()

In [None]:
# Show the longest review. Its position is looked up rather than hard-coded, so the
# cell stays correct if the data or the tokenizer settings change.
X[num_tokens.argmax()]

In [None]:
# Total number of reviews.
len(num_tokens)

In [None]:
# Fraction of reviews with fewer than 162 tokens (the candidate padding length).
# The comparison is reduced inside NumPy; Python's built-in sum() would iterate over
# the multi-million-element array one item at a time.
(num_tokens < 162).mean()

In [None]:
# Fixed sequence length used when padding the reviews and as the model's input length.
max_tokens = 162  

In [None]:
# Number of reviews shorter than max_tokens, counted inside NumPy instead of with
# Python's element-by-element built-in sum().
# Recorded from the original run: 3,461,054 text rows have fewer than 162 tokens.
(num_tokens < max_tokens).sum()

In [None]:
# Number of reviews longer than max_tokens, counted inside NumPy instead of with
# Python's element-by-element built-in sum().
# Recorded from the original run: 130,761 text rows have more than 162 tokens.
(num_tokens > max_tokens).sum()

## Padding

In [None]:
# Bring every sequence to exactly max_tokens entries. The 'pre' modes are the Keras
# defaults, written out here: zeros are prepended to short sequences and excess tokens
# are cut from the start of long ones.
X_pad = pad_sequences(
    X_num_tokens,
    maxlen=max_tokens,
    padding='pre',
    truncating='pre',
)

In [None]:
# Shape of the padded matrix.
X_pad.shape

In [None]:
# Unpadded token ids of the longest review, for comparison with its padded row.
# With its default 'pre' modes, pad_sequences puts zeros in front of short sequences
# and keeps only the last max_tokens tokens of long ones.
np.array(X_num_tokens[num_tokens.argmax()])

In [None]:
# Padded/truncated row of the longest review; the position is looked up instead of
# being hard-coded so it always refers to the actual longest sequence.
X_pad[num_tokens.argmax()]

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the padded reviews as a test set, stratified on the label;
# the fixed random_state makes the split repeatable.
split = train_test_split(X_pad, y, test_size=0.2, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = split

## Modeling

In [None]:
# Size of the vector learned for each word (passed to Embedding as output_dim).
embedding_size = 50

In [None]:
# Sequence length that is passed to the Embedding layer as input_length.
max_tokens

In [None]:
from tensorflow.keras.layers import Embedding, Dropout, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Sequential

# Embedding -> three stacked bidirectional LSTM layers (48, 24, 12 units) with
# Dropout(0.2) in between -> a single sigmoid unit as output.
model = Sequential([
    Embedding(input_dim=num_words,        # total number of words in the vocabulary
              output_dim=embedding_size,  # number of features learned per word
              input_length=max_tokens),   # number of tokens in each document (review)
    Dropout(0.2),

    # First hidden layer
    Bidirectional(LSTM(units=48, return_sequences=True)),
    Dropout(0.2),

    # Second hidden layer
    Bidirectional(LSTM(units=24, return_sequences=True)),
    Dropout(0.2),

    # Third hidden layer
    Bidirectional(LSTM(units=12)),

    Dense(1, activation='sigmoid'),
])


In [None]:
# Adam optimizer with a learning rate of 0.01.
optimizer = Adam(learning_rate=0.01) 

In [None]:
from tensorflow.keras.metrics import Recall

# The metric gets an explicit name so its history keys are always 'recall' and
# 'val_recall'. With the string shortcut Keras names the metric automatically, and
# re-running this cell in the same session can produce 'recall_1', which would leave
# the EarlyStopping monitor ("val_recall") without a matching metric.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=[Recall(name='recall')])

In [None]:
# Print the layer-by-layer overview of the model.
model.summary() 

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once validation recall has not improved for 2 epochs and restore the
# weights of the best epoch. (mode="auto" would also follow the metric's trend.)
early_stop = EarlyStopping(
    monitor="val_recall",
    mode="max",
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Validate on the last 10% of the training data rather than on the test set.
# EarlyStopping selects (and restores) the best epoch from the validation metric, so
# validating on X_test would tune the model on the very data that the evaluation
# cells later report as unseen.
history = model.fit(X_train, y_train,
                    epochs=5,
                    batch_size=256,
                    validation_split=0.1,
                    callbacks=[early_stop])

## Model Evaluation

In [None]:
# Per-epoch metrics recorded during training, as a DataFrame (one row per epoch).
model_loss = pd.DataFrame(model.history.history)
model_loss.head()

In [None]:
# Plot every recorded metric against the epoch number.
model_loss.plot()

In [None]:
# Loss and compiled metric on the training set.
model.evaluate(X_train, y_train)

In [None]:
# Loss and compiled metric on the held-out test set.
model.evaluate(X_test, y_test)

In [None]:
# Force a garbage-collection pass.
# NOTE(review): presumably meant to free memory before the large predictions below — confirm.
import gc
gc.collect()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, roc_auc_score

# Threshold the predicted probabilities at 0.5 to get hard labels for the training set.
train_proba = model.predict(X_train)
y_train_pred = train_proba >= 0.5

# Confusion matrix, a separator line, then the per-class report.
print(
    confusion_matrix(y_train, y_train_pred),
    "-------------------------------------------------------",
    classification_report(y_train, y_train_pred),
    sep="\n",
)

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard labels for the test set.
test_proba = model.predict(X_test)
y_pred = test_proba >= 0.5

# Confusion matrix, a separator line, then the per-class report.
print(
    confusion_matrix(y_test, y_pred),
    "-------------------------------------------------------",
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Predicted probabilities on the test set and the precision/recall pairs they
# produce across all decision thresholds.
y_pred_proba = model.predict(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Draw the curve together with a dashed reference line from (1, 0) to (0, 1).
fig, ax = plt.subplots()
ax.plot([1, 0], [0, 1], 'k--')
ax.plot(recall, precision)
ax.set_xlabel('recall')
ax.set_ylabel('precision')
ax.set_title('precision recall curve')
plt.show()

In [None]:
# Average precision score of the predicted probabilities on the test set.
average_precision_score(y_test, y_pred_proba)

## Save Model and Tokenizer

In [None]:
# Save the trained model to an .h5 file in the current working directory.
model.save('review_amazon_sentiment.h5')

In [None]:
import json

# Serialise the fitted tokenizer to a JSON string and store it next to the model,
# so new texts can later be encoded with the same vocabulary.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', mode='w') as out_file:
    out_file.write(tokenizer_json)

## New text prediction

In [None]:
from tensorflow.keras.models import load_model

# Load the model from the same relative path it was saved to. The save cell writes
# into the current working directory, so a hard-coded '/kaggle/working/' prefix only
# resolves when the notebook happens to run from that directory.
model_review = load_model('review_amazon_sentiment.h5')

In [None]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# tokenizer.json holds the JSON string produced by tokenizer.to_json(), which is the
# input tokenizer_from_json expects, so the file is read as text and passed straight
# through (no json.load / json.dumps round trip). The relative path matches the one
# the file was written to.
with open('tokenizer.json', 'r') as f:
    tokenizer = tokenizer_from_json(f.read())

In [None]:
# Four hand-written sample reviews used to try the saved model on new text.
review1, review2, review3, review4 = (
    "I hated this product, never buying it again!",
    "beautiful! fast shipping and a responsive seller",
    "garbage product, no one should sell such thing",
    "great price for a product like this, definitely buying it again",
)

reviews = [review1, review2, review3, review4]

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Same vocabulary cap and sequence length as used for training.
num_words, max_tokens = 15000, 162

# Encode the sample reviews with the loaded tokenizer and pad them to max_tokens.
# The 'pre' modes are the Keras defaults, written out explicitly.
tokens = tokenizer.texts_to_sequences(reviews)
tokens_pad = pad_sequences(tokens, maxlen=max_tokens, padding='pre', truncating='pre')
tokens_pad.shape

In [None]:
# Hard 0/1 predictions for the sample reviews. The cut-off uses >= 0.5 to match the
# evaluation cells, which assign a probability of exactly 0.5 to the positive class.
(model_review.predict(tokens_pad) >= 0.5).astype("int")

## Thank You!