# Build an LSTM Model for Sentiment Analysis

### Read the JSON File and Convert It to CSV

In [4]:
import pandas as pd
import json

# Read the review file one line at a time (each line is parsed as its own
# JSON document) and keep only the leading records.
reviews = []
with open('yelp_academic_dataset_review.json', 'r', encoding='utf-8') as f:
    for line_index, line in enumerate(f):
        # Stop once the subset is full; adjust the cap to change the subset size
        if line_index == 1000000:
            break
        reviews.append(json.loads(line))

# Turn the list of parsed records into a DataFrame
reviews_df = pd.DataFrame(reviews)

In [5]:
# Persist the subset as CSV so the following cells can reload it directly.
# index=False leaves the DataFrame index out of the written file.
reviews_df.to_csv('yelp_reviews_subset.csv', index=False)

# Confirmation message for the notebook output
print("Subset of 1 million reviews saved to yelp_reviews_subset.csv")

Subset of 1 million reviews saved to yelp_reviews_subset.csv


### Load and Preprocess the Dataset

In [6]:
import pandas as pd

# Reload the review subset from the CSV file written by the earlier to_csv cell
reviews_df = pd.read_csv('yelp_reviews_subset.csv')

In [7]:
# Show the dataset dimensions as (rows, columns)
print(reviews_df.shape)

(1000000, 9)


In [8]:
def label_sentiment(row):
    """Map a review's star rating to a coarse sentiment label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'stars' entry.

    Returns:
        'negative' for 1 or 2 stars, 'neutral' for exactly 3 stars, and
        'positive' for every other value.
    """
    stars = row['stars']
    if stars in (1, 2):
        return 'negative'
    if stars == 3:
        return 'neutral'
    # Anything that is not 1, 2 or 3 falls through to the positive class.
    return 'positive'

In [9]:
# Convert Star Ratings to Sentiment Labels

# Derive a sentiment label for every review from its star rating
reviews_df['sentiment'] = reviews_df.apply(label_sentiment, axis=1)

# Select relevant columns. The explicit .copy() makes the result an independent
# DataFrame, so assigning to its columns in later cells does not trigger
# pandas' SettingWithCopyWarning.
reviews_df = reviews_df[['text', 'sentiment']].copy()

# Display the distribution of sentiments
print(reviews_df['sentiment'].value_counts())

sentiment
positive    680509
negative    216537
neutral     102954
Name: count, dtype: int64


In [10]:
# Encode the Sentiment Labels

from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes for training. The prediction
# helper later in this notebook decodes them as 0=negative, 1=neutral,
# 2=positive, so the encoder's class order must agree with that list.
le = LabelEncoder()
reviews_df['sentiment'] = le.fit_transform(reviews_df['sentiment'])

In [11]:
# Split the Data

from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews as a test set; random_state=42 is fixed so the split can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(reviews_df['text'], reviews_df['sentiment'], test_size=0.2, random_state=42)


### Tokenize and Pad the Text Data

In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the training texts only; '<OOV>' is the token
# configured for out-of-vocabulary words.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Encode both splits with the fitted tokenizer
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad/truncate every sequence to the same length, using identical settings
# for the training and test splits
max_length = 200
pad_kwargs = {'maxlen': max_length, 'padding': 'post', 'truncating': 'post'}
train_padded = pad_sequences(train_sequences, **pad_kwargs)
test_padded = pad_sequences(test_sequences, **pad_kwargs)


### Build and Train the LSTM Model

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D

# Sentiment classifier: token ids -> embeddings -> LSTM -> 3-way softmax.
model = Sequential([
    # Declare the input shape explicitly. The Embedding `input_length`
    # argument used previously is deprecated in Keras 3; an explicit Input
    # also builds the model up front so model.summary() can report shapes.
    tf.keras.Input(shape=(max_length,)),
    # input_dim matches the tokenizer's num_words=5000 vocabulary cap.
    Embedding(input_dim=5000, output_dim=128),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # One output unit per sentiment class (negative / neutral / positive).
    Dense(3, activation='softmax')
])

# The targets are integer class codes rather than one-hot vectors, hence the
# sparse variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
# Print the model summary (the layers defined in the previous cell)
model.summary()

In [18]:
# Train for 5 epochs with mini-batches of 64 reviews; the returned object is
# kept in `history`.
# NOTE(review): validation_data is the held-out test split, i.e. the same data
# that model.evaluate scores in the evaluation cell, so the final test metrics
# simply repeat the last epoch's validation metrics. Consider a separate
# validation split if these numbers are used to tune the model.
history = model.fit(train_padded, y_train, 
                    epochs=5, 
                    batch_size=64, 
                    validation_data=(test_padded, y_test))

Epoch 1/5
12500/12500 ━━━━━━━━━━━━━━━━━━━━ 3078s 246ms/step - accuracy: 0.7970 - loss: 0.5295 - val_accuracy: 0.8861 - val_loss: 0.2940
Epoch 2/5
12500/12500 ━━━━━━━━━━━━━━━━━━━━ 2941s 235ms/step - accuracy: 0.8879 - loss: 0.2902 - val_accuracy: 0.8929 - val_loss: 0.2750
Epoch 3/5
12500/12500 ━━━━━━━━━━━━━━━━━━━━ 2924s 234ms/step - accuracy: 0.8942 - loss: 0.2726 - val_accuracy: 0.8945 - val_loss: 0.2702
Epoch 4/5
12500/12500 ━━━━━━━━━━━━━━━━━━━━ 3000s 240ms/step - accuracy: 0.8977 - loss: 0.2621 - val_accuracy: 0.8956 - val_loss: 0.2684
Epoch 5/5
12500/12500 ━━━━━━━━━━━━━━━━━━━━ 2988s 239ms/step - accuracy: 0.9001 - loss: 0.2559 - val_accuracy: 0.8953 - val_loss: 0.2672


### Evaluate and Save the Model

In [19]:
# Score the trained model on the test split. The model was compiled with
# metrics=['accuracy'], so evaluate returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(test_padded, y_test)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

6250/6250 ━━━━━━━━━━━━━━━━━━━━ 297s 47ms/step - accuracy: 0.8956 - loss: 0.2657
Loss: 0.26723161339759827
Accuracy: 0.8953049778938293


In [20]:
# Save the trained model to a file with the .h5 extension
# NOTE(review): load_model is imported here but not used in this cell; the
# loading cell further down imports it again from tensorflow.keras.models.
from keras.models import load_model
model.save('yelp_review_Model.h5')
print("Model Saved")



Model Saved


In [21]:
#Save the Tokenizer
import pickle

# Pickle the fitted tokenizer alongside the model so the prediction section
# can reload it and encode new reviews with the same tokenizer object.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Predict Review Sentiment with the Already-Saved Model

### Load the Saved Model and Tokenizer

In [22]:

from tensorflow.keras.models import load_model
import pickle

# Load the saved model from the .h5 file written by the save cell
model = load_model('yelp_review_Model.h5')

# Load the tokenizer that was pickled after training.
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load a tokenizer.pickle that you created yourself or otherwise trust.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)



### Preprocess New Reviews

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_review(review, tokenizer, max_length):
    """Encode a single review as a padded batch ready for the model.

    Args:
        review: Raw review text.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The output of ``pad_sequences`` for a batch holding this one review,
        padded and truncated at the end ('post').
    """
    # texts_to_sequences works on a list of texts, so wrap the review in a list.
    token_ids = tokenizer.texts_to_sequences([review])
    return pad_sequences(
        token_ids,
        maxlen=max_length,
        padding='post',
        truncating='post',
    )

### Make Predictions

In [24]:
import numpy as np

# Function to predict sentiment of a new review
def predict_sentiment(review, model, tokenizer, max_length):
    """Predict the sentiment label of a single new review.

    Args:
        review: Raw review text.
        model: Trained model exposing ``predict``.
        tokenizer: Fitted tokenizer passed through to ``preprocess_review``.
        max_length: Padding/truncation length passed to ``preprocess_review``.

    Returns:
        One of 'negative', 'neutral' or 'positive'.
    """
    # Index i of this list is the label reported for class id i.
    labels = ['negative', 'neutral', 'positive']
    model_input = preprocess_review(review, tokenizer, max_length)
    scores = model.predict(model_input)
    # Report the class with the highest predicted score.
    best_index = np.argmax(scores)
    return labels[best_index]


In [27]:
# Example usage

# Padding length used when the training sequences were built. It is set here
# as well so this prediction section also runs in a fresh kernel that has only
# loaded the saved model and tokenizer, without executing the training cells.
max_length = 200
new_review = "The service was excellent and the food was Average!"
predicted_sentiment = predict_sentiment(new_review, model, tokenizer, max_length)
print(f'Sentiment: {predicted_sentiment}')

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 26ms/step
Sentiment: neutral
