In [1]:
import pandas as pd 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import numpy as np
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Lambda


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [2]:
def load_dataset(path='Dataset/test.csv'):
    dataset = pd.read_csv(path)

    if dataset['rate'].apply(lambda x: isinstance(x, str)).any():
        dataset.rate.replace('negative', 0, inplace=True)
        dataset.rate.replace('positive', 1, inplace=True)

    dataset = dataset[(dataset['rate'] == 1) | (dataset['rate'] == 0)]
    return dataset

# 1 Cleaning 
def Clean(text):
    txt = ''
    for i in text:
        if i.isalnum():
            txt = txt + i
        else:
            txt = txt + ' '
    return txt.lower()

# 2 Removing Stopwords
def remove_stopword(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

# 3 Data Lemmatization or Stemm
def stem_text(text):
    stemmer = SnowballStemmer('english')
    return " ".join([stemmer.stem(w) for w in text])

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(w, pos=wordnet.VERB) for w in text])
    

In [4]:
dataset = load_dataset()
dataset.review = dataset.review.apply(Clean) #1 Clean 
dataset.review = dataset.review.apply(remove_stopword) #2 Remove Stopwords
dataset.review = dataset.review.apply(lemmatize_text) #3 Choose stem_text or lemmatize_text

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset.rate.replace('negative', 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset.rate.replace('positive', 1, inplace=True)
  dataset.rate.replace('positive', 1, inplace=True)


In [6]:
max_len = 100
sentiment = np.array(dataset.rate.values) # commonly known as y_train
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(dataset['review'])
sequences = tokenizer.texts_to_sequences(dataset['review'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post') # commonly known as X_train
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=128, input_length=max_len),
    LSTM(64,return_sequences = True),
    Lambda(lambda x: x[:, -1, :]),
    Dense(units=1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model.fit(padded_sequences, sentiment, epochs=10, batch_size=32)

loss, accuracy = model.evaluate(padded_sequences, sentiment)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

Epoch 1/10





[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.4355 - loss: 0.6954
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.6780 - loss: 0.6869
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.7058 - loss: 0.6638
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.6693 - loss: 0.8629
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.7245 - loss: 0.5938
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.7406 - loss: 0.5614
Epoch 7/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.8199 - loss: 0.4361
Epoch 8/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.9157 - loss: 0.2500
Epoch 9/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [11]:
padded_sequences


array([[ 195, 2122, 1608, ...,  285,  101,  224],
       [1021,   14,   61, ...,    0,    0,    0],
       [  35,  101,  159, ...,    0,    0,    0],
       ...,
       [  58,  566,    9, ..., 3329,  134,  504],
       [ 201,  158,   98, ...,    0,    0,    0],
       [ 245,  471,   13, ...,    0,    0,    0]])

In [18]:
new_text_processed = Clean("I Love this movie")
new_text_processed = remove_stopword(new_text_processed)
new_text_processed = lemmatize_text(new_text_processed)

new_sequence = tokenizer.texts_to_sequences([new_text_processed])
new_sequence = pad_sequences(new_sequence, maxlen=max_len, padding='post')
print(new_sequence)


prediction = model.predict(new_sequence)[0][0]

print(prediction)

if prediction > 0.5:
  print("Sentiment: Positive")
else:
  print("Sentiment: Negative")

[[45  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
0.76483816
Sentiment: Positive
