In [58]:
import pandas as pd 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Lambda


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [37]:
# 1 Data Preprocessing
dataset = pd.read_csv('Dataset/test.csv')

if dataset['rate'].apply(lambda x: isinstance(x, str)).any():
    dataset.rate.replace('negative', 0, inplace=True)
    dataset.rate.replace('positive', 1, inplace=True)

dataset = dataset[(dataset['rate'] == 1) | (dataset['rate'] == 0)]

def Clean(text):
    txt = ''
    for i in text:
        if i.isalnum():
            txt = txt + i
        else:
            txt = txt + ' '
    return txt.lower()

dataset.review = dataset.review.apply(Clean)


# 2 Data Lemmatization
def remove_stopword(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

dataset.review = dataset.review.apply(remove_stopword)

def stem_txt(text):
    stemmer = SnowballStemmer('english')
    return " ".join([stemmer.stem(w) for w in text])

dataset.review = dataset.review.apply(stem_txt)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset.rate.replace('negative', 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset.rate.replace('positive', 1, inplace=True)
  dataset.rate.replace('positive', 1, inplace=True)


In [37]:
# 1 Data Preprocessing
dataset = pd.read_csv('Dataset/test.csv')

if dataset['rate'].apply(lambda x: isinstance(x, str)).any():
    dataset.rate.replace('negative', 0, inplace=True)
    dataset.rate.replace('positive', 1, inplace=True)

dataset = dataset[(dataset['rate'] == 1) | (dataset['rate'] == 0)]

def Clean(text):
    txt = ''
    for i in text:
        if i.isalnum():
            txt = txt + i
        else:
            txt = txt + ' '
    return txt.lower()

dataset.review = dataset.review.apply(Clean)


# 2 Data Lemmatization
def remove_stopword(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stop_words]

dataset.review = dataset.review.apply(remove_stopword)

def stem_txt(text):
    stemmer = SnowballStemmer('english')
    return " ".join([stemmer.stem(w) for w in text])

dataset.review = dataset.review.apply(stem_txt)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset.rate.replace('negative', 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset.rate.replace('positive', 1, inplace=True)
  dataset.rate.replace('positive', 1, inplace=True)


In [43]:
# Tokenization and Padding
max_len = 100  # Maximum sequence length (can be adjusted based on data)
tokenizer = Tokenizer(num_words=5000)  # Create tokenizer with a vocabulary size
tokenizer.fit_on_texts(dataset['review'])
sequences = tokenizer.texts_to_sequences(dataset['review'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Prepare labels
sentiment = np.array(dataset.rate.values)

In [59]:
def get_last_output(x):
    return x[:, -1, :]
    
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=128, input_length=max_len),
    LSTM(64,return_sequences = True),
    Lambda(get_last_output),
    Dense(units=1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model (adjust epochs and batch size as needed)
model.fit(padded_sequences, sentiment, epochs=10, batch_size=32)


Epoch 1/10

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.5651 - loss: 0.6935
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.7041 - loss: 0.6859
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.6822 - loss: 0.6703
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.7044 - loss: 0.5995
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.7288 - loss: 0.5455
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.7444 - loss: 0.4644
Epoch 7/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.9006 - loss: 0.2771
Epoch 8/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.9589 - loss: 0.1333
Epoch 9/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1af954413d0>

In [60]:
loss, accuracy = model.evaluate(padded_sequences, sentiment)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.0079
Test Loss: 0.0084, Test Accuracy: 1.0000
