In [None]:
import pandas as pd
import re
import string
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kerol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kerol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kerol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kerol\AppData\Roaming\nltk_data...


True

# Preprocessing

In [4]:
TRAIN_DATA_PATH = 'dataset/cleaned_data.csv'
TEST_DATA_PATH = 'dataset/test_data.csv'

In [5]:
train_df = pd.read_csv(TRAIN_DATA_PATH)
train_df

Unnamed: 0,label,text,cleaned_text
0,1,Expensive Junk: This product consists of a pie...,expensive junk product consists piece thin fle...
1,1,"Toast too dark: Even on the lowest setting, th...",toast too dark even lowest setting toast too d...
2,2,Excellent imagery...dumbed down story: I enjoy...,excellent imagerydumbed story enjoyed disc vid...
3,1,Are we pretending everyone is married?: The au...,pretending everyone married author pretend par...
4,1,Not worth your time: Might as well just use a ...,not worth time might well use knife product ho...
...,...,...,...
999995,2,All Clad Pizza Cutter: The best pizza cutter I...,clad pizza cutter best pizza cutter ever used ...
999996,1,MEH...: Its ok. i dont hate it. i have a short...,meh ok dont hate short torso look really big r...
999997,2,Is what it is: This connector cable is easy to...,connector cable easy use work intended im not ...
999998,1,Sorry--I'm not getting it: Just read this afte...,sorryim not getting read several friend raved ...


In [10]:
train_df['label'].replace({1:0, 2:1},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['label'].replace({1:0, 2:1},inplace=True)


In [11]:
train_df

Unnamed: 0,label,text,cleaned_text
0,0,Expensive Junk: This product consists of a pie...,expensive junk product consists piece thin fle...
1,0,"Toast too dark: Even on the lowest setting, th...",toast too dark even lowest setting toast too d...
2,1,Excellent imagery...dumbed down story: I enjoy...,excellent imagerydumbed story enjoyed disc vid...
3,0,Are we pretending everyone is married?: The au...,pretending everyone married author pretend par...
4,0,Not worth your time: Might as well just use a ...,not worth time might well use knife product ho...
...,...,...,...
999995,1,All Clad Pizza Cutter: The best pizza cutter I...,clad pizza cutter best pizza cutter ever used ...
999996,0,MEH...: Its ok. i dont hate it. i have a short...,meh ok dont hate short torso look really big r...
999997,1,Is what it is: This connector cable is easy to...,connector cable easy use work intended im not ...
999998,0,Sorry--I'm not getting it: Just read this afte...,sorryim not getting read several friend raved ...


In [6]:
train_df['label'].value_counts()

label
2    500024
1    499976
Name: count, dtype: int64

In [22]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Removing Tags
    text = re.sub('#\S+', '', text).strip()

    # Removing Mentions
    text = re.sub('@\S+', '', text).strip()

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    
    # words like "not", "no", "very", "but", and "never" can play a crucial role in determining the sentiment
    words_to_remove = ["not", "no", "never", "neither", "nor", "very", 
                   "really", "too", "extremely", "quite", "but", "however", 
                   "although", "though", "if", "unless", "except"]

    stop_words = [word for word in stop_words if word not in words_to_remove]

    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into text
    cleaned_text = ' '.join(tokens)
    
    # Remove extra whitespace
    cleaned_text = ' '.join(cleaned_text.split())
    
    return cleaned_text

In [25]:
train_df['cleaned_text'] = train_df['text'].apply(clean_text)

In [13]:
from gensim.models import Word2Vec

# Tokenize reviews
tokenized_reviews = [review.split() for review in train_df['cleaned_text'].iloc[:50000]]
word2vec_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Tokenize with Keras
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['cleaned_text'].iloc[:50000])
word_index = tokenizer.word_index

# Create embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
    else:
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))


In [18]:
sequences = tokenizer.texts_to_sequences(train_df['cleaned_text'].iloc[:50000])
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post')
labels = np.array(train_df['label'].iloc[:50000])

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=100,
    trainable=False  # Set to True if fine-tune
))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [19]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 22ms/step - accuracy: 0.5163 - loss: 0.6906 - val_accuracy: 0.5123 - val_loss: 0.6928
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 22ms/step - accuracy: 0.5350 - loss: 0.6824 - val_accuracy: 0.8530 - val_loss: 0.3517
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 23ms/step - accuracy: 0.8661 - loss: 0.3160 - val_accuracy: 0.8796 - val_loss: 0.2861
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 27ms/step - accuracy: 0.8901 - loss: 0.2686 - val_accuracy: 0.8863 - val_loss: 0.2750
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 30ms/step - accuracy: 0.9003 - loss: 0.2495 - val_accuracy: 0.8737 - val_loss: 0.2972


<keras.src.callbacks.history.History at 0x1df05e1f610>

In [None]:
# Tokenize and pad new review
text = "this product is amazing"

new_text = [clean_text(text)]
seq = tokenizer.texts_to_sequences(new_text)
padded = pad_sequences(seq, maxlen=100, padding='post')

# Predict
pred = model.predict(padded)
print("Positive" if pred[0][0] > 0.5 else "Negative")


In [None]:
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)