In [None]:
!pip install pandas keras tensorflow nltk scikit-learn

In [None]:
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import layers
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# NLTK data used by the cleaning step: stop-word list, POS tagger model, WordNet.
# Newer NLTK releases look the tagger up as 'averaged_perceptron_tagger_eng',
# older ones use the un-suffixed name, so both are requested.
for nltk_resource in ('stopwords',
                      'averaged_perceptron_tagger',
                      'averaged_perceptron_tagger_eng',
                      'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Load the first 2,500 rows of the IMDB review CSV.
df = pd.read_csv('/content/IMDBDataset.csv', nrows=2500)

# Encode the label as an integer: positive -> 1, negative -> 0.
# .map is used instead of .replace because replacing strings with ints relies on
# silent object -> int downcasting, which pandas 2.2+ deprecates (FutureWarning).
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# .map turns any label outside the mapping into NaN; fail loudly instead of
# letting NaN reach the model.
assert df['sentiment'].notna().all(), "unexpected value in 'sentiment' column"

In [None]:
# Text normalisation used for both the training data and new inputs
def data_cleaning(text_list, stopwords_rem=False):
    """Lower-case, tokenize and lemmatize every text in ``text_list``.

    Each text is lower-cased, split with NLTK's ``TweetTokenizer``, POS-tagged,
    and every token is lemmatized with ``WordNetLemmatizer``. Tags starting
    with ``NN`` are lemmatized as nouns, tags starting with ``VB`` as verbs,
    and everything else as adjectives. The lemmas are joined back together
    with single spaces.

    Args:
        text_list: Iterable of strings to clean.
        stopwords_rem: When True, lemmas found in NLTK's English stop-word
            list are dropped. Defaults to False (keep every token), which is
            what the previously hard-coded flag did.

    Returns:
        A list of cleaned strings, one per input text, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    tokenizer = TweetTokenizer()
    # Load the stop-word corpus only when filtering is requested; a set gives
    # constant-time membership checks. An empty set filters nothing.
    stopwords_en = set(stopwords.words('english')) if stopwords_rem else set()

    reconstructed_list = []
    for each_text in text_list:
        tokens = tokenizer.tokenize(each_text.lower())
        lemmatized_tokens = []
        for each_token, tag in pos_tag(tokens):
            # Translate the tagger's tag prefix into the one-letter POS code
            # that WordNetLemmatizer.lemmatize expects.
            if tag.startswith('NN'):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'
            lemmatized_token = lemmatizer.lemmatize(each_token, pos)
            if lemmatized_token not in stopwords_en:
                lemmatized_tokens.append(lemmatized_token)
        reconstructed_list.append(' '.join(lemmatized_tokens))
    return reconstructed_list


# Break data down into a training set and a testing set
X = df['review']
y = df['sentiment']
# Fixed seed so the split, and everything fitted on it, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit and transform the data
X_train = data_cleaning(X_train)
X_test = data_cleaning(X_test)
tokenizer = Tokenizer()
# The vocabulary is learned from the training texts only.
tokenizer.fit_on_texts(X_train)
# +1 because word indices start at 1; index 0 is reserved (used for padding).
vocab_size = len(tokenizer.word_index) + 1
print(f'Vocab Size: {vocab_size}')

# Every review is forced to exactly this many token ids. pad_sequences pads and
# truncates at the front by default, so longer reviews keep their last tokens.
max_len = 40
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# One-hot encode the 0/1 labels to match the two-unit softmax output.
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Create an LSTM model with an Embedding layer and fit training data
model = Sequential([
    # An explicit Input layer fixes the sequence length; it replaces the
    # Embedding `input_length` argument, which Keras 3 deprecates.
    layers.Input(shape=(40,), dtype='int32'),
    layers.Embedding(input_dim=vocab_size, output_dim=100),
    layers.Bidirectional(layers.LSTM(128)),
    # Two softmax units: one probability per one-hot encoded class.
    layers.Dense(2, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# NOTE(review): the test split is also used as validation data, so the reported
# validation accuracy is not an independent hold-out estimate.
model.fit(X_train, y_train, batch_size=256, epochs=10,
          validation_data=(X_test, y_test))

In [None]:
# Sample review text that the cells below clean, encode and classify
new_sentence = "I really hated the movie, it was awful!"

# Turn one raw sentence into the padded id sequence the model consumes
def preprocess_input(sentence):
    """Clean, encode and pad a single sentence.

    The sentence is wrapped in a one-element list and passed through
    ``data_cleaning``, converted to integer ids with the notebook-level
    ``tokenizer``, and padded/truncated to length 40.

    Args:
        sentence: Raw input text.

    Returns:
        The result of ``pad_sequences`` for the single cleaned sentence.
    """
    cleaned_batch = data_cleaning([sentence])
    return pad_sequences(tokenizer.texts_to_sequences(cleaned_batch), maxlen=40)

# Preprocess the input sentence
preprocessed_sentence = preprocess_input(new_sentence)

# Make a prediction: one row of softmax scores, one column per class
prediction = model.predict(preprocessed_sentence)

# Get the predicted class as a plain int (0 = negative, 1 = positive).
# argmax returns a one-element array for the single input row; taking [0]
# avoids comparing an array to a scalar in the `if` below.
predicted_class = int(np.argmax(prediction, axis=-1)[0])

# Interpret the result
if predicted_class == 1:
    print("Positive sentiment")
else:
    print("Negative sentiment")

In [None]:
# Show the padded id sequence for the sample sentence. The name previously used
# here, `input_sequence`, is not defined anywhere in this notebook.
preprocessed_sentence

In [None]:
# Predict on the padded sample sentence. The name previously used here,
# `input_padded`, is not defined anywhere in this notebook.
p = model.predict(preprocessed_sentence)

In [None]:
# Display the raw array returned by model.predict
p

In [None]:
# NOTE(review): numpy is imported only here, at the end of the notebook;
# imports normally belong in the first import cell.
import numpy as np
# argmax without an axis works on the flattened array, so the result is a class
# index only if p holds a single row of scores — TODO confirm p's shape.
np.argmax(p)