<a href="https://colab.research.google.com/github/KiranEC11/Sentiment-analysis-of-movie-reviews/blob/main/sentiment_analysis_training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score,accuracy_score
import pickle

# Using Naive Bayes

In [3]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
dataset = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])

In [4]:
dataset

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
...,...,...
6913,0,Brokeback Mountain was boring.
6914,0,So Brokeback Mountain was really depressing.
6915,0,"As I sit here, watching the MTV Movie Awards, ..."
6916,0,Ok brokeback mountain is such a horrible movie.


In [5]:
stopset = stopwords.words('english')

In [6]:
vectorizer = TfidfVectorizer(use_idf = True,lowercase = True, strip_accents='ascii',stop_words=stopset)

In [7]:
X = vectorizer.fit_transform(dataset.Comments)
y = dataset.Reviews
pickle.dump(vectorizer, open('transform.pkl', 'wb'))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,y_train)

In [10]:
accuracy_score(y_test,clf.predict(X_test))*100

97.47109826589595

In [11]:
clf = naive_bayes.MultinomialNB()
clf.fit(X,y)

In [12]:
accuracy_score(y_test,clf.predict(X_test))*100

98.77167630057804

In [13]:
filename = 'trained_sentiment_model.pkl'
pickle.dump(clf, open(filename, 'wb'))

# Using LSTM

In [7]:
dataset = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])
dataset.head()

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


## import libraries

In [4]:
!pip install --upgrade keras
!pip install --upgrade tensorflow


Collecting keras
  Downloading keras-2.13.1-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.12.0
    Uninstalling keras-2.12.0:
      Successfully uninstalled keras-2.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.12.0 requires keras<2.13,>=2.12.0, but you have keras 2.13.1 which is incompatible.[0m[31m
[0mSuccessfully installed keras-2.13.1
Collecting tensorflow
  Downloading tensorflow-2.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (524.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.1/524.1 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.14,>=2.13 (from tensor

In [5]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras as keras
from sklearn.metrics import classification_report


In [8]:
## remove stopwords

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
dataset['Comments'] = dataset['Comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
dataset.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book awesome.
1,1,"first clive cussler i've ever read, even books..."
2,1,liked Da Vinci Code lot.
3,1,liked Da Vinci Code lot.
4,1,I liked Da Vinci Code ultimatly seem hold own.


In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [10]:
## lemmatisation

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
dataset['Comments'] = dataset['Comments'].apply(lemmatize_text)
dataset.head()

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book awesome.
1,1,"first clive cussler i've ever read, even book ..."
2,1,liked Da Vinci Code lot.
3,1,liked Da Vinci Code lot.
4,1,I liked Da Vinci Code ultimatly seem hold own.


### check if the data is balanced or not

In [11]:
s = 0.0
for i in dataset['Comments']:
    word_list = i.split()
    s = s + len(word_list)
print("Average length of each comment : ",s/dataset.shape[0])

Average length of each comment :  8.087597571552472


In [12]:
print(s)

55950.0


In [13]:
pos_neg = dataset['Reviews'].value_counts()

print("Percentage of reviews with positive sentiment is "+str(pos_neg[1]/dataset.shape[0]*100)+"%")
print("Percentage of reviews with negative sentiment is "+str(pos_neg[0]/dataset.shape[0]*100)+"%")

Percentage of reviews with positive sentiment is 56.99624168834924%
Percentage of reviews with negative sentiment is 43.00375831165076%


# Encoding Labels and Making Train-Test Splits

In [14]:
reviews = dataset['Comments'].values
labels = dataset['Reviews'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

In [15]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

In [16]:
# Hyperparameters of the model
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 200 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'
# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

# Building the Model

In [17]:
# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          300000    
                                                                 
 bidirectional (Bidirection  (None, 128)               84480     
 al)                                                             
                                                                 
 dense (Dense)               (None, 24)                3096      
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 387601 (1.48 MB)
Trainable params: 387601 (1.48 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
num_epochs = 5
history = model.fit(train_padded, train_labels,
                    epochs=num_epochs, verbose=1,
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
prediction = model.predict(test_padded)
# Get labels based on probability 1 if p>= 0.5 else 0
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

Accuracy of prediction on test set :  0.9895953757225433


In [21]:
filename = 'trained_sentiment_model_LSTM.pkl'
pickle.dump(model, open(filename, 'wb'))