<a href="https://colab.research.google.com/github/JawDri/Kaggle-Competitions/blob/master/Sentiment_Analysis_on_Movie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np 
import pandas as pd 
import os

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')



In [0]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from bs4 import BeautifulSoup
import re

In [0]:

import random
import tensorflow
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,Dropout,Embedding,LSTM,Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

Load dataset

In [0]:
# Both competition files are tab-separated and are read straight from their zip archives.
read_options = {"sep": "\t"}
train = pd.read_csv("../content/train.tsv.zip", **read_options)
test = pd.read_csv("../content/test.tsv.zip", **read_options)

In [5]:
# Preview the first rows of the training frame (PhraseId, SentenceId, Phrase, Sentiment).
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
# (rows, columns) of the training frame.
train.shape

(156060, 4)

In [7]:
# Preview the first rows of the test frame; it has no Sentiment column.
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [8]:
# (rows, columns) of the test frame.
test.shape

(66292, 3)

Clean, Tokenize and Lemmatize


1.     remove html content
2.     remove non-alphabetic characters
3.     tokenize the sentences
4.     lemmatize each word to its lemma






In [0]:
# tqdm is a progress-bar library with good support for nested loops and Jupyter/IPython notebooks
# (the name comes from the Arabic word "taqaddum", meaning "progress").
from tqdm import tqdm
def clean_sentences(df):
    """Turn every phrase in ``df['Phrase']`` into a list of lemmatized, lower-case tokens.

    Each phrase goes through four steps: strip HTML markup, replace every
    non-alphabetic character with a space, tokenize, and lemmatize each token.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Phrase'`` column holding the raw review text.

    Returns
    -------
    list of list of str
        One token list per row of ``df``, in the same order.
    """
    reviews = []

    for sent in tqdm(df['Phrase']):

        if not isinstance(sent, str):
            # A missing phrase (NaN) becomes an empty review instead of crashing BeautifulSoup.
            sent = "" if pd.isna(sent) else str(sent)

        # Remove HTML content. The parser is named explicitly so the extracted text
        # does not depend on which optional parser happens to be installed.
        review_text = BeautifulSoup(sent, "html.parser").get_text()

        # Replace non-alphabetic characters with spaces.
        review_text = re.sub("[^a-zA-Z]"," ", review_text)

        # Tokenize the lower-cased sentence.
        words = word_tokenize(review_text.lower())

        # Reduce each word to its lemma.
        lemma_words = [lemmatizer.lemmatize(i) for i in words]

        reviews.append(lemma_words)

    return reviews

In [10]:
import nltk

# Download the NLTK resources before the tokenizer and lemmatizer are used below.
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

# Clean the train and test phrases, then report how many reviews each split produced.
train_sentences, test_sentences = clean_sentences(train), clean_sentences(test)
for cleaned in (train_sentences, test_sentences):
    print(len(cleaned))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


100%|██████████| 156060/156060 [01:17<00:00, 2004.61it/s]
100%|██████████| 66292/66292 [00:31<00:00, 2076.33it/s]

156060
66292





Collect the target values (the `Sentiment` column) and one-hot encode them with `to_categorical`

In [0]:
# Integer sentiment labels -> one-hot matrix; the number of columns is the number of classes.
target = train["Sentiment"].values
y_target = to_categorical(target)
_, num_classes = y_target.shape

Split train/validation

In [0]:
# Hold out 20% of the cleaned reviews for validation, stratified on the labels.
# random_state is fixed so the split is identical on every run; the vocabulary and
# maximum length computed later from X_train depend on which rows land in the training split.
X_train, X_val, y_train, y_val = train_test_split(
    train_sentences,
    y_target,
    test_size=0.2,
    stratify=y_target,
    random_state=42,
)

Count the unique words and find the maximum review length (in tokens) in the cleaned training reviews

In [13]:
unique_words = set()
len_max = 0

# Single pass over the training reviews: collect the vocabulary and track the longest review.
for sent in tqdm(X_train):
    unique_words.update(sent)
    len_max = max(len_max, len(sent))

# The size of the set is the number of distinct words.
print('\n', len(unique_words))
print(len_max)

100%|██████████| 124848/124848 [00:00<00:00, 555361.61it/s]


 13739
48





Tokenize and convert to padded integer sequences

In [14]:
# Fit the tokenizer on the training reviews only, capped at the training vocabulary size.
tokenizer = Tokenizer(num_words=len(unique_words))
tokenizer.fit_on_texts(list(X_train))

# Map each token list to integer ids, then pad/truncate to len_max:
# LSTM layers need every input sequence to have the same length.
X_train, X_val, X_test = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=len_max)
    for texts in (X_train, X_val, test_sentences)
)

print(X_train.shape, X_val.shape, X_test.shape)

(124848, 48) (31212, 48) (66292, 48)


Callback (early stopping)

In [0]:
# Stop training when validation accuracy fails to improve by at least 0.001 for one epoch.
# The models are compiled with metrics=['accuracy'], which tf.keras 2.x logs as
# 'val_accuracy'; the previous key 'val_acc' is not produced there, so the callback
# could never trigger. mode='max' states explicitly that higher is better.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.001,
    patience=1,
)
callback = [early_stopping]

Modeling

In [40]:
# Model 1: embedding -> two stacked LSTMs -> dense head with a softmax over the sentiment classes.
model = Sequential()
# One 300-dimensional vector per vocabulary index; inputs are sequences of length len_max.
model.add(Embedding(len(unique_words), 300, input_length=len_max))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))
model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5, return_sequences=False))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras versions reject.
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.005), metrics=['accuracy'])
model.summary()


Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 48, 300)           4121700   
_________________________________________________________________
lstm_18 (LSTM)               (None, 48, 128)           219648    
_________________________________________________________________
lstm_19 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dense_16 (Dense)             (None, 100)               6500      
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 5)                 505       
Total params: 4,397,761
Trainable params: 4,397,761
Non-trainable params: 0
____________________________________________

In [41]:
# Train for up to 6 epochs, evaluating on the validation split after each one.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=256,
    verbose=1,
    callbacks=callback,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [42]:
from tensorflow.keras import regularizers

# Model 2: embedding -> bidirectional LSTM -> LSTM -> small L2-regularised dense layer -> softmax.
# This rebinds `model`, replacing the network built and trained above.
model = Sequential()
model.add(Embedding(len(unique_words), 100, input_length=len_max))
# The bidirectional layer returns the full sequence so the next LSTM can consume it.
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
# Integer division keeps the unit count an int: the same 2 units the float 2.5 was
# truncated to before, without relying on Dense accepting a non-integer size.
model.add(Dense(num_classes // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 48, 100)           1373900   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 48, 300)           301200    
_________________________________________________________________
dropout_11 (Dropout)         (None, 48, 300)           0         
_________________________________________________________________
lstm_21 (LSTM)               (None, 100)               160400    
_________________________________________________________________
dense_18 (Dense)             (None, 2)                 202       
_________________________________________________________________
dense_19 (Dense)             (None, 5)                 15        
Total params: 1,835,717
Trainable params: 1,835,717
Non-trainable params: 0
___________________________________________

Fit the model

In [43]:
# Train the second model for up to 6 epochs, evaluating on the validation split after each one.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=6,
    batch_size=100,
    verbose=1,
    callbacks=callback,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


Submission

In [0]:
#make the predictions with trained model and submit the predictions.
y_pred=model.predict_classes(X_test)

sub_file = pd.read_csv('../content/sampleSubmission.csv',sep=',')
sub_file.Sentiment=y_pred
sub_file.to_csv('Submission.csv',index=False)