In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences
import re
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, InputLayer, Activation, Flatten, Embedding, LSTM,GRU, Bidirectional, SimpleRNN
from keras.initializers import Constant
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import seaborn as sns
import gensim
import gensim.downloader
from nltk.corpus import brown
import string

In [2]:
import nltk
nltk.download('brown')
nltk.download('punkt')

[nltk_data] Downloading package brown to /home/piyush/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/piyush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
sents = brown.sents()
cleaned_sents = []
for i in sents:
    sentence = []
    for j in i :
        if j not in string.punctuation :
            sentence.append(j.lower())
    cleaned_sents.append(sentence)

In [4]:
model = gensim.models.Word2Vec(cleaned_sents,size=50,window=7,min_count=1)

In [14]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")  

In [18]:
train['reviews']

0        This book was very informative, covering all a...
1        I am already a baseball fan and knew a bit abo...
2        I didn't like this product it smudged all unde...
3        I simply love the product. I appreciate print ...
4        It goes on very easily and makes my eyes look ...
                               ...                        
49995                         it does not work((((((((((((
49996    Really worthless, loud motor with absolutely n...
49997    Don't waste your money on this. It does nothin...
49998    Product does not remove ear wax. No suction, j...
49999    If you wear hearing aids these are great for r...
Name: reviews, Length: 50000, dtype: object

In [6]:
def class_counts(ratings):    
    # find # of elements of each class
    cnts = ratings.value_counts()
    class1_count = cnts[1]
    class2_count = cnts[2]
    class3_count = cnts[3]
    class4_count = cnts[4]
    class5_count = cnts[5]
    
    return class1_count, class2_count, class3_count, class4_count, class5_count

def majority_class_count(ratings):
    return max(class_counts(ratings))


In [7]:
class1_data = train[train['ratings']==1]
class2_data = train[train['ratings']==2]
class3_data = train[train['ratings']==3]
class4_data = train[train['ratings']==4]
class5_data = train[train['ratings']==5]

majority_class_cnt = majority_class_count(train['ratings'])
class1_data = class1_data.sample(majority_class_cnt, random_state=1,replace=True)
class2_data = class2_data.sample(majority_class_cnt, random_state=1,replace=True)
class3_data = class3_data.sample(majority_class_cnt, random_state=1,replace=True)
class4_data = class4_data.sample(majority_class_cnt, random_state=1,replace=True)
class5_data = class5_data.sample(majority_class_cnt, random_state=1,replace=True)

train = pd.concat([class1_data, class2_data, class3_data, class4_data, class5_data], axis=0)

In [13]:
train['reviews']

16477    Brush overheats.  I have bought several and th...
4339     The bottle developed a huge crack within a cou...
49474    4 of the same rings and they don't fit my fing...
16781    I wa really hoping this smelled like almond. H...
48624    I wore them attached to my lanyard. After less...
                               ...                        
29858    Better than triple antibiotic creams. I have a...
43880    We have got one of these for each station - ou...
27885    Really Beautiful color looks great and color i...
2832     Go ahead and buy this, it is perfectly suited ...
6673     Lotion is smooth and rich. The Bath Gel and Sh...
Name: reviews, Length: 165965, dtype: object

In [8]:
def convert_to_lower(text):
    # return the reviews after convering then to lowercase
    lower_text = text.copy()
    for i in range(len(text)):
        lower_text[i] = text[i].lower()
    return lower_text

In [9]:
def remove_punctuation(text):
    #stop_words = set(stopwords.words('english'))
    without_punctuation_text  = text.copy()
    for i in range(len(text)):
        without_punctuation_text[i] = [w for w in text[i] if w.isalpha()]
    return without_punctuation_text

In [10]:
def perform_tokenization(text):
    tokenize_text = text.copy()
    for i in range(len(text)):
        tokenize_text[i] = nltk.word_tokenize(text[i])
    return tokenize_text

In [11]:
def get_dicts(train):
    reviews = train["reviews"].to_list()
    reviews = convert_to_lower(reviews)
    reviews = perform_tokenization(reviews)
    reviews = remove_punctuation(reviews)
    return reviews

In [12]:
review = get_dicts(train)

AttributeError: 'Series' object has no attribute 'lower'

In [None]:
review = review.to_list()


In [None]:
max_length=50
Embedding_dimension=50

In [None]:
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review) 
sequences = tokenizer_obj.texts_to_sequences(review)
word_index = tokenizer_obj.word_index
review_pad = pad_sequences(sequences, maxlen=max_length) 

In [None]:
num_words = len(word_index)+1
embedding_matrix = np.zeros ((num_words, Embedding_dimension))
for word, i in word_index.items():
  try:
    embedding_vector = model.wv[word]
    embedding_matrix[i] = embedding_vector
  except:
    continue 

In [None]:
train_rating_list = train['ratings'].to_list()
Y = [str(i-1) for i in train_rating_list]
y_train = tf.keras.utils.to_categorical(Y,num_classes=5)

In [None]:
def preprocess_test(test_reviews):
  reviews = test_reviews
  reviews = convert_to_lower(reviews)
  reviews = perform_tokenization(reviews)
  reviews = remove_punctuation(reviews)
  return reviews

In [None]:
test_review = preprocess_test(test['reviews'])
test_review = test_review.to_list()
sequences_test = tokenizer_obj.texts_to_sequences(test_review)
test_review_pad = pad_sequences(sequences_test, maxlen=max_length) 

In [None]:
y_test = test['ratings']

In [None]:
def predict(model, test_reviews):
        y_pred = model.predict(test_reviews)
        pred1 = []
        for i in range(len(y_pred)):
            pred1.append(np.argmax(y_pred[i])+1)
        return pred1

In [None]:
def report(y_pred,y_test,Model,modelName):
    print("For model = ",modelName)
    Classification_report = classification_report(y_test,y_pred,target_names=['1','2','3','4','5'])
    cm  = confusion_matrix(test['ratings'],y_pred)
    print("Classification Report : \n",Classification_report)
    print("Heat Map :\n")
    sns.heatmap(cm,cmap="Blues",annot=True,fmt='.4g',xticklabels=['1','2','3','4','5'],yticklabels=['1','2','3','4','5'])

In [None]:
def lstmModel():
    model = Sequential()
    embedding_layer = Embedding(num_words,Embedding_dimension,embeddings_initializer=Constant(embedding_matrix), input_length = max_length, trainable=False)
    model.add(embedding_layer)
    model.add(LSTM(units=64, dropout=0.2,recurrent_dropout=0.2))
    model.add (Dense (64, activation='sigmoid'))
    model.add (Dense (5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
lstm  = lstmModel()
lstm.fit(review_pad, y_train, batch_size=64, epochs=15, validation_split=0.25)


In [None]:
y_pred = predict(lstm, test_review_pad)

In [None]:
report(y_pred, y_test, lstm,"LSTM Model")

In [None]:
def bilstmModel():
    model = Sequential()
    embedding_layer = Embedding(num_words,Embedding_dimension,embeddings_initializer=Constant(embedding_matrix), input_length = max_length, trainable=False)
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(units=64, dropout=0.2,recurrent_dropout=0.2)))
    model.add (Dense (64, activation='sigmoid'))
    model.add (Dense (5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
bilstm  = bilstmModel()
bilstm.fit(review_pad, y_train, batch_size=64, epochs=15, validation_split=0.25)

In [None]:
y_pred = predict(bilstm, test_review_pad)
report(y_pred, y_test, bilstm,"Bi-LSTM Model")

In [None]:
def gruModel():
    model = Sequential()
    embedding_layer = Embedding(num_words,Embedding_dimension,embeddings_initializer=Constant(embedding_matrix), input_length = max_length, trainable=False)
    model.add(embedding_layer)
    model.add(GRU(units= 64,dropout=0.2,recurrent_dropout=0.2))
    model.add (Dense (64, activation='sigmoid'))
    model.add (Dense (5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
gru  = gruModel()
gru.fit(review_pad, y_train, batch_size=64, epochs=15, validation_split=0.25)

In [None]:
y_pred = predict(gru, test_review_pad)
report(y_pred, y_test, gru,"GRU Model")

In [None]:
def bigruModel():
    model = Sequential()
    embedding_layer = Embedding(num_words,Embedding_dimension,embeddings_initializer=Constant(embedding_matrix), input_length = max_length, trainable=False)
    model.add(embedding_layer)
    model.add(Bidirectional(GRU(units= 64,dropout=0.2,recurrent_dropout=0.2)))
    model.add (Dense (64, activation='sigmoid'))
    model.add (Dense (5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
bigru  = bigruModel()
bigru.fit(review_pad, y_train, batch_size=64, epochs=15, validation_split=0.25)

In [None]:
y_pred = predict(bigru, test_review_pad)
report(y_pred, y_test, bigru,"Bidirectional GRU Model")

In [None]:
def rnnModel():
    model = Sequential()
    embedding_layer = Embedding(num_words,Embedding_dimension,embeddings_initializer=Constant(embedding_matrix), input_length = max_length, trainable=False)
    model.add(embedding_layer)
    model.add(SimpleRNN(units= 64,dropout=0.2,recurrent_dropout=0.2))
    model.add (Dense (64, activation='sigmoid'))
    model.add (Dense (5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
rnn  = rnnModel()
rnn.fit(review_pad, y_train, batch_size=64, epochs=15, validation_split=0.25)


In [None]:
y_pred = predict(rnn, test_review_pad)
report(y_pred, y_test, rnn,"RNN Model")