In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences
import re
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, InputLayer, Activation, Flatten
from keras.optimizers import Adam
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import seaborn as sns
import gensim
import gensim.downloader

In [2]:
# Show the names of the pretrained embedding models offered by gensim's downloader.
# Iterating the 'models' mapping yields its keys, so no explicit .keys() call is needed.
print(list(gensim.downloader.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [3]:
# Load the pretrained GloVe vectors that preprocess_data uses for its per-token lookups.
# The two commented-out lines are alternative embeddings from the model list printed above.
# word2vec_vectors_emb = gensim.downloader.load('word2vec-google-news-300')
# fasttext_vectors_emb = gensim.downloader.load('fasttext-wiki-news-subwords-300')
glove_vectors_emb = gensim.downloader.load('glove-wiki-gigaword-300')

In [4]:
# Load the train and test splits; the rest of the notebook reads their 'reviews' and
# 'ratings' columns. Both files are looked up relative to the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")  

In [5]:
# Inspect the raw review text column before any preprocessing is applied.
train['reviews']

0        This book was very informative, covering all a...
1        I am already a baseball fan and knew a bit abo...
2        I didn't like this product it smudged all unde...
3        I simply love the product. I appreciate print ...
4        It goes on very easily and makes my eyes look ...
                               ...                        
49995                         it does not work((((((((((((
49996    Really worthless, loud motor with absolutely n...
49997    Don't waste your money on this. It does nothin...
49998    Product does not remove ear wax. No suction, j...
49999    If you wear hearing aids these are great for r...
Name: reviews, Length: 50000, dtype: object

In [6]:
def convert_to_lower(text):
    """Return the reviews with every string converted to lowercase.

    Parameters
    ----------
    text : pandas.Series or sequence of str
        Raw review strings. The input is never modified.

    Returns
    -------
    pandas.Series or list of str
        A Series carrying the same index and name when ``text`` is a Series,
        otherwise a new list in the same order.
    """
    if isinstance(text, pd.Series):
        # Map element-wise instead of indexing with range(len(text)): integer
        # lookups on a Series are label-based, so positional indexing raises
        # KeyError whenever the index is not exactly 0..n-1 (e.g. after
        # filtering, shuffling or slicing the frame).
        return text.map(lambda review: review.lower())
    return [review.lower() for review in text]

In [7]:
def remove_punctuation(text):
    """Delete punctuation from every review.

    Any character that is neither a regex word character nor whitespace is
    removed; everything else (including underscores and spacing) is kept.

    Parameters
    ----------
    text : iterable of str
        Review strings to clean.

    Returns
    -------
    list of str
        The cleaned reviews, in the same order as the input.
    """
    strip_symbols = re.compile(r'[^\w\s]').sub
    return [strip_symbols('', review) for review in text]

In [8]:
def remove_stopwords(text):
    """Drop English stop words from every review.

    Parameters
    ----------
    text : pandas.Series or sequence
        Reviews, each either a list of tokens or a raw string. Raw strings are
        split on whitespace first. The input is never modified.

    Returns
    -------
    pandas.Series or list of list of str
        The remaining tokens per review; a Series with the same index when
        ``text`` is a Series, otherwise a new list.

    Notes
    -----
    The stop-word list comes from ``stopwords.words('english')``, so the NLTK
    stopwords corpus must be available to the kernel.
    """
    stop_words = set(stopwords.words('english'))

    def _without_stopwords(review):
        # Iterating a raw string yields single characters, which would strip
        # letters such as 'a' or 'i' instead of whole words, so split it first.
        words = review.split() if isinstance(review, str) else review
        return [w for w in words if w not in stop_words]

    if isinstance(text, pd.Series):
        # Element-wise map avoids label-based integer lookups, which fail on a
        # Series whose index is not 0..n-1.
        return text.map(_without_stopwords)
    return [_without_stopwords(review) for review in text]

In [9]:
def perform_tokenization(text):
    """Encode each text as a sequence of integer word indices.

    A new Keras ``Tokenizer`` is fitted on ``text`` on every call, so the
    vocabulary used for the encoding is derived from the same texts that are
    being encoded.

    Parameters
    ----------
    text : iterable of str
        Texts to fit on and encode.

    Returns
    -------
    list of list of int
        One index sequence per input text, as produced by
        ``Tokenizer.texts_to_sequences``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    return tokenizer.texts_to_sequences(text)

In [10]:
def get_dicts(train):
    """Build the word-to-index vocabulary for the given texts.

    Parameters
    ----------
    train : iterable of str
        Texts used to fit a fresh Keras ``Tokenizer``.

    Returns
    -------
    dict
        The fitted tokenizer's ``word_index`` mapping.
    """
    vocab_builder = Tokenizer()
    vocab_builder.fit_on_texts(train)
    return vocab_builder.word_index

In [11]:
# def gloveVector():
#     wordMapping = {}
#     fileName = ''
#     with open(fileName, 'r', encoding='UTF-8') as f:
#         for line in f:
#             w_line = line.split()
#             curr_word = w_line[0]
#             wordMapping[curr_word] = np.array(w_line[1:], dtype=np.float64)
#     return (wordMapping)



In [12]:
# word_to_index = get_dicts(train)
# wordMap = gloveVector()
# vocabSize = len(word_to_index)

In [13]:
# Number of token positions per review in the network input: NeuralNetGlove.build_nn
# sizes its input layer as MAX_LENGTH*300.
# NOTE(review): perform_padding uses its own literal 100 and 300 for the row width, so
# changing this value alone leaves the padded data and the input layer out of sync.
MAX_LENGTH = 100 

In [14]:
# def embeddingMatrix():
#     vocab_len = len(word_to_index)
#     embed_vector_len = wordMap['moon'].shape[0]
#     emb_matrix = np.zeros((vocab_len, embed_vector_len))
#     for word, index in word_to_index.items():
#     embedding_vector = wordMap.get(word)
#     if embedding_vector is not None:
#         emb_matrix[index, :] = embedding_vector
#     embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=MAX_LENGTH, weights = [emb_matrix], trainable=False)
#     return embedding_layer


In [15]:
def tokens(text):
    """Split every review into word tokens with ``nltk.word_tokenize``.

    Parameters
    ----------
    text : pandas.Series or sequence of str
        Review strings. The input is never modified.

    Returns
    -------
    pandas.Series or list of list of str
        One token list per review; a Series with the same index when ``text``
        is a Series, otherwise a new list in the same order.
    """
    if isinstance(text, pd.Series):
        # Element-wise map instead of text[i] over range(len(text)): integer
        # lookups on a Series are label-based and raise KeyError when the index
        # is not exactly 0..n-1.
        return text.map(nltk.word_tokenize)
    return [nltk.word_tokenize(review) for review in text]

In [16]:
def perform_padding(data, max_length=100, embedding_dim=300):
    """Flatten each review's token vectors into one fixed-width, zero-padded row.

    Parameters
    ----------
    data : iterable
        One entry per review, each a sequence of per-token embedding vectors.
    max_length : int, optional
        Maximum number of tokens kept per review. Defaults to 100; keep it in
        line with the notebook's ``MAX_LENGTH``.
    embedding_dim : int, optional
        Length of each token vector. Defaults to 300.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(data), max_length * embedding_dim)``.
        Token vectors are laid out back to back in token order; unused trailing
        positions are zero. Empty ``data`` yields shape ``(0, width)``.
    """
    data = list(data)
    width = max_length * embedding_dim
    # Allocate the result once so every row has the same width, even when there
    # are no reviews at all.
    padded = np.zeros((len(data), width), dtype='float64')
    for row, vectors in zip(padded, data):
        # Reviews longer than max_length tokens are truncated; previously they
        # raised a broadcast ValueError when written into the fixed-width row.
        kept = vectors[:max_length]
        if len(kept) == 0:
            continue
        flat = np.concatenate([np.ravel(vector) for vector in kept])[:width]
        row[:flat.size] = flat  # `row` is a view, so this writes into `padded`
    return padded

In [17]:
def preprocess_data(data):
    """Turn the 'reviews' column into the padded embedding matrix for the network.

    Each review is lowercased, stripped of punctuation and tokenized; the
    tokens are then looked up in ``glove_vectors_emb`` and the vectors are
    flattened and zero-padded by ``perform_padding``.

    Parameters
    ----------
    data : mapping or pandas.DataFrame
        Must provide a ``"reviews"`` entry holding the review strings.

    Returns
    -------
    The value returned by ``perform_padding`` for the embedded reviews.
    """
    reviews = data["reviews"]
    reviews = convert_to_lower(reviews)
    reviews = remove_punctuation(reviews)
    reviews = tokens(reviews)

    # Tokens missing from the embedding vocabulary are represented by zeros.
    # The vector is only read downstream, so one shared instance is enough.
    oov_vector = np.zeros(300)
    emb = []
    for review_tokens in reviews:
        vectors = []
        # Only the first MAX_LENGTH tokens fit into the fixed-width model input.
        # Truncating here prevents the padding step from overflowing on long
        # reviews and skips embedding lookups that would be discarded anyway.
        for token in review_tokens[:MAX_LENGTH]:
            try:
                vectors.append(glove_vectors_emb[token])
            except KeyError:
                vectors.append(oov_vector)
        emb.append(vectors)
    return perform_padding(emb)

In [None]:
# Embed and pad the training reviews into the dense matrix fed to the network.
# NOTE(review): the reviews column printed earlier has 50,000 rows and perform_padding
# emits 100*300 float64 values per row, i.e. roughly 12 GB — confirm the kernel has that much RAM.
preprocessed_reviews = preprocess_data(train)

In [None]:
# Run the test reviews through the same preprocessing so their features line up with
# the training matrix.
preprocessed_test_reviews = preprocess_data(test)

In [None]:
def softmax_activation(x, axis=None):
    """Numerically stable softmax.

    Parameters
    ----------
    x : array_like
        Input scores (logits).
    axis : int or None, optional
        Axis along which to normalize. The default ``None`` normalizes over
        all elements together, so the whole output sums to 1.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1. Empty input is returned as an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.exp(x)
    # Softmax is invariant to subtracting a constant; shifting by the maximum
    # keeps every exponent <= 0 so np.exp cannot overflow to inf (which would
    # turn the result into NaN for large scores).
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

In [None]:
class NeuralNetGlove:
    """Feed-forward rating classifier over flattened, padded review embeddings.

    Typical use: construct with the training features and labels, call
    ``build_nn`` to create the Keras model, ``train_nn`` to fit it, then
    ``predict`` to obtain ratings for new feature rows.
    """

    # Size of the softmax output layer and of the one-hot label encoding.
    NUM_CLASSES = 5

    def __init__(self, reviews, ratings):
        """Store the training data; no model exists until ``build_nn`` is called.

        Parameters
        ----------
        reviews : array_like
            Training features, one row per review, as passed to ``model.fit``.
        ratings : sequence
            Class labels in ``0..NUM_CLASSES-1``, as consumed by
            ``to_categorical``.
        """
        self.reviews = reviews
        self.ratings = ratings
        # Defined up front so that using the object before build_nn/train_nn
        # fails with a clear message instead of an AttributeError.
        self.model = None
        self.history = None

    def build_nn(self, hiddenLayers, activationHidden, units=64, learning_rate=0.0001):
        """Create and compile the network, then print its summary.

        Parameters
        ----------
        hiddenLayers : int
            Number of hidden ``Dense`` layers; 0 gives a plain softmax layer.
        activationHidden : str
            Activation used by every hidden layer (e.g. ``'relu'``).
        units : int, optional
            Width of each hidden layer. Defaults to 64.
        learning_rate : float, optional
            Learning rate passed to ``Adam``. Defaults to 0.0001.
        """
        self.model = Sequential()
        self.model.add(InputLayer(input_shape=(MAX_LENGTH*300,)))
        for _ in range(hiddenLayers):
            self.model.add(Dense(units, activation=activationHidden))
        self.model.add(Dense(self.NUM_CLASSES, activation='softmax'))
        self.model.compile(
            loss='categorical_crossentropy',
            optimizer=Adam(learning_rate),
            metrics=['accuracy'],
        )
        self.model.summary()

    def train_nn(self, batch_size, epochs):
        """Fit the model on the stored data, holding out 20% for validation.

        The Keras ``History`` object is kept in ``self.history``; validation
        accuracy is reported by Keras in the per-epoch training log.

        Raises
        ------
        RuntimeError
            If ``build_nn`` has not been called yet.
        """
        self._require_model()
        y_train = tf.keras.utils.to_categorical(self.ratings, num_classes=self.NUM_CLASSES)
        self.history = self.model.fit(
            self.reviews,
            y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=0.2,
        )

    def predict(self, reviews):
        """Return the predicted rating for every row of ``reviews``.

        Returns
        -------
        list of int
            The most probable class index shifted by +1, so labels
            ``0..NUM_CLASSES-1`` come back as ratings ``1..NUM_CLASSES``.

        Raises
        ------
        RuntimeError
            If ``build_nn`` has not been called yet.
        """
        self._require_model()
        y_pred = self.model.predict(reviews)
        # One vectorized argmax over the class axis replaces the per-row loop.
        return (np.argmax(y_pred, axis=1) + 1).tolist()

    def _require_model(self):
        """Raise a descriptive error when the model has not been built."""
        if self.model is None:
            raise RuntimeError("build_nn() must be called before training or predicting")

In [None]:
# Shift the ratings down by one so they can serve as class indices for the 5-way softmax
# (NeuralNetGlove.predict adds the 1 back). Assumes ratings are the values 1-5 — TODO confirm.
# Labels are kept as integers rather than strings: stringified labels only work while every
# rating prints as a bare integer, and break on float-typed ratings such as '4.0'.
train_rating_list = train['ratings'].to_list()
Y = [int(rating) - 1 for rating in train_rating_list]

In [None]:
###### Code to find the model with best training and test set 
def bestModelNN(max_hidden_layers=15, batch_size=64, epochs=15, activations=('sigmoid', 'relu')):
    """Train one network per (activation, hidden-layer count) pair and compare accuracy.

    For every activation, models with ``0 .. max_hidden_layers-1`` hidden
    layers are built and trained on ``preprocessed_reviews`` / ``Y`` and scored
    on ``preprocessed_test_reviews`` against ``test['ratings']``.

    Parameters
    ----------
    max_hidden_layers : int, optional
        Number of depths to try, starting from 0 hidden layers. Defaults to 15.
    batch_size : int, optional
        Batch size passed to ``train_nn``. Defaults to 64.
    epochs : int, optional
        Epochs passed to ``train_nn``. Defaults to 15.
    activations : iterable of str, optional
        Hidden-layer activations to sweep, in order.

    Returns
    -------
    dict
        ``{activation: {'train': [...], 'test': [...]}}`` where each list is
        indexed by the number of hidden layers. 'train' holds the final-epoch
        training accuracy and 'test' the accuracy on the test set.

    Notes
    -----
    NOTE(review): choosing the best model by test accuracy makes that number an
    optimistic estimate; consider selecting on the validation split instead.
    """
    results = {}
    for activation in activations:
        train_accuracies = []
        test_accuracies = []
        for n_hidden in range(max_hidden_layers):
            model = NeuralNetGlove(preprocessed_reviews, Y)
            model.build_nn(hiddenLayers=n_hidden, activationHidden=activation)
            model.train_nn(batch_size, epochs)
            y_pred = model.predict(preprocessed_test_reviews)
            test_accuracies.append(accuracy_score(test['ratings'], y_pred))
            train_accuracies.append(model.history.history['accuracy'][-1])
            # Drop the model and reset Keras' global state so memory does not
            # accumulate across the many models built in this sweep.
            del model
            tf.keras.backend.clear_session()
        results[activation] = {'train': train_accuracies, 'test': test_accuracies}

    # Report every activation; previously the sigmoid results were computed
    # but never shown.
    for activation, scores in results.items():
        print(activation, 'train accuracies:', scores['train'])
        print(activation, 'test accuracies:', scores['test'])
    return results
        



In [None]:
##### Develop Report for the best model and prediction for some examples
def report(y_pred,test,Model):
    """Print evaluation metrics for a trained model and try it on a few sample reviews.

    Parameters
    ----------
    y_pred : sequence
        Predicted ratings for the test set, aligned with ``test['ratings']``.
    test : mapping or pandas.DataFrame
        Test data providing the true ratings under ``'ratings'``.
    Model : object
        Trained model exposing ``predict(features)``; used on the sample reviews.

    Prints the classification report, draws the confusion-matrix heat map and
    prints the predictions for three hand-written example reviews.
    """
    # Pin the label set so the report and the matrix always cover all five
    # ratings in a fixed order, even when a rating never occurs in the data;
    # otherwise the five tick labels / target names no longer line up.
    # Assumes ratings are the integers 1-5 — TODO confirm against the data.
    rating_labels = [1, 2, 3, 4, 5]
    rating_names = [str(rating) for rating in rating_labels]

    Classification_report = classification_report(test['ratings'],y_pred,labels=rating_labels,target_names=rating_names)
    cm  = confusion_matrix(test['ratings'],y_pred,labels=rating_labels)
    print("Classification Report : \n",Classification_report)
    print("Heat Map :\n")
    sns.heatmap(cm,cmap="Blues",annot=True,fmt='.4g',xticklabels=rating_names,yticklabels=rating_names)

    tried_examples = [['I like it but dont think I would buy again.'], ['Nice looking cleaner but way smaller than 2 liters. Not as advertised.'],['Total waste of money, I used all 10 of these and got 0 results from it.']] 
    # Create the pandas DataFrame 
    df = pd.DataFrame(tried_examples, columns = ['reviews']) 
    # preprocess_data takes only the frame; the former second argument
    # (word_to_index) is not defined anywhere in the notebook.
    pre_tried_examples = preprocess_data(df)
    z = Model.predict(pre_tried_examples)
    print("Examples :\n",df)
    print("Predicted Values: \n",z)

In [None]:
# Run the full sweep defined in bestModelNN: one model per hidden-layer count for each
# activation, trained on the preprocessed training data and scored on the test set.
bestModelNN()
