In [None]:
import pandas as pd
import numpy as np
import os
import glob 
import csv
import ast
import nltk
import string
import re

# Resources used further down: the English stop-word list, the WordNet data for
# the lemmatizer, and the Punkt models that nltk.word_tokenize depends on.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# NOTE(review): recent NLTK releases look for the Punkt models under the id
# 'punkt_tab' instead of 'punkt'; requesting both keeps either version working.
nltk.download('punkt_tab')


from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split



########################################################################################################

'''Read the tab-separated input file that holds the id, name and genres of each movie.'''
def read_Movie_Id_Name_Genre_File(filePath):
    '''Read the tab-separated movie metadata file into a DataFrame.

    For every non-empty row, column 0 is used as the movie id, column 2 as the
    movie name and the last column as the genre dictionary string. Each row is
    turned into one record by get_the_genres_dict_of_a_movie().

    filePath -- path of the tab-separated metadata file.
    Returns a pandas DataFrame with one row per movie.
    '''
    movie_id_name_genre_dicts_list = []

    # newline='' is the mode the csv module asks for; the encoding is spelled
    # out so the result does not depend on the platform default.
    # NOTE(review): assumes the file is UTF-8 encoded - confirm for your copy.
    with open(filePath, newline='', encoding='utf-8') as file:
        movieFile = csv.reader(file, delimiter='\t')
        for row in movieFile:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue

            movie_Id = row[0]
            movie_Name = row[2]
            genreListOfTheMovie = getTheGenresFromString(row[-1])
            movie_id_name_genre_dicts_list.append(
                get_the_genres_dict_of_a_movie(movie_Id, movie_Name, genreListOfTheMovie))

    return pd.DataFrame(movie_id_name_genre_dicts_list)


####################

def getTheGenresFromString(genreString):
    '''Return the list of genre names held in a dictionary-formatted string.

    The string is parsed with ast.literal_eval(), so a value such as
    '{"/m/01jfsb": "Thriller"}' becomes the dictionary {"/m/01jfsb": "Thriller"}.
    Only the dictionary values (the genre names) are returned, in dictionary order.
    '''
    parsedGenres = ast.literal_eval(genreString)
    return list(parsedGenres.values())


#####################

# Genres that become label columns; every other genre name is ignored.
_PRIMARY_GENRES = (
    'Action', 'Adventure', 'Animation', 'Biopic', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-noir',
    'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Prison', 'Reality', 'Romance', 'Science Fiction', 'Sport', 'Thriller', 'War', 'Western',
)


def get_the_genres_dict_of_a_movie(movie_Id, movie_Name, movie_genres_list):
    '''Build the record of one movie: its id, its name and a 1 for each primary genre.

    The input data has compound genres such as "Action/Adventure" or
    "Crime Fiction". Each genre is looked up as a whole first and then piece by
    piece (split on whitespace and "/"). Looking up the whole name matters
    because splitting "Science Fiction" into "Science" and "Fiction" would
    otherwise make that primary genre impossible to match.

    movie_Id, movie_Name -- stored under the keys 'movie_Id' and 'movie_Name'.
    movie_genres_list    -- list of genre names of the movie.
    Returns the dictionary, with one extra key (value 1) per matched primary genre.
    '''
    movie_id_name_genre_dict = {'movie_Id' : movie_Id, 'movie_Name' : movie_Name}
    candidate_genres = []

    for every_genre in movie_genres_list:
        candidate_genres.append(every_genre)
        pieces = [piece for piece in re.split(r'[\s/]+', every_genre) if piece]
        if pieces != [every_genre]:
            # Only add the pieces when splitting actually produced something new.
            candidate_genres.extend(pieces)

    genres_list = create_genres_dict(candidate_genres)

    return add_genre_tuples_to_the_dict(movie_id_name_genre_dict, genres_list)



#####################

def create_genres_dict(genres):
    '''Return the entries of genres that are primary genres (order and duplicates kept).'''
    return [genre for genre in genres if genre in _PRIMARY_GENRES]


######################

def add_genre_tuples_to_the_dict(movie_id_name_genre_dict, genres_list):
    '''Set movie_id_name_genre_dict[genre] = 1 for every genre and return the same dictionary.'''
    for genre in genres_list:
        movie_id_name_genre_dict[genre] = 1

    return movie_id_name_genre_dict





########################################################################################################

def read_movie_plot_summary_file(filePath):
    '''Read the tab-separated plot summary file into a DataFrame.

    Column 0 of each row is the movie id and column 1 the plot summary. The
    summary is run through lemmatizeAndRemoveStopWordsFromTheSummary(), its
    tokens are de-duplicated, and the result is stored as one space-separated
    string.

    filePath -- path of the tab-separated plot summary file.
    Returns a DataFrame with the columns 'movie_Id' and 'plot_summary_tokens'.
    '''
    movie_id_plot_summary_dicts_list = []

    # newline='' is the mode the csv module asks for; the encoding is spelled
    # out so the result does not depend on the platform default.
    # NOTE(review): assumes the file is UTF-8 encoded - confirm for your copy.
    with open(filePath, newline='', encoding='utf-8') as file:
        moviePlotSummaryFile = csv.reader(file, delimiter='\t')

        for row in moviePlotSummaryFile:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue

            # dict.fromkeys() removes duplicates but keeps first-occurrence
            # order, so the token string is the same on every run (the
            # iteration order of a set of strings is not).
            uniqueTokens = dict.fromkeys(lemmatizeAndRemoveStopWordsFromTheSummary(row[1]))

            movie_id_plot_summary_dicts_list.append({
                'movie_Id': row[0],
                'plot_summary_tokens': " ".join(uniqueTokens),
            })

    return pd.DataFrame(movie_id_plot_summary_dicts_list)




#####################

def lemmatizeAndRemoveStopWordsFromTheSummary(plotSummary):
    '''Tokenize a plot summary and return its cleaned, lemmatized words.

    The summary is lower-cased and tokenized with nltk.word_tokenize(). Each
    token is stripped with remove_punctuation_marks(), dropped if it is an
    English stop word or has fewer than three characters left, and finally
    lemmatized as a verb (pos="v").

    plotSummary -- the plot summary text.
    Returns the list of lemmatized words, in order of appearance.
    '''
    wordLemmatizer = WordNetLemmatizer()
    stopWords = set(stopwords.words('english'))
    modified_summary = []

    for word in nltk.word_tokenize(plotSummary.lower()):
        if word in stopWords:
            continue

        cleanedWord = remove_punctuation_marks(word)

        # Filter on the cleaned token. Checking the raw token only would let
        # tokens such as "..." or "1990" through, which are empty after
        # cleaning, as well as stop words that merely carried punctuation.
        if len(cleanedWord) <= 2 or cleanedWord in stopWords:
            continue

        modified_summary.append(wordLemmatizer.lemmatize(cleanedWord, pos = "v"))

    return modified_summary



# Every ASCII punctuation mark except the backslash, plus any whitespace.
# Raw strings keep the regex escapes from being read as (invalid) string escapes.
_PUNCTUATION_OR_SPACE = re.compile(r"[!\"#$%&'()*+,\-./:;<=>?@\[\]^_{|}~`\s]")
_TRAILING_DIGITS = re.compile(r'[0-9]+$')


def remove_punctuation_marks(word):
    '''Return word without punctuation or whitespace and without trailing digits.

    Punctuation and whitespace are removed everywhere in the word; digits are
    removed only from the end of what is left, so "room101" becomes "room" while
    "3d" is unchanged. The backslash is not treated as punctuation.
    '''
    return _TRAILING_DIGITS.sub('', _PUNCTUATION_OR_SPACE.sub('', word))


#######################

def convert_text_data_to_matrix_of_tokens(text_data):
    '''Return a dense DataFrame of token counts for the documents in text_data.

    A CountVectorizer limited to max_features = 3000 is fitted on text_data; the
    resulting count matrix is converted to a dense array and wrapped in a
    DataFrame (default integer column labels).
    '''
    vectorizer_options = {
        'analyzer': 'word',
        'tokenizer': None,
        'preprocessor': None,
        'stop_words': None,
        'max_features': 3000,
    }
    token_counts = CountVectorizer(**vectorizer_options).fit_transform(text_data)
    return pd.DataFrame(token_counts.toarray())





#######################################################################################################################
###################################################################

def merge_the_movieIdNameGenre_plot_Dfs(movie_Id_Name_Genre_Df, movie_plot_Df):
    '''Join the genre frame and the plot frame on their shared 'movie_Id' column.

    The default join type of pandas is used, so only movie ids present in both
    frames appear in the result.
    '''
    merged_df = movie_Id_Name_Genre_Df.merge(movie_plot_Df, on = 'movie_Id')
    return merged_df


####################################################################

def fill_NaNs_with_Zeros(movie_id_name_genre_plot_df):
    '''Return a new frame in which every missing value is replaced by 0.'''
    zero_filled_df = movie_id_name_genre_plot_df.fillna(value = 0)
    return zero_filled_df



###################################################################
def get_transformed_train_test_data(movie_id_name_genre_plot_df):
    '''Separate the plot tokens from the remaining (genre) columns.

    Returns a two-item list:
      [0] the 'plot_summary_tokens' column as a Series,
      [1] a DataFrame with every column other than 'plot_summary_tokens',
          'movie_Id' and 'movie_Name'.
    '''
    non_label_columns = ('plot_summary_tokens', 'movie_Id', 'movie_Name')

    label_columns = []
    for column in movie_id_name_genre_plot_df.columns:
        if column not in non_label_columns:
            label_columns.append(column)

    train_data = movie_id_name_genre_plot_df['plot_summary_tokens']
    test_data = movie_id_name_genre_plot_df[label_columns]

    return [train_data, test_data]


#####################################################################

def _take_rows(data, start, stop):
    '''Return rows start:stop of data by position (uses .iloc when data has it).'''
    positional = getattr(data, 'iloc', data)
    return positional[start:stop]


def get_train_and_label_data(train_data, test_data, split_index = 35000):
    '''Split features and labels into a leading and a trailing part.

    train_data  -- the features (array, list, Series or DataFrame).
    test_data   -- the labels belonging to the same rows.
    split_index -- number of leading rows in the first part (default 35000,
                   the value that used to be hard-coded).

    Both inputs are cut at the same row position. Cutting the labels by index
    label, as before, only matches the feature split when the label index
    happens to be 0..n-1.

    Returns [[train_X, train_Y], [test_X, test_Y]].
    '''
    train_X = _take_rows(train_data, None, split_index)
    train_Y = _take_rows(test_data, None, split_index)

    test_X = _take_rows(train_data, split_index, None)
    test_Y = _take_rows(test_data, split_index, None)

    return [[train_X, train_Y], [test_X, test_Y]]



In [2]:
# Load the movie metadata (id, name, genre indicator columns) and the processed
# plot summaries, then join the two frames on 'movie_Id'.
# NOTE(review): both paths are absolute and machine-specific - adjust before running elsewhere.
movie_id_name_genre_df = read_Movie_Id_Name_Genre_File('/Users/vijay/Downloads/Datasets/MovieSummaries/movie.metadata.tsv')
movie_id_plot_summary_df = read_movie_plot_summary_file('/Users/vijay/Downloads/Datasets/MovieSummaries/plot_summaries.txt')
movie_id_name_genre_plot_df = merge_the_movieIdNameGenre_plot_Dfs(movie_id_name_genre_df, movie_id_plot_summary_df)






In [3]:
# A genre key is only set (to 1) on the records of movies that have that genre,
# so the frame holds NaN for every other movie/genre pair; turn those into 0.
movie_id_name_genre_plot_df_zeros = fill_NaNs_with_Zeros(movie_id_name_genre_plot_df)




In [4]:


# Despite the names, this is a column split, not a row split: train_data is the
# 'plot_summary_tokens' column (model input) and test_data holds the remaining
# genre columns (the labels) for the same rows.
train_test_data = get_transformed_train_test_data(movie_id_name_genre_plot_df_zeros)
train_data = train_test_data[0]
test_data = train_test_data[1]



In [5]:

from collections import Counter

'''
Out of all the words in plot_summary_tokens, only those that occur at least 15
times across all the training samples are used in the BoW representation.
'''

def get_summary_as_dictionary_of_words(summary):
    '''Return a Counter mapping every item of summary to its number of occurrences.'''
    return Counter(summary)


def get_most_repeated_words_from_plot_summary_counter(plot_summary_counter, min_count = 15):
    '''Return the words whose count is at least min_count.

    plot_summary_counter -- mapping of word -> count.
    min_count            -- smallest count a word needs to be kept (default 15).
    The words are returned in the iteration order of the mapping.
    '''
    return [word for word, count in plot_summary_counter.items() if count >= min_count]



def get_bag_of_words_representation_of_all_plots(plot_summary_tokens_series, min_count = 15):
    '''Return the words that occur at least min_count times over all summaries.

    plot_summary_tokens_series -- iterable of whitespace-separated token strings.
    min_count                  -- threshold handed to
                                  get_most_repeated_words_from_plot_summary_counter().

    Every summary, including the first one, is split into words before it is
    counted; counting the first string unsplit would tally its characters.
    An empty input yields an empty list.
    '''
    plot_summary_counter = Counter()

    # update() adds into the running total in place instead of building a new
    # Counter for every summary.
    for summary in plot_summary_tokens_series:
        plot_summary_counter.update(get_summary_as_dictionary_of_words(summary.split()))

    return get_most_repeated_words_from_plot_summary_counter(plot_summary_counter, min_count)

    
    


def remove_non_most_repeated_words_from_summary(most_repeated_words, plot_summary_tokens_series):
    '''Drop every word that is not in most_repeated_words from each summary.

    most_repeated_words        -- iterable of words to keep.
    plot_summary_tokens_series -- sequence of whitespace-separated token strings.

    Returns a NumPy array (built with np.array from the input) holding the
    filtered summaries, each joined with single spaces.
    '''
    # A set makes each membership test constant-time; the vocabulary is checked
    # once per word of every summary.
    words_to_keep = set(most_repeated_words)

    plot_summary_tokens_array = np.array(plot_summary_tokens_series)

    for summary_index, summary in enumerate(plot_summary_tokens_array):
        kept_words = [word for word in summary.split() if word in words_to_keep]
        plot_summary_tokens_array[summary_index] = ' '.join(kept_words)

    return plot_summary_tokens_array



In [6]:

# Collect the frequently occurring words over all plot summaries, then strip
# every other word from each summary.
most_repeated_words = get_bag_of_words_representation_of_all_plots(train_data)
plot_summary = remove_non_most_repeated_words_from_summary(most_repeated_words, train_data)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def tfidf_vectorizer_on_train_and_test_data(plot_summary):
    '''Fit a default TfidfVectorizer on plot_summary and return its TF-IDF matrix.'''
    return TfidfVectorizer().fit_transform(plot_summary)





In [9]:
# TF-IDF representation of the vocabulary-filtered plot summaries.
plot_summary_vectors = tfidf_vectorizer_on_train_and_test_data(plot_summary)


In [31]:

from sklearn.feature_selection import SelectKBest, chi2

def get_top_8000_features_from_plot_summary_vector(plot_summary_vectors, test_data):
    '''Keep the 8000 features selected by SelectKBest with the chi2 score.

    plot_summary_vectors -- feature matrix.
    test_data            -- the targets the features are scored against.
    Returns the feature matrix reduced to the selected columns.
    '''
    feature_selector = SelectKBest(score_func = chi2, k = 8000)
    return feature_selector.fit_transform(plot_summary_vectors, test_data)

In [32]:

# Reduce the TF-IDF matrix to 8000 features scored against the genre labels.
# NOTE(review): test_data holds the labels of all rows, including the rows that
# are later split off for evaluation - confirm this is intended.
plot_summary_vectors_new = get_top_8000_features_from_plot_summary_vector(plot_summary_vectors, test_data)


In [37]:
# Number of rows in the selected-feature matrix (one row per plot summary).
len(plot_summary_vectors_new.toarray())

42204

In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

def standardize_the_data_X(train_X):
    '''Fit a StandardScaler on train_X and return train_X transformed by it.'''
    return StandardScaler().fit(train_X).transform(train_X)


def normalize_the_data(train_data):
    '''Return train_data L1-normalized along axis 0 (sklearn.preprocessing.normalize).'''
    return normalize(train_data, axis = 0, norm = 'l1')

In [35]:
# Standardize the selected features; toarray() converts the selected-feature
# matrix to a dense array first.
# plot_summary_vectors_standardized = standardize_the_data_X(plot_summary_vectors.toarray())
plot_summary_vectors_standardized = standardize_the_data_X(plot_summary_vectors_new.toarray())


In [36]:


# Row split of the standardized features and the genre labels: the first pair is
# the training part, the second pair the held-out part.
# X_Y_data = get_train_and_label_data(plot_summary_vectors_new.toarray(), test_data)
X_Y_data = get_train_and_label_data(plot_summary_vectors_standardized, test_data)
train_X, train_Y = X_Y_data[0]
test_X, test_Y = X_Y_data[1]

In [27]:
# Label frames as plain NumPy arrays (the feature parts are used as they are).
#train_X_array = train_X.toarray()
train_Y_array = train_Y.values
#test_X_array = test_X.toarray()
test_Y_array = test_Y.values

In [11]:
# The selected (not standardized) features as a dense DataFrame.
plot_summary_vector_df = pd.DataFrame(plot_summary_vectors_new.toarray())

In [12]:


# Same row split as for the standardized data, applied to the unstandardized
# selected features. Note that this rebinds X_Y_data.
# NOTE(review): the *_ann names suggest input for the neural network, but the
# visible train(...) call uses train_X / train_Y_array instead - confirm.
X_Y_data = get_train_and_label_data(plot_summary_vector_df.values, test_data)
train_X_ann, train_Y_ann = X_Y_data[0]
test_X_ann, test_Y_ann = X_Y_data[1]

In [117]:

def get_weights_for_neural_networks(no_of_columns_in_train_x, hidden_neurons, no_of_diff_labels_in_train_Y):
    '''Create the three weight matrices of the network.

    no_of_columns_in_train_x     -- number of input features.
    hidden_neurons               -- sizes of the two hidden layers, [first, second].
    no_of_diff_labels_in_train_Y -- number of output units.

    Each matrix has shape (fan_in, fan_out) and holds uniform random values from
    [0, 1) scaled by sqrt(2 / fan_in), where fan_in is the number of inputs of
    that same layer. Previously the first matrix was left unscaled and the other
    two used the fan-in of the layer before them.

    Returns (weights_1, weights_2, weights_3).
    '''
    layer_sizes = [no_of_columns_in_train_x, hidden_neurons[0], hidden_neurons[1],
                   no_of_diff_labels_in_train_Y]

    # The matrices are drawn in layer order: weights_1 first, weights_3 last.
    weights_1, weights_2, weights_3 = (
        np.random.random((fan_in, fan_out)) * np.sqrt(2 / fan_in)
        for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:])
    )

    return weights_1, weights_2, weights_3


def get_biases_for_neural_networks(no_of_rows, hidden_neurons, no_of_diff_labels_in_train_Y):
    '''Create the three bias arrays of the network, every entry set to 0.1.

    The arrays have no_of_rows rows each and, in order, hidden_neurons[0],
    hidden_neurons[1] and no_of_diff_labels_in_train_Y columns.

    Returns (bias_1, bias_2, bias_3).
    '''
    layer_widths = (hidden_neurons[0], hidden_neurons[1], no_of_diff_labels_in_train_Y)
    bias_1, bias_2, bias_3 = (np.full((no_of_rows, width), 0.1) for width in layer_widths)
    return bias_1, bias_2, bias_3



def ReLu(x):
    '''Return the element-wise maximum of x and 0.

    A new array is returned and x is left untouched. Zeroing the entries of x in
    place would silently overwrite the caller's pre-activation values.
    '''
    return np.maximum(x, 0)
    
def der_ReLu(x):
    '''Return 1.0 where x is strictly positive and 0.0 everywhere else.'''
    is_positive = x > 0
    return is_positive * 1.


def stable_softmax(x):
    '''Return the softmax of x taken along its last axis.

    For a 2-D input (rows = samples) every row is normalized on its own, so each
    row sums to 1; a 1-D input is normalized as a whole. The maximum along the
    same axis is subtracted before exponentiating so that large scores do not
    overflow.
    '''
    x = np.asarray(x, dtype = float)
    shifted_exp = np.exp(x - np.max(x, axis = -1, keepdims = True))
    return shifted_exp / np.sum(shifted_exp, axis = -1, keepdims = True)



    
class Neural_Network:
    '''Network with two hidden layers, evaluated on one batch of data.

    The instance stores the batch, the weights and the biases it is given.
    feed_forward_neural_network() computes the activations and
    backpropagate_neural_network() the derivatives; applying the derivatives to
    the parameters is left to the caller.
    '''

    def __init__(self, train_X, train_Y, hidden_neurons, params):
        '''Store the batch and the parameters.

        params is indexed as [weights_1, weights_2, weights_3, bias_1, bias_2, bias_3].
        '''
        self.training_data, self.training_labels = train_X, train_Y
        self.hidden_neurons = hidden_neurons

        self.weights_1, self.weights_2, self.weights_3 = params[0], params[1], params[2]
        self.bias_1, self.bias_2, self.bias_3 = params[3], params[4], params[5]

    def feed_forward_neural_network(self):
        '''Compute z_1/layer_1, z_2/layer_2, z_3 and output from the stored batch.'''
        self.layer_0 = self.training_data

        # First hidden layer.
        self.z_1 = np.dot(self.layer_0, self.weights_1) + self.bias_1
        self.layer_1 = ReLu(self.z_1)

        # Second hidden layer.
        self.z_2 = np.dot(self.layer_1, self.weights_2) + self.bias_2
        self.layer_2 = ReLu(self.z_2)

        # Output layer.
        self.z_3 = np.dot(self.layer_2, self.weights_3) + self.bias_3
        self.output = stable_softmax(self.z_3)

    def backpropagate_neural_network(self):
        '''Apply the chain rule to get the derivatives of the loss with respect
        to the weights w_1, w_2, w_3 and the biases.

        Requires feed_forward_neural_network() to have been run. The bias
        derivatives are the per-row error terms themselves (not summed over rows).
        '''
        output_error = self.output - self.training_labels
        self.diff_btw_output_labels = output_error

        # Error terms pushed back through the second and the first hidden layer.
        hidden_2_error = np.dot(output_error, self.weights_3.T) * der_ReLu(self.z_2)
        hidden_1_error = np.dot(hidden_2_error, self.weights_2.T) * der_ReLu(self.z_1)

        self.der_weights_3 = np.dot(self.layer_2.T, output_error)
        self.der_weights_2 = np.dot(self.layer_1.T, hidden_2_error)
        self.der_weights_1 = np.dot(self.layer_0.T, hidden_1_error)

        self.der_bias_3 = output_error
        self.der_bias_2 = hidden_2_error
        self.der_bias_1 = hidden_1_error
        
        
    

In [None]:
# Number of rows per mini-batch; passed to train() at the end of this cell.
batch_size = 3500

def train(train_x, train_y, no_of_epochs, hidden_neurons, learning_rate, batch_size):
    '''Train the network with mini-batch gradient descent.

    train_x        -- 2-D feature data, one row per sample.
    train_y        -- 2-D label data with the same number of rows.
    no_of_epochs   -- number of passes over the data.
    hidden_neurons -- sizes of the two hidden layers, [first, second].
    learning_rate  -- step size of every update.
    batch_size     -- number of rows per mini-batch.

    After every epoch the cost of the last processed batch is printed.

    Returns (params, cost, nn): params is the list
    [w1, w2, w3, b1, b2, b3], cost the cost of the last batch, and nn the
    Neural_Network of the last batch (None when no batch was processed).
    '''
    no_of_rows = len(train_x)
    no_of_columns_in_train_x     = np.shape(train_x)[1]
    no_of_diff_labels_in_train_y = np.shape(train_y)[1]

    w1, w2, w3 = get_weights_for_neural_networks(no_of_columns_in_train_x, hidden_neurons, no_of_diff_labels_in_train_y)
    b1, b2, b3 = get_biases_for_neural_networks(batch_size, hidden_neurons, no_of_diff_labels_in_train_y)
    # All updates below happen in place, so this list always refers to the
    # current parameters.
    params = [w1, w2, w3, b1, b2, b3]
    cost = 0
    nn = None

    for epoch in range(0, no_of_epochs):

        for row in range(0, no_of_rows, batch_size):
            batch_x = train_x[row : row + batch_size]
            batch_y = train_y[row : row + batch_size]
            rows_in_batch = len(batch_x)

            # The bias arrays have batch_size rows. Use views limited to the
            # rows of this batch so a shorter final batch still lines up.
            batch_b1 = b1[:rows_in_batch]
            batch_b2 = b2[:rows_in_batch]
            batch_b3 = b3[:rows_in_batch]

            nn = Neural_Network(batch_x, batch_y, hidden_neurons,
                                [w1, w2, w3, batch_b1, batch_b2, batch_b3])
            nn.feed_forward_neural_network()
            nn.backpropagate_neural_network()

            # The der_* values are derivatives of the loss, so stepping against
            # them lowers it; adding them would climb the loss instead.
            w1 -= learning_rate * nn.der_weights_1
            w2 -= learning_rate * nn.der_weights_2
            w3 -= learning_rate * nn.der_weights_3

            # In-place updates on the views write through to b1, b2 and b3.
            batch_b1 -= learning_rate * nn.der_bias_1
            batch_b2 -= learning_rate * nn.der_bias_2
            batch_b3 -= learning_rate * nn.der_bias_3

            cost = np.sum(nn.diff_btw_output_labels ** 2) / (2 * rows_in_batch)

        print('epoch is - ' + str(epoch) + ' loss is - ' + str(cost))
    return params, cost, nn
            
            
# Train for 3 epochs with hidden layers of 200 and 100 units and a learning rate
# of 0.02. Only the first label column is used (the 0:1 slice keeps it 2-D).
parameters, cost, nn = train(train_X, train_Y_array[:, 0:1], 3, [200, 100], 0.02, batch_size)