In [None]:
import pandas as pd
import numpy as np
import os
import glob 
import csv
import ast
import nltk
import string
import re


from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score


# NLTK data used further down:
#   - 'stopwords' : English stop-word list
#   - 'wordnet'   : dictionary behind WordNetLemmatizer
#   - 'punkt'     : tokenizer models that nltk.word_tokenize() loads; without
#                   them the first tokenize call raises LookupError
#   - 'punkt_tab' : name recent NLTK releases look up instead of 'punkt'
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)


from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

### DATA PRE-PROCESSING

In [None]:


#####################################################################  ######

def read_Movie_Id_Name_Genre_File(filePath):
    '''Read the tab-separated movie metadata file into a DataFrame.

    For every row the movie id is taken from the first column, the movie name
    from the third column and the genre dictionary string from the last column.

    Parameters
    ----------
    filePath : str
        Path of the tab-separated metadata file.

    Returns
    -------
    pandas.DataFrame
        One row per movie with the columns 'movie_Id', 'movie_Name' and one
        column per recognised genre (1 where the movie has that genre, NaN
        elsewhere because each row only carries keys for its own genres).
    '''
    movie_id_name_genre_dicts_list = []

    # Explicit encoding so the result does not depend on the platform default
    # (the file is expected to be UTF-8); newline='' is what the csv module
    # asks for when it is handed a file object.
    with open(filePath, encoding='utf-8', newline='') as file:
        movieFile = csv.reader(file, delimiter='\t')
        for row in movieFile:
            if not row:
                # blank line - nothing to index into
                continue

            movie_Id = row[0]
            movie_Name = row[2]
            genreListOfTheMovie = getTheGenresFromString(row[-1])
            movie_id_name_genre_dicts_list.append(
                get_the_genres_dict_of_a_movie(movie_Id, movie_Name, genreListOfTheMovie))

    return pd.DataFrame(movie_id_name_genre_dicts_list)


####################

def getTheGenresFromString(genreString):
    '''Turn the textual genre dictionary of a movie into a list of genre names.

    genreString is the text of a dict literal such as
    '{"/m/01jfsb": "Thriller", "/m/07s9rl0": "Drama"}'; the keys are ignored
    and the values are returned in the order they appear.
    '''
    # ast.literal_eval() evaluates the literal safely and hands back a real dict
    parsedGenres = ast.literal_eval(genreString)
    return list(parsedGenres.values())


#####################

def get_the_genres_dict_of_a_movie(movie_Id, movie_Name, movie_genres_list):
    '''Build the record of one movie: id, name and a 1 for every primary genre.

    The input data has compound genres such as "Action/Adventure" or
    "Romantic comedy". Each genre is split on whitespace and on "/" so its
    parts can be matched against the primary genres. The unsplit genre is
    offered as a candidate too, otherwise a multi-word primary genre such as
    "Science Fiction" could never match once it has been broken into words.

    Parameters
    ----------
    movie_Id, movie_Name : values stored under 'movie_Id' / 'movie_Name'.
    movie_genres_list : list of str
        Raw genre names of the movie.

    Returns
    -------
    dict
        {'movie_Id': ..., 'movie_Name': ..., <genre>: 1, ...}
    '''
    movie_id_name_genre_dict = {'movie_Id' : movie_Id, 'movie_Name' : movie_Name}
    genres = []

    for every_genre in movie_genres_list:
        # split on both separators at once, so a genre that contains a space
        # *and* a slash is fully broken up
        genre_parts = every_genre.replace("/", " ").split()
        if every_genre not in genre_parts:
            # compound genre: keep the full name as a candidate as well
            genres.append(every_genre)
        genres.extend(genre_parts)

    # create_genres_dict() discards every candidate that is not a primary genre
    genres_list = create_genres_dict(genres)

    return add_genre_tuples_to_the_dict(movie_id_name_genre_dict, genres_list)



#####################

def create_genres_dict(genres):
    '''Filter a list of genre names down to the primary genres.

    Despite the name, the result is a list: the entries of `genres` that are
    primary genres, in their original order (duplicates are kept). The match
    is exact and case-sensitive.
    '''
    primary_genres = ['Action', 'Adventure', 'Animation', 'Biopic', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-noir',
         'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Prison', 'Reality', 'Romance', 'Science Fiction', 'Sport', 'Thriller', 'War', 'Western']

    return [candidate for candidate in genres if candidate in primary_genres]


######################
    
def add_genre_tuples_to_the_dict(movie_id_name_genre_dict, genres_list):
    '''Flag every genre of genres_list with a 1 in the given movie dictionary.

    The dictionary is updated in place and the same object is returned.
    '''
    movie_id_name_genre_dict.update(dict.fromkeys(genres_list, 1))
    return movie_id_name_genre_dict





############################################################################################

def read_movie_plot_summary_file(filePath):
    '''Read the tab-separated plot summary file into a DataFrame.

    The first column of each row is the movie id and the second the plot
    summary. The summary is tokenised/lemmatised, duplicate tokens are removed
    and the remaining tokens are joined with single spaces.

    Parameters
    ----------
    filePath : str
        Path of the tab-separated plot summary file.

    Returns
    -------
    pandas.DataFrame
        Columns 'movie_Id' and 'plot_summary_tokens' (one row per summary).
    '''
    movie_id_plot_summary_dicts_list = []

    # Explicit encoding so the result does not depend on the platform default
    # (the file is expected to be UTF-8).
    with open(filePath, encoding='utf-8', newline='') as file:
        # QUOTE_NONE: the summaries are free text, so a '"' is just a character.
        # With the default quoting a summary that starts with '"' is read as a
        # quoted field and swallows tabs/newlines up to the next quote. Leftover
        # quote characters are stripped later together with other punctuation.
        moviePlotSummaryFile = csv.reader(file, delimiter = '\t', quoting = csv.QUOTE_NONE)

        for row in moviePlotSummaryFile:
            if len(row) < 2:
                # blank line or row without a summary column
                continue

            # dict.fromkeys() de-duplicates like set() but keeps first-seen
            # order, so the token string is identical from run to run.
            lemmatizedSummary = list(dict.fromkeys(lemmatizeAndRemoveStopWordsFromTheSummary(row[1])))

            movie_id_plot_summary_dicts_list.append({
                'movie_Id': row[0],
                # list of words -> single whitespace-separated string
                'plot_summary_tokens': " ".join(lemmatizedSummary),
            })

    return pd.DataFrame(movie_id_plot_summary_dicts_list)




#####################

def lemmatizeAndRemoveStopWordsFromTheSummary(plotSummary):
    '''Tokenise a plot summary and return its cleaned, lemmatised words.

    The summary is lower-cased and tokenised with nltk.word_tokenize(). English
    stop words and tokens of at most two characters are skipped; the remaining
    tokens are passed through remove_punctuation_marks() and lemmatised as
    verbs (pos="v"). Tokens that are empty after cleaning (e.g. "..." or a
    pure number) are dropped instead of being returned as "".

    Parameters
    ----------
    plotSummary : str

    Returns
    -------
    list of str
        Lemmatised words in order of appearance (duplicates included).
    '''
    wordLemmatizer = WordNetLemmatizer()
    stopWords = set(stopwords.words('english'))

    modified_summary = []
    for word in nltk.word_tokenize(plotSummary.lower()):
        if word in stopWords or len(word) <= 2:
            continue

        cleanedWord = remove_punctuation_marks(word)
        if not cleanedWord:
            # nothing but punctuation/digits - would only add an empty token
            continue

        modified_summary.append(wordLemmatizer.lemmatize(cleanedWord, pos = "v"))

    return modified_summary


#######################
# Raw strings: "\[", "\]" and "\s" are regex escapes, not Python string escapes
# (in a normal string literal they are invalid escape sequences and trigger a
# warning on recent Python versions).
_PUNCTUATION_OR_SPACE_RE = re.compile(r"""[!"#$%&'()*+,\-./:;<=>?@\[\]^_{|}~`\s]""")
_TRAILING_DIGITS_RE = re.compile(r'[0-9]+$')


def remove_punctuation_marks(word):
    '''Strip punctuation, whitespace and trailing digits from a word.

    Every punctuation mark of the character class above and every whitespace
    character is removed wherever it occurs; afterwards a run of digits at the
    end of the word is removed. Digits elsewhere and the backslash are kept.
    '''
    without_punctuation = _PUNCTUATION_OR_SPACE_RE.sub('', word)
    return _TRAILING_DIGITS_RE.sub('', without_punctuation)




###################################################################
###################################################################

def merge_the_movieIdNameGenre_plot_Dfs(movie_Id_Name_Genre_Df, movie_plot_Df):
    '''Join the genre frame and the plot frame on their shared 'movie_Id' column.

    Uses the default (inner) join, so only movies present in both frames are kept.
    '''
    merged_df = movie_Id_Name_Genre_Df.merge(movie_plot_Df, on = 'movie_Id')
    return merged_df


####################################################################

def fill_NaNs_with_Zeros(movie_id_name_genre_plot_df):
    '''Return a copy of the frame in which every missing value is replaced by 0.'''
    zero_filled_df = movie_id_name_genre_plot_df.fillna(value = 0)
    return zero_filled_df



###################################################################
def get_transformed_train_test_data(movie_id_name_genre_plot_df):
    '''Split the merged frame into the model input and the genre label columns.

    Returns a two-element list:
      [0] the 'plot_summary_tokens' column (Series)
      [1] a DataFrame with every column except 'plot_summary_tokens',
          'movie_Id' and 'movie_Name', in their original order
    '''
    non_label_columns = ('plot_summary_tokens', 'movie_Id', 'movie_Name')

    label_columns = []
    for column in movie_id_name_genre_plot_df.columns:
        if column not in non_label_columns:
            label_columns.append(column)

    plot_tokens = movie_id_name_genre_plot_df['plot_summary_tokens']
    genre_labels = movie_id_name_genre_plot_df[label_columns]
    return [plot_tokens, genre_labels]


#####################################################################

def get_train_and_label_data(train_data, test_data, train_size = 35000):
    '''Split features and labels into a leading training part and the rest.

    Parameters
    ----------
    train_data : sliceable sequence / array
        Feature rows; split with plain positional slicing.
    test_data : pandas.DataFrame
        Label rows, in the same order as train_data.
    train_size : int, default 35000
        Number of leading rows that form the training part.

    Returns
    -------
    list
        [[train_X, train_Y], [test_X, test_Y]]
    '''
    train_X = train_data[:train_size]
    test_X = train_data[train_size:]

    # iloc (position) rather than loc (label): the features are sliced by
    # position, so the labels must be too - otherwise the two parts no longer
    # line up as soon as the frame's index is not 0..n-1 (e.g. after rows were
    # dropped).
    train_Y = test_data.iloc[:train_size]
    test_Y = test_data.iloc[train_size:]

    return [[train_X, train_Y], [test_X, test_Y]]



#####################################################################

def remove_rows_in_df_with_0_genre_tags(movie_id_name_genre_df):
    '''Return the rows of the frame that carry at least one genre tag.

    Every column other than 'movie_Id', 'movie_Name' and 'plot_summary_tokens'
    is treated as a genre column; a row is kept when the sum of its genre
    columns is non-zero.

    Parameters
    ----------
    movie_id_name_genre_df : pandas.DataFrame
        Frame with numeric genre columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without the zero-genre rows, re-indexed 0..n-1 so that later
        positional and label-based slicing agree.
    '''
    non_genre_columns = ('movie_Id', 'movie_Name', 'plot_summary_tokens')
    genre_columns = [column for column in movie_id_name_genre_df.columns
                     if column not in non_genre_columns]

    genre_tag_counts = movie_id_name_genre_df[genre_columns].sum(axis = 1)
    return movie_id_name_genre_df.loc[genre_tag_counts != 0].reset_index(drop = True)





In [None]:
# Folder that holds the dataset files. Set the MOVIE_SUMMARIES_DIR environment
# variable to run the notebook on another machine; the fallback is the original
# location.
MOVIE_SUMMARIES_DIR = os.environ.get('MOVIE_SUMMARIES_DIR', '/Users/vijay/Downloads/Datasets/MovieSummaries')

movie_id_name_genre_df = read_Movie_Id_Name_Genre_File(os.path.join(MOVIE_SUMMARIES_DIR, 'movie.metadata.tsv'))
movie_id_plot_summary_df = read_movie_plot_summary_file(os.path.join(MOVIE_SUMMARIES_DIR, 'plot_summaries.txt'))
movie_id_name_genre_plot_df = merge_the_movieIdNameGenre_plot_Dfs(movie_id_name_genre_df, movie_id_plot_summary_df)

movie_id_name_genre_plot_df_zeros = fill_NaNs_with_Zeros(movie_id_name_genre_plot_df)
movie_id_name_genre_plot_df_zeros = remove_rows_in_df_with_0_genre_tags(movie_id_name_genre_plot_df_zeros)

In [None]:
# train_data -> the plot token strings (model input)
# test_data  -> the genre indicator columns (labels), despite its name
train_test_data = get_transformed_train_test_data(movie_id_name_genre_plot_df_zeros)
train_data, test_data = train_test_data[0], train_test_data[1]


### FINDING AND UPDATING PLOT WITH MOST REPEATED WORDS

In [None]:

from collections import Counter

'''
Out of all the words in plot_summary_tokens, only those that occur at least 15
times across all the plot summaries are kept for the BoW representation.
'''

def get_summary_as_dictionary_of_words(summary):
    '''Count how often each element of `summary` occurs.

    Returns a collections.Counter. Note that a plain string is counted
    character by character; pass a list of words to count words.
    '''
    occurrences = Counter()
    occurrences.update(summary)
    return occurrences
    
    
def get_most_repeated_words_from_plot_summary_counter(plot_summary_counter, min_count = 15):
    '''Return the words whose count reaches the minimum.

    Parameters
    ----------
    plot_summary_counter : mapping of word -> count (e.g. collections.Counter)
    min_count : int, default 15
        A word is kept when its count is greater than or equal to this value.

    Returns
    -------
    list of str
        Qualifying words in the iteration order of the mapping.
    '''
    return [word for word, count in plot_summary_counter.items() if count >= min_count]



def get_bag_of_words_representation_of_all_plots(plot_summary_tokens_series):
    '''Collect the words that occur often enough across all plot summaries.

    Every summary (a whitespace-separated token string) is split into words and
    the words are tallied in one Counter; the words that pass the threshold of
    get_most_repeated_words_from_plot_summary_counter() are returned.

    Parameters
    ----------
    plot_summary_tokens_series : sequence of str (e.g. pandas.Series)

    Returns
    -------
    list of str
        The frequent words; an empty list for empty input.
    '''
    plot_summary_counter = Counter()

    # Every summary - including the first one - is split into words before it
    # is counted; an unsplit string would be tallied character by character.
    # update() accumulates in place instead of building a new Counter per row.
    for plot_summary_tokens in np.array(plot_summary_tokens_series):
        plot_summary_counter.update(plot_summary_tokens.split())

    return get_most_repeated_words_from_plot_summary_counter(plot_summary_counter)

    
    


def remove_non_most_repeated_words_from_summary(most_repeated_words, plot_summary_tokens_series):
    '''Drop every word that is not in most_repeated_words from each summary.

    Parameters
    ----------
    most_repeated_words : iterable of str
        Words to keep.
    plot_summary_tokens_series : sequence of str (e.g. pandas.Series)
        Whitespace-separated token strings.

    Returns
    -------
    numpy.ndarray
        New array (np.array() copies the input) with one filtered,
        space-joined string per summary, in the original order.
    '''
    # set lookup is O(1) per word; the word list can hold thousands of entries
    words_to_keep = set(most_repeated_words)

    plot_summary_tokens_array = np.array(plot_summary_tokens_series)

    for summary_index in range(len(plot_summary_tokens_array)):
        kept_words = [word for word in plot_summary_tokens_array[summary_index].split()
                      if word in words_to_keep]
        plot_summary_tokens_array[summary_index] = ' '.join(kept_words)

    return plot_summary_tokens_array



In [None]:
# 1) vocabulary of sufficiently frequent words, 2) summaries reduced to that vocabulary
most_repeated_words = get_bag_of_words_representation_of_all_plots(plot_summary_tokens_series = train_data)
plot_summary = remove_non_most_repeated_words_from_summary(
    most_repeated_words = most_repeated_words,
    plot_summary_tokens_series = train_data,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


def tfidf_vectorizer_on_train_and_test_data(plot_summary, max_df = 0.8, max_features = 10000):
    '''Fit a TfidfVectorizer on the given summaries and transform them.

    Parameters
    ----------
    plot_summary : iterable of str
        Whitespace-separated token strings, one per movie.
    max_df : float or int, default 0.8
        Passed to TfidfVectorizer(max_df=...).
    max_features : int or None, default 10000
        Passed to TfidfVectorizer(max_features=...).

    Returns
    -------
    (TfidfVectorizer, sparse matrix)
        The fitted vectorizer and the TF-IDF matrix of plot_summary.

    NOTE(review): the vectorizer is fitted on whatever is passed in. The
    notebook passes all summaries and splits into train/test afterwards, so
    the vocabulary and IDF weights also see the test rows.
    '''
    tfidf_vectorizer = TfidfVectorizer(max_df = max_df, max_features = max_features)
    plot_summary_vectors = tfidf_vectorizer.fit_transform(plot_summary)

    return tfidf_vectorizer, plot_summary_vectors





In [None]:
tfidf_vectorizer, plot_summary_vectors = tfidf_vectorizer_on_train_and_test_data(plot_summary)


# The TF-IDF matrix stays sparse: row slicing works on it directly and the
# estimators fitted below accept sparse input, so a dense copy
# (rows x vocabulary floats) would only cost memory.
X_Y_data = get_train_and_label_data(plot_summary_vectors, test_data)

train_X, train_Y = X_Y_data[0]
test_X, test_Y = X_Y_data[1]
train_Y_array = train_Y.values
test_Y_array = test_Y.values

### OneVsRest + Logistic Regression

In [None]:
# One binary logistic-regression model per genre column
logistic_regressor = LogisticRegression()
OvR_classifier = OneVsRestClassifier(estimator = logistic_regressor)

OvR_classifier.fit(X = train_X, y = train_Y_array)

In [None]:
# A genre is predicted when its probability reaches the threshold
threshold = 0.3
y_pred_probabilites = OvR_classifier.predict_proba(X = test_X)
y_pred_new = np.greater_equal(y_pred_probabilites, threshold).astype(int)
f1_score(y_true = test_Y_array, y_pred = y_pred_new, average = "micro")

### OneVsRest + LinearSVC

In [None]:
from sklearn.svm import LinearSVC

# One linear SVM per genre column; fixed random_state for repeatable fits
linear_svc = LinearSVC(tol = 1e-5, random_state = 0)
OvR_classifier_svc = OneVsRestClassifier(estimator = linear_svc)
OvR_classifier_svc.fit(X = train_X, y = train_Y_array)

In [None]:
# Micro-averaged F1 of the SVM predictions on the held-out rows
y_predicted_svc = OvR_classifier_svc.predict(X = test_X)
f1_score(y_true = test_Y_array, y_pred = y_predicted_svc, average = "micro")