# Movie recommendations based on movie similarities (using unsupervied version of knn)

### Section 1

Obtain the data from the imdb dataset (https://www.imdb.com/interfaces/). Chosen datasets: *title.basics.tsv.gz* and *title.ratings.tsv.gz*. 
Columns used:
- *tconst* used as identification and merge column, later replaced by auto increment pandas index
- *primaryTitle* as movie name
- *genres* to take and modify into multiple columns, one for each genre

Remove movies with no genres. Only select movies.

Resulted data frame: *movie_data*

**No need to run section 1 and 2, just get the final csv file from here: https://drive.google.com/drive/folders/1Mhz8BYiivOi3Ze0KLpSxDuDJUC-_Ac1_?usp=sharing and save it to the data folder**

In [6]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np

# obtain pandas dataframe of movies and their rating from IMDB
movie_general_data = pd.read_table('../data/general_data.tsv', delimiter='\t', usecols=['tconst','titleType', 'primaryTitle','genres'], dtype={'tconst':'string','titleType':'string', 'primaryTitle':'string', 'genres':'string'})
movie_rating_data = pd.read_table('../data/rating_data.tsv', delimiter='\t', usecols=['tconst','averageRating'], dtype={'tconst':'string','averageRating':'float'})
movie_data = pd.merge(movie_general_data, movie_rating_data, on='tconst')

# get only movies
options = ['movie']
movie_data = movie_data[movie_data['titleType'].isin(options)]

#clean data 
#remove titleTtype
movie_data.pop('titleType')

# remove null genres values
null_values = ['\\N']
movie_data = movie_data[movie_data.genres.isin(null_values) == False]

# redefine the index column based on the new data
movie_data.index = [x for x in range(1, len(movie_data.values)+1)]
movie_data.index.name = 'id'

movie_data.pop('tconst')

#preview pandas dataframe 
display(movie_data.head(5))

Unnamed: 0_level_0,primaryTitle,genres,averageRating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Miss Jerry,Romance,5.3
2,The Corbett-Fitzsimmons Fight,"Documentary,News,Sport",5.3
3,The Story of the Kelly Gang,"Action,Adventure,Biography",6.0
4,The Prodigal Son,Drama,4.4
5,Robbery Under Arms,Drama,4.3


### Section 2

In order to use KNN, data needs to be manipulated. Each genre will became its own column and gain a value of either a 0 or a 1. 

In [3]:
def get_csv_data(path, columns = []):
    if columns:
        data = pd.read_csv(path, usecols=columns) 
    else:   
        data = pd.read_csv(path, index_col= 'id') 
    return data 

In [15]:

def add_to_global_csv(name, value):
    try:
        existing_data = get_csv_data('../global/movie_vars.csv',['Name','Value'])

    except FileNotFoundError:
        print('Global var file will be created.')
        existing_data =  pd.DataFrame(columns=['Name', 'Value'])
        
    finally:
        new_data = pd.DataFrame(columns=['Name', 'Value'])

        new_data.at[1, 'Name'] = name
        new_data.at[1, 'Value'] = value

        new_data['Value']=new_data['Value'].astype('object')

        final_data = pd.concat([existing_data, new_data], ignore_index = True )

        final_data.index = [x for x in range(1, len(final_data.values)+1)]
        final_data.index.name = 'Id'

        final_data.to_csv('../global/movie_vars.csv') 
    

In [14]:
def read_from_global_csv(name, type):

    global_vars = pd.read_csv('../global/movie_vars.csv', index_col='Name', usecols = ['Name','Value'])

    if type == list:
        global_vars['Value'] = global_vars['Value'].apply(eval)

    value = global_vars.loc[name]['Value']      
    
    return(value)


In [9]:
# get all uniques genres in a list
genres = movie_data.genres.unique().tolist()

split_genres_list = [item.split(',') for item in genres]

flat_genres_list = [item for l in split_genres_list for item in l]

unique_geners_set = set(flat_genres_list)
unique_geners_list = list(unique_geners_set)


add_to_global_csv("unique_geners_list", unique_geners_list)


# add new columns to the dataset for each genre
for genre in unique_geners_list:
    movie_data[genre] = np.where(movie_data['genres'].str.contains(genre), 1, 0)

# remove genres column
movie_data.pop('genres')

# write the new data into a csv file in the data folder
movie_data.to_csv('../data/modified_movie_data.csv') 

# preview panda dataframe
display(movie_data.head(5))

Global var file will be created.


Unnamed: 0_level_0,primaryTitle,averageRating,Adventure,Film-Noir,War,News,Action,Horror,Family,Musical,...,Talk-Show,Romance,Game-Show,Sci-Fi,Comedy,Documentary,History,Reality-TV,Sport,Drama
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Miss Jerry,5.3,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,The Corbett-Fitzsimmons Fight,5.3,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,The Story of the Kelly Gang,6.0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Prodigal Son,4.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,Robbery Under Arms,4.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Section 3

Split dataset into trainig and testing data, fit the model.

In [283]:
# get relevant columns
columns = ['averageRating'] + unique_geners_list

# get testing data as sample
test_data = movie_data.sample()[columns]
train_data = movie_data[columns]

# construct a NearestNeighbors class from the dataframe
k_neighbours = NearestNeighbors(n_neighbors=5)

# fitting the model 
k_neighbours.fit(train_data)

### Section 4


Function that iterates through a DataFrame columns and removes columns with 0 values

In [17]:
def remove_zero(dataframe, columns = []):
    #make a copy to modify on
    new_data = dataframe.copy()

    for col in columns:
        if ~new_data[col].any():
            new_data.drop(columns = col, inplace=True)

    return new_data

### Section 5

Make a new DataFrame with the results

In [288]:
# get the neighbours ids
neighbours_ids = k_neighbours.kneighbors(test_data)[1][0]

# get the data of the test movie and add it to the results DataFrame
test_data_row  = movie_data.loc[test_data.index]

relevant_test_results = remove_zero(test_data_row, unique_geners_list)
relevant_test_results['Type'] = 'current'

# get the recommendations
rec_data_row = movie_data.iloc[neighbours_ids]

relevant_rec_results = remove_zero(rec_data_row, unique_geners_list)
relevant_rec_results['Type'] = 'recommendation'

# make a final results DataFrame
results = pd.concat([relevant_test_results,relevant_rec_results])

display(results)
    

Unnamed: 0_level_0,primaryTitle,averageRating,Documentary,Type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
99157,Great White Death,5.6,1,current
49848,Dirigenterna,5.6,1,recommendation
63884,The Volcano Man,5.6,1,recommendation
47895,Die Kümmeltürkin geht,5.6,1,recommendation
28511,España insólita,5.6,1,recommendation
44392,Too Early/Too Late,5.6,1,recommendation


## Application
**A small application where the user can choose a movie and receive recommendations.**

### Section 1

Start with the function that will take the id and return the neighbours from the model as the recommendations.(almost the same code as Section 5 above)

!*Run all sections above for the app to work*


In [11]:
def prepare_model( movie_dataset):
    # get the geners list global variable 
    unique_geners_list = read_from_global_csv('unique_geners_list', list)

    columns = ['averageRating'] + unique_geners_list

    # get training data by using only the numeric columns
    train_data = movie_dataset[columns]

    # construct a NearestNeighbors class from the dataframe
    k_neighbours = NearestNeighbors(n_neighbors=5)

    # fitting the model 
    k_neighbours.fit(train_data)
    return (k_neighbours, columns)

In [12]:
def get_recom(id, dataset, k_neighbours, columns):
    
     # get requested data using the id from the input
     req_data = dataset.loc[[id]][columns]

     # get the neighbours ids
     neighbours_ids = k_neighbours.kneighbors(req_data)[1][0]
     #n_neighbors=2

     # get the data of the requested movie and add it to the results DataFrame
     req_data_row  = dataset.loc[[id]]

     genre_list = read_from_global_csv('unique_geners_list', list)

     req_data_row['Type'] = 'requested'

     # get the recommendations
     rec_data_row = dataset.loc[neighbours_ids + 1]

     rec_data_row['Type'] = 'recommendation'

     # make a final results DataFrame
     results = pd.concat([req_data_row, rec_data_row])

     # remove the columns that only contain 0 values
     results = remove_zero(results, genre_list)
     
     return results

In [18]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np


ready = False
retry = False
load = False

movie_dataset = get_csv_data('../data/modified_movie_data.csv' )

max_index = len(movie_dataset)-1

choice = input('Welcome to the movie recommendation app! \nPlease choose a number between 1 and {}, or write 0 to exit.'.format(max_index))
print('You said: {}'.format(choice))

while  choice not in [0, '0']:
# check the validity of the user data
    while not ready:
        if retry:
            choice = input('Please choose a number between 1 and {}, or write 0 to exit.'.format(max_index))
            print('You said: {}'.format(choice))

            retry = False
        try:
            choice = int(choice)
        except ValueError:
            print('Wrong data type. Try again:')
            retry = True
        else:
            if choice == 0:
                break

            elif choice < 0 or choice > len(movie_data)-1:
                print('Wrong number. Try again:')
                retry = True
            else:
                ready = True 
                retry = False 

    if not load:
        (k_neighbours, columns) = prepare_model(movie_dataset)
        load = True

    if ready:
        recom = get_recom(choice, movie_dataset, k_neighbours, columns)

        print('This is your data:')
        display(recom)

        ready = False
        retry = True
    
print('Thanks, bye!')

    

You said: 1
This is your data:


Unnamed: 0_level_0,primaryTitle,averageRating,Romance,Type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Miss Jerry,5.3,1,requested
2715,The Third Degree,5.3,1,recommendation
23602,El amor empieza en sábado,5.3,1,recommendation
1,Miss Jerry,5.3,1,recommendation
3547,Hearts in Exile,5.3,1,recommendation
21084,Ännchen von Tharau,5.3,1,recommendation


You said: 0
Thanks, bye!
