In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing, neighbors
from itertools import chain

In [2]:
df_init = pd.read_csv('E:\\greenfox\\ProjectPhase\\omdb_clean.csv')

In [3]:
df_init['Runtime'].fillna(df_init['Runtime'].mean(), inplace=True)

In [4]:
df_init['Oscar_winner'] = df_init['Oscar_winner'].str.extract(r'(\d+)')

In [5]:
df_init['Oscar_winner'].fillna(0, inplace= True)

In [6]:
df_init['Release_year'] = df_init['Released'].str[-4:]

In [7]:
df_init['Release_year'] = df_init['Release_year'].astype(str)

In [8]:
def date_convert(row):
    if '.' in row:
        date_slice = row[-2:]
        date_int = int(date_slice)
        if date_int < 20:
            date_proper = '20' + str(date_slice)
            row = date_proper
        else:
            date_proper = '19' + str(date_slice)
            row = date_proper
        return row
    else:
        return row

In [9]:
df_init['Release_year'] = df_init['Release_year'].map(date_convert)

In [10]:
df_init['Genre'].fillna('', inplace = True)

In [11]:
splitted_list = []
for item in df_init['Genre']:
    splitted_genres = item.split(', ')
    splitted_list.append(splitted_genres)

genres = (list(set(chain(*splitted_list))))

In [12]:
for genre in genres:
    genre_type = genre
    confirm_list = []
    for movie_genres in df_init['Genre'].values:
        if genre_type in movie_genres:
            confirm_list.append(1)
        else:
            confirm_list.append(0)
    df_init[genre_type] = confirm_list

In [13]:
min_max_scaler = preprocessing.MinMaxScaler()

# Normalizing runtime column

In [14]:
runtime = df_init[['Runtime']].values.astype(float)

In [15]:
runtime_norm = min_max_scaler.fit_transform(runtime)

# Normalizing Oscar wins column

In [16]:
oscars = df_init[['Oscar_winner']].values.astype(float)

In [17]:
oscars_norm = min_max_scaler.fit_transform(oscars)

# Normalizing imdb votes column

In [18]:
imdbvotes = df_init[['imdbVotes']].values.astype(float)

In [19]:
imdbvotes_norm = min_max_scaler.fit_transform(imdbvotes)

# Normalizing boxoffice column 

In [20]:
boxoffice = df_init[['BoxOffice']].values.astype(float)

In [21]:
boxoffice_norm = min_max_scaler.fit_transform(boxoffice)

# Normalizing release year column

In [22]:
release_year = df_init[['Release_year']].values.astype(float)

In [23]:
release_year_norm = min_max_scaler.fit_transform(release_year)

# Create dataframe from normalized columns

In [24]:
df_normalized = pd.DataFrame({'Oscars': [elem[0] for elem in oscars_norm],
                              'Runtime': [elem[0] for elem in runtime_norm],
                              'Boxoffice': [elem[0] for elem in boxoffice_norm],
                              'IMDB_votes': [elem[0] for elem in imdbvotes_norm],
                              'Release_year': [elem[0] for elem in release_year_norm],})

In [25]:
df_normalized['IMDB_rating'] = df_init['imdb_Rating']
df_normalized['Rotten_rating'] = df_init['RottenTomatoes_Rating']
df_normalized['Metacritic_rating'] = df_init['Metacritic_Rating']
df_normalized[genres] = df_init[genres] * 0.7
df_normalized['Title'] = df_init['Title']
# df_normalized['Rated'] = df_init['Rated']

In [26]:
df_normalized['IMDB_rating'].mean()
df_normalized['IMDB_rating'].fillna(df_normalized['IMDB_rating'].mean(), inplace = True)

In [27]:
df_normalized['IMDB_votes'].fillna(df_normalized['IMDB_votes'].mean(), inplace = True)
df_normalized['Rotten_rating'].fillna(df_normalized['Rotten_rating'].mean(), inplace = True)
df_normalized['Metacritic_rating'].fillna(df_normalized['Metacritic_rating'].mean(), inplace = True)
df_normalized['Release_year'].fillna(df_normalized['Release_year'].mean() , inplace = True)
df_normalized['Boxoffice'].fillna(df_normalized['Boxoffice'].mean(), inplace = True)
# df_normalized['Rated'].fillna('R2', inplace = True)
# df_normalized['Rated'].replace(r'U(NRATED|nrated)', r'NOT RATED', inplace = True, regex = True)
# df_normalized['Rated'].replace(r'Not (r|R)ated', r'NOT RATED', inplace = True, regex = True)
# df_normalized['Rated'].replace('Approved', 'APPROVED', inplace = True)
# df_normalized['Rated'].replace('Passed', 'PASSED', inplace = True)

# Create model to find nearest neighbours

In [28]:
df_normalized.head()

Unnamed: 0,Oscars,Runtime,Boxoffice,IMDB_votes,Release_year,IMDB_rating,Rotten_rating,Metacritic_rating,Unnamed: 9,Fantasy,...,Action,Romance,Reality-TV,Drama,Mystery,Animation,Sci-Fi,Sport,Crime,Title
0,0.0,0.142568,0.029992,1.0,0.827586,0.693716,0.91,0.8,0.7,0.0,...,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.0,The Shawshank Redemption
1,0.181818,0.152679,0.569381,0.980305,0.924138,0.9,0.94,0.84,0.7,0.0,...,0.7,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.7,The Dark Knight
2,0.363636,0.148635,0.312354,0.8698,0.937931,0.693716,0.86,0.74,0.7,0.0,...,0.7,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.0,Inception
3,0.0,0.139535,0.029992,0.80042,0.862069,0.693716,0.79,0.66,0.7,0.0,...,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.0,Fight Club
4,0.090909,0.154702,0.029992,0.777017,0.827586,0.693716,0.94,0.94,0.7,0.0,...,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.7,Pulp Fiction


In [29]:
movie_title = 'Fight Club'
movie_index = df_normalized.loc[df_normalized['Title'] == movie_title].index[0]

In [30]:
X = np.array(df_normalized.drop(['Title'],1))

In [31]:
nbrs = neighbors.NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(X)

In [32]:
nearest = nbrs.kneighbors(X[movie_index].reshape(1, -1), return_distance=False)

# Recommendations 

In [33]:
print(f"You searched for the movie: {df_init.iloc[nearest.flat[0]]['Title']}")
print('')
print('Recommendations:')
for index in nearest.flat[1:]:
    title = df_init.iloc[index]['Title']
    print(f"{index} - {df_init.iloc[index]['Title']}")

You searched for the movie: Fight Club

Recommendations:
0 - The Shawshank Redemption
82 - Requiem for a Dream
89 - Gran Torino
132 - Trainspotting
33 - American Beauty
253 - House of Cards
298 - The Help
54 - One Flew Over the Cuckoo's Nest
221 - There Will Be Blood
532 - Babel
319 - Lost in Translation
542 - Magnolia
241 - Rain Man
668 - Scent of a Woman
856 - What's Eating Gilbert Grape
644 - The Hunt
1089 - 25th Hour
1013 - Shame
723 - Boogie Nights
