In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# import cudf

In [7]:
# Load the IMDB top-1000 dataset. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('imdb_top_1000.csv')

# Optional cuDF (GPU) alternative; needs the commented-out `import cudf` above.
# df = cudf.read_csv('imdb_top_1000.csv')

In [8]:
# Keep only the columns used downstream and drop rows with any missing value.
# The steps are chained (no `inplace=True`) and finished with `.copy()` so that
# `df` is an independent frame: adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
#
# NOTE: the original index labels are kept on purpose, so after dropping rows
# the labels have gaps and no longer equal row positions. Anything that indexes
# a positional array with `df` rows must use positions, not labels.
df = (
    df[['Series_Title', 'Genre', 'IMDB_Rating', 'Meta_score', 'Director', 'Star1', 'Star2']]
    .dropna()
    .copy()
)

In [15]:
# Build one space-separated text field per movie (genre, director, two lead
# actors); this is the text the vectoriser consumes later.
df['Combined_Features'] = df['Genre'].str.cat(
    df[['Director', 'Star1', 'Star2']],
    sep=' ',
)

In [16]:
# Preview the first rows to check the selected columns and Combined_Features.
df.head()

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Meta_score,Director,Star1,Star2,Combined_Features
0,The Shawshank Redemption,Drama,9.3,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Drama Frank Darabont Tim Robbins Morgan Freeman
1,The Godfather,"Crime, Drama",9.2,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,"Crime, Drama Francis Ford Coppola Marlon Brand..."
2,The Dark Knight,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,Christian Bale,Heath Ledger,"Action, Crime, Drama Christopher Nolan Christi..."
3,The Godfather: Part II,"Crime, Drama",9.0,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,"Crime, Drama Francis Ford Coppola Al Pacino Ro..."
4,12 Angry Men,"Crime, Drama",9.0,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,"Crime, Drama Sidney Lumet Henry Fonda Lee J. Cobb"


In [17]:
# Bag-of-words token counts over Combined_Features: one row per movie (in df
# row order), one column per distinct token left after English stop-word removal.
# NOTE(review): names are split into single-word tokens, so different people
# sharing a first or last name contribute the same feature — confirm this is
# acceptable for the recommendations.
vectorizer = CountVectorizer(stop_words='english')
feature_vector = vectorizer.fit_transform(df['Combined_Features'])

# cuDF variant: convert the column back to pandas / unicode strings first.
# feature_vector = vectorizer.fit_transform(df['Combined_Features'].to_pandas().values.astype('U'))

# Display the sparse matrix summary (shape and number of stored elements).
feature_vector

<843x2271 sparse matrix of type '<class 'numpy.int64'>'
	with 7327 stored elements in Compressed Sparse Row format>

In [18]:
# Pairwise cosine similarity between all movie vectors. Row/column i refers to
# the i-th row of `feature_vector`, i.e. the i-th row of `df` BY POSITION
# (not by df's index label).
similarities = cosine_similarity(feature_vector)

# Display the resulting square matrix.
similarities

array([[1.        , 0.12598816, 0.12598816, ..., 0.12598816, 0.13363062,
        0.        ],
       [0.12598816, 1.        , 0.22222222, ..., 0.11111111, 0.11785113,
        0.11111111],
       [0.12598816, 0.22222222, 1.        , ..., 0.11111111, 0.11785113,
        0.11111111],
       ...,
       [0.12598816, 0.11111111, 0.11111111, ..., 1.        , 0.23570226,
        0.        ],
       [0.13363062, 0.11785113, 0.11785113, ..., 0.23570226, 1.        ,
        0.23570226],
       [0.        , 0.11111111, 0.11111111, ..., 0.        , 0.23570226,
        1.        ]])

In [44]:
def recommended_movie(movie_title, num_of_recommendations=5):
    """Return the titles most similar to ``movie_title``.

    Uses the module-level ``df`` (movie table) and ``similarities`` (square
    similarity matrix whose row/column order matches the row order of ``df``).

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``df['Series_Title']``. If the title occurs
        more than once, the first occurrence is used.
    num_of_recommendations : int, default 5
        Maximum number of titles to return; values below zero are treated as 0.

    Returns
    -------
    pandas.Series or str
        The recommended titles, most similar first, keeping their original
        ``df`` index labels; or the string ``"Movie is not in my dataset"``
        when the title is not found.
    """
    # `similarities` is addressed by row POSITION, while df's index labels may
    # have gaps (rows were dropped), so resolve the position explicitly instead
    # of using the index label.
    matches = np.flatnonzero((df['Series_Title'] == movie_title).to_numpy())
    if matches.size == 0:
        return "Movie is not in my dataset"
    position = int(matches[0])

    scores = np.asarray(similarities[position])

    # Descending by score; the stable sort keeps ties in row order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the queried movie itself explicitly rather than assuming it
    # sorts first (another row can tie with a similarity of 1.0).
    ranked = ranked[ranked != position]

    top = ranked[:max(num_of_recommendations, 0)]
    return df['Series_Title'].iloc[top]



In [46]:
# Example: the 7 titles most similar to "The Godfather".
print(recommended_movie('The Godfather', 7))

3       The Godfather: Part II
974    The Godfather: Part III
74              Apocalypse Now
108                   Scarface
164                       Heat
305          On the Waterfront
416          Dog Day Afternoon
Name: Series_Title, dtype: object
