In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_df = pd.read_csv('movies.csv', usecols = ['movieId', 'title'])
ratings_df = pd.read_csv('ratings.csv', usecols = ['userId', 'movieId', 'rating'])

In [3]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
df = pd.merge(movies_df,ratings_df, on = 'movieId')
df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [6]:
df.describe()

Unnamed: 0,movieId,userId,rating
count,100836.0,100836.0,100836.0
mean,19435.295718,326.127564,3.501557
std,35530.987199,182.618491,1.042529
min,1.0,1.0,0.5
25%,1199.0,177.0,3.0
50%,2991.0,325.0,3.5
75%,8122.0,477.0,4.0
max,193609.0,610.0,5.0


In [7]:
df.shape

(100836, 4)

In [8]:
#drop missing values (NaN)
#ROWS (AXIS = 0), COLUMNS (AXIS = 1)
#subset = title (which says drop missing values in title column)

combine_movie_rating = df.dropna(axis = 0, subset = ['title'])
combine_movie_rating

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5
...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),184,4.0
100832,193583,No Game No Life: Zero (2017),184,3.5
100833,193585,Flint (2017),184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),184,3.5


In [9]:
# Pandas dataframe.groupby() function is used to split the data into groups based on some criteria.
movie_ratingCount = (combine_movie_rating.
                     groupby(by = ['title'])['rating'].count().reset_index().rename(columns = {'rating': 'totalRatingCount'})
                    )
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [10]:
rating_with_totalRatingCount = pd.merge(combine_movie_rating,movie_ratingCount, on = 'title')
rating_with_totalRatingCount.head()

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [11]:
# Query the columns of a DataFrame with a boolean expression.
popularity_threshold = 50
rating_popular_movie= rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_movie.count()

movieId             41362
title               41362
userId              41362
rating              41362
totalRatingCount    41362
dtype: int64

In [12]:
rating_popular_movie.shape

(41362, 5)

In [13]:
## First lets create a Pivot matrix

movie_features_df=rating_popular_movie.pivot_table(index='title',columns='userId',values='rating').fillna(0)
movie_features_df.tail()

# The pivot_table() function is used to create a spreadsheet-style pivot table as a DataFrame. 

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
X2: X-Men United (2003),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0
You've Got Mail (1998),0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
Young Frankenstein (1974),5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0
Zombieland (2009),0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5
Zoolander (2001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0


In [14]:
movie_features_df.shape

(450, 606)

In [15]:
movie_features_df.values

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 3. , 0. , 4.5],
       ...,
       [5. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 3. , 0. , ..., 0. , 0. , 3.5],
       [0. , 0. , 0. , ..., 3. , 0. , 4. ]])

In [16]:
from scipy.sparse import csr_matrix

movie_features_df_matrix = csr_matrix(movie_features_df.values)
movie_features_df_matrix
movie_features_df_matrix.shape

(450, 606)

In [17]:
from scipy.sparse import csr_matrix

movie_features_df_matrix = csr_matrix(movie_features_df.values)

from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_features_df_matrix)

# Brute Force Algorithm: 
# This is the most basic and simplest type of algorithm. A Brute Force Algorithm is the straightforward approach to a problem 
# i.e., the first approach that comes to our mind on seeing the problem. More technically it is just like iterating every 
# possibility available to solve that problem.
# Example: 
# If there is a lock of 4-digit PIN. The digits to be chosen from 0-9 then the brute force will be trying all possible 
# combinations one by one like 0001, 0002, 0003, 0004, and so on until we get the right PIN. In the worst case, it will take 
# 10,000 tries to find the right combination.

NearestNeighbors(algorithm='brute', metric='cosine')

In [18]:
movie_features_df.shape

(450, 606)

In [19]:
movie_features_df.shape[0]

450

In [20]:
query_index = np.random.choice(movie_features_df.shape[0])
print(query_index)

# Generate a random sample from np.arange(5) of size 3:
# np.random.choice(5, 3)
# array([0, 3, 4]) # random

155


In [21]:
#Slicing Rows in dataframe
movie_features_df.iloc[query_index,:]


#Example
# # Slicing columnss in data frame
# df1 = df.iloc[:, 0:2]
# # data frame after slicing
# df1

userId
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
      ... 
606    4.0
607    0.0
608    4.0
609    0.0
610    4.0
Name: Finding Nemo (2003), Length: 606, dtype: float64

In [22]:
movie_features_df.iloc[query_index,:].values.reshape(1, -1)

array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. ,
        0. , 3. , 0. , 0. , 4. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 4. , 0. , 0. , 0. , 5. , 0. , 0. , 0. , 4. , 3. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 5. , 0. , 4.5,
        4. , 4.5, 0. , 0. , 0. , 0. , 4.5, 0. , 0. , 0.5, 0. , 0. , 0. ,
        0. , 0. , 3.5, 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. ,
        0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 3.5, 4. , 3.5,
        0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 3. , 0. , 0. , 0. , 0. ,
        4. , 0. , 0. , 4.5, 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
        3. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 4. , 4.5, 0. , 4. , 5. ,
        0. , 0. , 0. , 3. , 0. , 0. , 0. , 0. , 1. , 0. , 0. , 4. , 0. ,
        0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 4.5, 3.5, 0. , 5. , 0. ,
        0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. 

In [23]:
distances, indices = model_knn.kneighbors(movie_features_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)
distances, indices

(array([[5.55111512e-16, 2.73625754e-01, 2.98565279e-01, 3.02660433e-01,
         3.47734600e-01, 3.74925995e-01]]),
 array([[155, 213, 352, 275, 308, 353]], dtype=int64))

In [24]:
distances.flatten()

array([5.55111512e-16, 2.73625754e-01, 2.98565279e-01, 3.02660433e-01,
       3.47734600e-01, 3.74925995e-01])

In [25]:
indices

array([[155, 213, 352, 275, 308, 353]], dtype=int64)

In [26]:
len(distances.flatten())

6

In [27]:
movie_features_df.index

Index(['10 Things I Hate About You (1999)', '12 Angry Men (1957)',
       '2001: A Space Odyssey (1968)', '28 Days Later (2002)', '300 (2007)',
       '40-Year-Old Virgin, The (2005)', 'A.I. Artificial Intelligence (2001)',
       'Abyss, The (1989)', 'Ace Ventura: Pet Detective (1994)',
       'Ace Ventura: When Nature Calls (1995)',
       ...
       'Willy Wonka & the Chocolate Factory (1971)',
       'Wizard of Oz, The (1939)', 'Wolf of Wall Street, The (2013)',
       'X-Men (2000)', 'X-Men: The Last Stand (2006)',
       'X2: X-Men United (2003)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Zombieland (2009)', 'Zoolander (2001)'],
      dtype='object', name='title', length=450)

In [28]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, movie_features_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Finding Nemo (2003):

1: Incredibles, The (2004), with distance of 0.2736257535426838:
2: Shrek (2001), with distance of 0.2985652788100034:
3: Monsters, Inc. (2001), with distance of 0.3026604331933861:
4: Pirates of the Caribbean: The Curse of the Black Pearl (2003), with distance of 0.3477346004344566:
5: Shrek 2 (2004), with distance of 0.3749259945672617:
