# Nearest Neighbor item based Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_df = pd.read_csv('movies.csv',usecols=['movieId','title'])
rating_df=pd.read_csv('ratings.csv',usecols=['userId', 'movieId', 'rating'])

In [3]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
df = pd.merge(rating_df,movies_df,on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [6]:
combine_movie_rating = df.dropna(axis = 0, subset = ['title'])
combine_movie_rating.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


In [7]:
print(len(df))
print(len(combine_movie_rating))

100836
100836


In [8]:
movie_ratingCount = (combine_movie_rating.
     groupby(by = ['title'])['rating'].
     count().reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
    )
movie_ratingCount.head(10)

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
5,'Tis the Season for Love (2015),1
6,"'burbs, The (1989)",17
7,'night Mother (1986),1
8,(500) Days of Summer (2009),42
9,*batteries not included (1987),7


In [9]:
rating_with_totalRatingCount = combine_movie_rating.merge(movie_ratingCount, how= 'inner', on='title')
rating_with_totalRatingCount = rating_with_totalRatingCount.sort_values('totalRatingCount', ascending= False)
rating_with_totalRatingCount.head(1000)


Unnamed: 0,userId,movieId,rating,title,totalRatingCount
2562,251,356,5.0,Forrest Gump (1994),329
2652,430,356,3.0,Forrest Gump (1994),329
2650,426,356,5.0,Forrest Gump (1994),329
2649,425,356,5.0,Forrest Gump (1994),329
2648,423,356,5.0,Forrest Gump (1994),329
...,...,...,...,...,...
4456,329,593,2.0,"Silence of the Lambs, The (1991)",279
4454,325,593,5.0,"Silence of the Lambs, The (1991)",279
4463,339,593,5.0,"Silence of the Lambs, The (1991)",279
4453,321,593,4.0,"Silence of the Lambs, The (1991)",279


In [11]:
movie_features_df=rating_with_totalRatingCount.pivot_table(index='title',columns='userId',values='rating').fillna(0)
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Implementing KNN Algorithm for getting the Most Similar Movies based on Collaborative Filtering.

In [12]:
from scipy.sparse import csr_matrix

movie_features_df_matrix = csr_matrix(movie_features_df.values)

from sklearn.neighbors import NearestNeighbors


model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [13]:
movie_features_df.shape

(9719, 610)

### Basic Execution:

In [14]:
query_index = np.random.choice(movie_features_df.shape[0])
print(query_index)
distances, indices = model_knn.kneighbors(movie_features_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 6)

5108


In [15]:
movie_features_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(movie_features_df.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, movie_features_df.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Little Ashes (2008):

1: Little Ashes (2008), with distance of 0.0:
2: Cold Creek Manor (2003), with distance of 0.0:
3: Cheaper by the Dozen (1950), with distance of 0.06367082243095545:
4: Young Victoria, The (2009), with distance of 0.2525906813163403:
5: Letters to Juliet (2010), with distance of 0.2603997383663612:


### Advanced Execution:

In [17]:
from fuzzywuzzy import process
# For matching with the nearest input of our Target Movie(entered by the user) to get its most relevant search even when
# the precise movie name is not present in our dataset. ==> Using Fuzzy Logic



In [38]:
def Recommender(movie_name, data,model, n_recommendations ):
    model.fit(data)
    idx=process.extractOne(movie_name, movies_df['title'])[2]
    distances, indices=model.kneighbors(data[idx], n_neighbors=n_recommendations)
    for i in range(0, len(distances.flatten())):
        if i == 0:
            print('Movie Selected: ',movies_df['title'][idx])
            print('Searching for recommendations.....')
        else:
            print('{0}: {1}'.format(i, movie_features_df.index[indices.flatten()[i]]))
    
    

In [41]:
Recommender('star wars', movie_features_df_matrix , model_knn,20)
# let's say we take Top 20 recommended movies which are highly correlated to our Target Movie.

Movie Selected:  Star Wars: Episode IV - A New Hope (1977)
Searching for recommendations.....
1: Crocodile Dundee (1986)
2: Hook (1991)
3: Arthur (1981)
4: Beetlejuice (1988)
5: Willow (1988)
6: Austin Powers: International Man of Mystery (1997)
7: Cocoon: The Return (1988)
8: Labyrinth (1986)
9: Little Shop of Horrors (1986)
10: Batman & Robin (1997)
11: Addams Family Values (1993)
12: George of the Jungle (1997)
13: One Hour Photo (2002)
14: Lemony Snicket's A Series of Unfortunate Events (2004)
15: Peter Pan (1953)
16: Popeye (1980)
17: Fisher King, The (1991)
18: Tremors (1990)
19: Animal House (1978)
