In [1]:
import pandas

rating = pandas.read_table("ratings.csv", sep=",")

rating.iloc[:5]

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [2]:
rating.tail()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [3]:
rating.isnull().sum().sum()

0

In [4]:
total_user = rating["UserID"].nunique()

total_user

610

In [5]:
total_film = rating["MovieID"].nunique()

total_film

9724

In [6]:
rata_rating = rating["Rating"].mean().round(3)

rata_rating

3.502

On Average, Our Users Have Given a Rating of 3.5 out of 5.0

In [7]:
rating_table = rating.pivot_table(index="MovieID", columns="UserID", values="Rating")

SparseRate = lambda table : round((table.isnull().sum().sum() / table.size) * 100, 3)

SparseRate(rating_table)

98.3

Our 98.3 % Sparsity Rate Signaling Many Things.

* Our Table Mostly Consists of 0

* Our Films Haven't been Watched

* Our Films is Watched, but the User Didn't Rate the Film

* One or Two Films is Watch Less than 10 Times or User Only Watch Less Than 50 Movie

Our Solution is to Set a Minimum Threshold to Film - User Interaction

The First Threshold is Film. Our Minimum Threshold for a Film is

*This Film Must be Watched, Minimum of 10 Times*

In [8]:
minimum_film_interaction = 10

WatchFilm = rating.groupby("MovieID")["Rating"].agg(["count", "mean"])

WatchFilm.columns = ["TotalWatch", "Rating"]

WatchFilm = WatchFilm[WatchFilm["TotalWatch"] > minimum_film_interaction]

rating = rating[rating["MovieID"].isin(WatchFilm.index)]

rating["MovieID"].nunique()

2121

The Last Threshold is User. Our Minimum Threshold for a User is

*This User Must be Watch & Rate, Minimum of 50 Films*

In [9]:
minimum_user_interaction = 50

UserWatch = rating.groupby("UserID")[["Rating"]].count()

UserWatch = UserWatch[UserWatch["Rating"] > minimum_user_interaction]

rating = rating[rating["UserID"].isin(UserWatch.index)]

In [10]:
rating_table = rating.pivot_table(index="MovieID", columns="UserID", values="Rating")

SparseRate(rating_table)

90.396

Our Final Sparsity Rate is 90.39 %

In [11]:
rating_table.iloc[:5, :8]

UserID,1,4,6,7,10,11,15,16
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4.0,,,4.5,,,2.5,
2,,,4.0,,,,,
3,4.0,,5.0,,,,,
5,,,5.0,,,,,
6,4.0,,4.0,,,5.0,,


In [12]:
rating_table = rating_table.fillna(0)

rating_table.iloc[:5, :8]

UserID,1,4,6,7,10,11,15,16
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0


In [13]:
rating_table.index.name

'MovieID'

In [14]:
rating_table.columns.name

'UserID'

In [15]:
usecols = ["MovieID", "Title"]

table_film = pandas.read_table("movies.csv", sep=",", usecols=usecols)

rating = rating.join(table_film.set_index("MovieID"), on="MovieID")

rating.iloc[:5]

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title
0,1,1,4.0,964982703,Toy Story (1995)
1,1,3,4.0,964981247,Grumpier Old Men (1995)
2,1,6,4.0,964982224,Heat (1995)
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,964982931,"Usual Suspects, The (1995)"


What is Sparse Matrix ? Sparse Matrix is Matrix that Many of the Value is 0

Usually Sparse Matrix is Mostly Use in Collaborative Method Recommendation System

In [16]:
from scipy.sparse import csr_matrix

rating_matrix = csr_matrix(rating_table.values)

rating_matrix.shape

(2121, 352)

In [17]:
rating_reset = rating_table.reset_index()

rating_reset.shape

(2121, 353)

In [18]:
rating_reset.iloc[:5, :10]

UserID,MovieID,1,4,6,7,10,11,15,16,17
0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5
1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0


In [19]:
rating_reset.columns.name

'UserID'

In [20]:
rating_reset.iloc[:10]

UserID,MovieID,1,4,6,7,10,11,15,16,17,...,600,601,602,603,604,605,606,607,608,610
0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
5,7,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0
6,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,10,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
8,11,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,2.5,3.0,0.0,0.0
9,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(metric="cosine", algorithm="brute")

model.fit(rating_matrix)

## `Main Model`

### 🕵 Challenges

* Find Similar Movie to `Avengers, The (2012)`

* Find Similar Movie to `Guardians of the Galaxy (2014)`

* Find Similar Movie to `Forrest Gump (1994)`

* Find Similar Movie to `Toy Story (1995)`

In [22]:
input_title = "Avengers"

movies_index = table_film[table_film["Title"].str.contains(input_title)]

movies_index = movies_index.iloc[2]['MovieID']

table_film[table_film["MovieID"] == movies_index]

Unnamed: 0,MovieID,Title
7693,89745,"Avengers, The (2012)"


In [23]:
rating[rating["MovieID"] == movies_index].iloc[:8]

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title
1535,15,89745,2.0,1510572842,"Avengers, The (2012)"
2182,18,89745,4.0,1455050365,"Avengers, The (2012)"
3520,21,89745,4.0,1418063440,"Avengers, The (2012)"
7891,52,89745,5.0,1468051351,"Avengers, The (2012)"
9030,62,89745,4.0,1521488914,"Avengers, The (2012)"
9400,63,89745,3.5,1443199946,"Avengers, The (2012)"
11555,68,89745,4.5,1336605252,"Avengers, The (2012)"
11950,73,89745,4.0,1464199263,"Avengers, The (2012)"


In [24]:
movies_index = rating_reset[rating_reset["MovieID"] == movies_index]

movies_index = movies_index.index[0]

movies_index

1961

In [25]:
rating_reset.iloc[1960:1965, :10]

UserID,MovieID,1,4,6,7,10,11,15,16,17
1960,89492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1961,89745,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
1962,89774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1963,89864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1964,89904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
rating[rating["UserID"] == 1961]

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title


In [27]:
total_limit = 10

total_limit += 1

distances, table_indexes = model.kneighbors(rating_matrix[movies_index], n_neighbors=total_limit)

sorter = list(zip(table_indexes.flatten().tolist(), distances.flatten().tolist()))

table_indexes = sorted(sorter, key=lambda x: x[1])

table_indexes[:5]

[(1961, 0.0),
 (2060, 0.23810542775945065),
 (1821, 0.2771270484831855),
 (1907, 0.29182116047789386),
 (1970, 0.2938930732962426)]

In [28]:
molist = [i[0] for i in table_indexes]

molist[:5]

[1961, 2060, 1821, 1907, 1970]

In [29]:
rating_reset.iloc[molist, :10]

UserID,MovieID,1,4,6,7,10,11,15,16,17
1961,89745,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
2060,112852,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0
1821,59315,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
1907,77561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1970,91529,0.0,0.0,0.0,0.0,5.0,0.0,2.0,4.5,0.0
2047,111362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1948,87232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1912,79132,0.0,0.0,0.0,0.0,0.0,0.0,3.5,3.0,4.5
2043,110102,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0
2077,122886,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0


In [30]:
molist = [rating_reset.iloc[i, 0] for i in molist]

molist[:5]

[89745, 112852, 59315, 77561, 91529]

In [31]:
table_film[table_film["MovieID"].isin(molist)]

Unnamed: 0,MovieID,Title
6743,59315,Iron Man (2008)
7324,77561,Iron Man 2 (2010)
7372,79132,Inception (2010)
7620,87232,X-Men: First Class (2011)
7693,89745,"Avengers, The (2012)"
7768,91529,"Dark Knight Rises, The (2012)"
8151,102125,Iron Man 3 (2013)
8395,110102,Captain America: The Winter Soldier (2014)
8425,111362,X-Men: Days of Future Past (2014)
8475,112852,Guardians of the Galaxy (2014)


In [32]:
def OutputFilm(title, limit=5):

  movies_index = table_film[table_film["Title"] == title]
  movies_index = movies_index["MovieID"].values[0]
  movies_index = rating_reset[rating_reset["MovieID"] == movies_index].index[0]

  limit += 1

  distances, table_indexes = model.kneighbors(rating_matrix[movies_index], n_neighbors=limit)
  sorter = list(zip(table_indexes.flatten().tolist(), distances.flatten().tolist()))
  table_indexes = sorted(sorter, key=lambda x: x[1])

  molist = [i[0] for i in table_indexes]

  molist = [i for i in rating_reset.iloc[molist, 0].values][1:]
  result = table_film[table_film["MovieID"].isin(molist)]
  return result

title = "Guardians of the Galaxy (2014)"

OutputFilm(title, 15)

Unnamed: 0,MovieID,Title
6743,59315,Iron Man (2008)
7324,77561,Iron Man 2 (2010)
7589,86332,Thor (2011)
7620,87232,X-Men: First Class (2011)
7646,88140,Captain America: The First Avenger (2011)
7693,89745,"Avengers, The (2012)"
8395,110102,Captain America: The Winter Soldier (2014)
8425,111362,X-Men: Days of Future Past (2014)
8438,111759,Edge of Tomorrow (2014)
8683,122886,Star Wars: Episode VII - The Force Awakens (2015)


In [33]:
title = "Forrest Gump (1994)"

OutputFilm(title, 5)

Unnamed: 0,MovieID,Title
97,110,Braveheart (1995)
257,296,Pulp Fiction (1994)
277,318,"Shawshank Redemption, The (1994)"
418,480,Jurassic Park (1993)
1939,2571,"Matrix, The (1999)"


In [34]:
title = "Toy Story (1995)"

OutputFilm(title, 8)

Unnamed: 0,MovieID,Title
123,150,Apollo 13 (1995)
224,260,Star Wars: Episode IV - A New Hope (1977)
257,296,Pulp Fiction (1994)
314,356,Forrest Gump (1994)
418,480,Jurassic Park (1993)
911,1210,Star Wars: Episode VI - Return of the Jedi (1983)
2355,3114,Toy Story 2 (1999)
3194,4306,Shrek (2001)
