In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import warnings
warnings.filterwarnings('ignore')
import pickle

# import helper

# Import the Movies dataset
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
# Import the ratings dataset
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [3]:
print('The dataset contains: ', len(ratings), ' ratings of ', len(movies), ' movies.')

The dataset contains:  25000095  ratings of  62423  movies.


In [4]:
full_train = pd.read_csv('data_for_clustering.csv')
# full_train = full_train.drop('Unnamed: 0', axis = 1)
full_train = full_train.set_index('title')

def clean_imdb_id(imdb_id):
    return(imdb_id[2:len(imdb_id)])

title_imdbId_links = pd.read_csv('Title IMDBID Links.csv')
title_imdbId_links['imdb_id'] = title_imdbId_links['imdb_id'].map(str)
title_imdbId_links = title_imdbId_links.drop('Unnamed: 0', axis = 1)
title_imdbId_links['imdb_id'] = title_imdbId_links['imdb_id'].map(clean_imdb_id)

links = pd.read_csv('links.csv')
links['imdbId'] = links['imdbId'].map(str)

In [5]:
links = links.merge(title_imdbId_links, left_on = 'imdbId', right_on = 'imdb_id', how = 'inner')
# links.to_csv('full_links.csv')

In [6]:
full_train.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Fiction,...,ZoeSaldana,ZoeyDeutch,ZooeyDeschanel,ZoëBell,ZoëKravitz,ZuleikhaRobinson,am,eJ,ÓscarJaenada,МоррисЧестнат
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Patti Cake$,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
What Happened to Monday,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Good Time,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Glass Castle,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Wind River,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
counts = full_train.sum(axis = 0)
with open('counts_for_clustering.pickle','wb') as handle:
    pickle.dump(counts, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [8]:
full_train = full_train.reset_index()
full_train = full_train.merge(links[['title','movieId']], left_on = 'title',right_on = 'title',how = 'inner')

In [9]:
full_train.head()

Unnamed: 0,title,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,ZoeyDeutch,ZooeyDeschanel,ZoëBell,ZoëKravitz,ZuleikhaRobinson,am,eJ,ÓscarJaenada,МоррисЧестнат,movieId
0,Patti Cake$,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,171983
1,What Happened to Monday,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,173925
2,Good Time,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,174727
3,The Glass Castle,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,169656
4,Wind River,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,175569


In [10]:
full_train.to_csv('clustering_train_data.csv')

In [13]:
avg_ratings = pd.DataFrame()
columns = list(full_train.columns)
columns.remove('title')
columns.remove('movieId')
column_list = []

for column in columns:
    if column in ('title','movieId'):
        continue
    
    if counts[column] < 20:
        continue
        
    print(column)
    
    column_list.append(column)
    temp_movies = full_train[full_train[column]!=0]    
        
    avg_votes_per_user = ratings[ratings['movieId'].isin(temp_movies['movieId'])].loc[:, ['userId', 'rating']]\
    .groupby(['userId'])['rating'].mean().round(3).reset_index()
    
    avg_votes_per_user.columns = ['userId',column]
    
    if column == columns[0]:
        avg_ratings = avg_votes_per_user
    else:
        avg_ratings = avg_ratings.merge(avg_votes_per_user, how = 'outer')

Action
Adventure
Animation
Comedy
Crime
Documentary
Drama
Family
Fantasy
Fiction
History
Horror
Movie
Music
Mystery
Romance
Science
TV
Thriller
War
Western
BillyWilder
WoodyAllen
AlPacino
AlanArkin
AlanRickman
AlanTudyk
AlecBaldwin
AlfredMolina
AllisonJanney
AmyAdams
AndyGarcía
AngelinaJolie
AnjelicaHuston
AnthonyHopkins
AnthonyMackie
BenAffleck.1
BenFoster
BenKingsley
BenStiller.1
BenedictCumberbatch
BeniciodelToro
BillHader
BillMurray
BillNighy
BillPaxton
BillPullman
BillyCrudup
BobBalaban
BobHoskins
BradPitt
BradleyWhitford
BrendanGleeson
BrianCox
Brien.1
Brown
BruceDern
BruceGreenwood
BruceMcGill
BruceWillis
BurtLancaster
Bush
CaryGrant
CateBlanchett
CatherineKeener
ChanningTatum
CharlesDurning
CharlizeTheron
ChrisCooper
ChrisO
ChristianBale
ChristopherLee
ChristopherLloyd
ChristopherMcDonald
ChristopherPlummer
ChristopherWalken
CiaránHinds
ClintEastwood
CliveOwen
ColinFarrell
ColinFirth
Connell
Connor.1
DanAykroyd.1
DanHedaya
DannyDeVito
DannyGlover
DannyHuston
DavidMorse
DavidStr

In [14]:
avg_ratings.head()

Unnamed: 0,userId,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,WallaceShawn,WalterMatthau,WhoopiGoldberg,WillemDafoe,WilliamH.1,WilliamHurt,WinonaRyder,WoodyAllen.1,WoodyHarrelson,ZoeSaldana
0,3,3.676,3.691,3.929,3.603,3.625,4.0,3.86,3.654,3.688,...,,,,4.0,,3.5,,,3.5,3.875
1,4,3.302,2.974,3.786,3.881,4.0,5.0,3.941,3.438,1.917,...,,,,4.25,,,,,2.25,3.0
2,13,3.7,3.6,3.75,3.889,3.333,,3.833,3.6,3.0,...,,,,,,,,,4.0,
3,14,4.75,3.0,3.0,4.25,5.0,,4.5,3.0,,...,,,,5.0,,,,,,
4,17,3.929,3.722,3.833,3.6,5.0,,2.5,3.833,3.0,...,,,,,,4.0,,,5.0,


In [15]:
avg_ratings = avg_ratings.fillna(0)

In [48]:
avg_ratings = pd.read_csv('avg_ratings.csv')
avg_ratings = avg_ratings.drop('Unnamed: 0', axis = 1)

In [49]:
user_ids = avg_ratings['userId']
avg_ratings = avg_ratings.drop('userId', axis = 1)
avg_ratings.head()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Fiction,...,WallaceShawn,WalterMatthau,WhoopiGoldberg,WillemDafoe,WilliamH.1,WilliamHurt,WinonaRyder,WoodyAllen.1,WoodyHarrelson,ZoeSaldana
0,3.676,3.691,3.929,3.603,3.625,4.0,3.86,3.654,3.688,3.759,...,0.0,0.0,0.0,4.0,0.0,3.5,0.0,0.0,3.5,3.875
1,3.302,2.974,3.786,3.881,4.0,5.0,3.941,3.438,1.917,2.919,...,0.0,0.0,0.0,4.25,0.0,0.0,0.0,0.0,2.25,3.0
2,3.7,3.6,3.75,3.889,3.333,0.0,3.833,3.6,3.0,4.056,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,4.75,3.0,3.0,4.25,5.0,0.0,4.5,3.0,0.0,4.5,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.929,3.722,3.833,3.6,5.0,0.0,2.5,3.833,3.0,3.571,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0


In [53]:
kmeans = KMeans(n_clusters=100)

# TODO: use fit_predict to cluster the dataset
kmeans.fit(avg_ratings)

with open('kmeans_100.pickle','wb') as handle:
    pickle.dump(kmeans, handle, protocol = pickle.HIGHEST_PROTOCOL)


avg_ratings['predictions'] = kmeans.predict(avg_ratings)
avg_ratings_full = pd.concat([user_ids,avg_ratings], axis = 1)    

In [56]:
avg_ratings_full.to_csv('cluster_data_w_predictions.csv', index = False)

In [19]:
candidate_ks = range(2, len(avg_ratings)+1, 100)

In [20]:
len(candidate_ks)

661

In [24]:
errors_per_k = {}
for k in [2, 30, 50, 70, 100]:
    print(k)
    errors_per_k[k] = KMeans(n_clusters = k).fit(avg_ratings).inertia_

2
30
50
70
100


In [25]:
errors_per_k

{2: 36134116608639.57,
 30: 161000838292.02127,
 50: 58007771541.91981,
 70: 29636971012.130146,
 100: 14504652009.809826}

In [27]:
kmeans = KMeans(n_clusters=100)

# TODO: use fit_predict to cluster the dataset
predictions = kmeans.fit_predict(avg_ratings)
print(kmeans.inertia_)

14554314180.909794


In [67]:
temp = full_train[['Action','Adventure','Animation','Comedy']].reset_index()
temp = temp.merge(links[['title','movieId']], left_on = 'title',right_on = 'title',how = 'inner')
temp.head()

Unnamed: 0,title,Action,Adventure,Animation,Comedy,movieId
0,Restoration,0,0,0,0,158390
1,Unforgettable,0,0,0,0,170815
2,Unforgettable,0,0,0,0,170815
3,Bad Boys,1,0,0,1,132561
4,Bad Boys,0,0,0,0,132561


In [32]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [15]:
title_imdbId_links.head()

Unnamed: 0,title,imdb_id
0,Toy Story,114709
1,Jumanji,113497
2,Grumpier Old Men,113228
3,Waiting to Exhale,114885
4,Father of the Bride Part II,113041


In [17]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,imdb_id
0,53519,1028528,1991.0,Death Proof,1028528
1,54995,1077258,1992.0,Planet Terror,1077258
2,55063,1093842,13241.0,My Winnipeg,1093842
3,57368,1060277,7191.0,Cloverfield,1060277
4,57532,1073498,7278.0,Meet the Spartans,1073498


In [34]:
ratings = ratings.merge(links, right_on = 'movieId', left_on = 'movieId', how = 'inner')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId,title,imdb_id
0,3,57368,2.5,1439474739,1060277,7191.0,Cloverfield,1060277
1,13,57368,4.5,1297917798,1060277,7191.0,Cloverfield,1060277
2,57,57368,3.5,1201054472,1060277,7191.0,Cloverfield,1060277
3,113,57368,3.5,1478899202,1060277,7191.0,Cloverfield,1060277
4,181,57368,1.5,1547803701,1060277,7191.0,Cloverfield,1060277


In [46]:
Action_Movies = temp[temp['Action']==1]

In [48]:
Action_Movies

Unnamed: 0,title,Action,Adventure,Animation,Comedy,movieId
3,Bad Boys,1,0,0,1,132561
6,Safe,1,0,0,0,94405
7,Wild Bill,1,0,0,0,95804
21,No Escape,1,0,0,0,140713
22,No Escape,1,0,0,0,140713
...,...,...,...,...,...,...
5151,Wind River,1,0,0,0,175569
5154,S.W.A.T.: Under Siege,1,0,0,0,175605
5159,First Kill,1,0,0,0,175783
5161,Descendants 2,1,1,0,1,175795


In [49]:
avg_action_votes_per_user = ratings[ratings['movieId'].isin(Action_Movies['movieId'])]\
.loc[:, ['userId', 'rating']].groupby(['userId'])['rating'].mean().round(2)

In [50]:
avg_action_votes_per_user

userId
3         3.57
4         3.12
13        3.10
14        4.75
17        3.93
          ... 
162533    2.50
162534    2.75
162536    3.21
162538    3.92
162540    4.50
Name: rating, Length: 57458, dtype: float64

In [75]:
avg_ratings = pd.DataFrame()
columns = list(temp.columns)
columns.remove('title')
columns.remove('movieId')
for column in columns:
    if column in ('title','movieId'):
        continue
        
    temp_movies = temp[temp[column]==1]    
        
    avg_votes_per_user = ratings[ratings['movieId'].isin(temp_movies['movieId'])].loc[:, ['userId', 'rating']]\
    .groupby(['userId'])['rating'].mean().round(3)
    
    avg_ratings = pd.concat([avg_ratings, avg_votes_per_user], axis = 1) 
        
    print(column)

avg_ratings.columns = columns

Action
Adventure
Animation
Comedy


In [76]:
avg_ratings

Unnamed: 0,Action,Adventure,Animation,Comedy
3,3.571,3.564,3.846,3.538
4,3.120,2.884,3.875,3.780
12,,,,4.125
13,3.100,2.967,3.417,3.679
14,4.750,3.000,3.000,4.250
...,...,...,...,...
162533,2.500,3.000,2.000,3.875
162534,2.750,2.803,2.947,3.044
162536,3.208,3.357,,4.875
162538,3.917,3.833,4.000,2.733


In [36]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [42]:
genres = ['Romance', 'Sci-Fi']
genre_ratings = pd.DataFrame()
for genre in genres:        
    genre_movies = movies[movies['genres'].str.contains(genre) ]
    avg_genre_votes_per_user = ratings[ratings['movieId'].isin(genre_movies['movieId'])].loc[:, ['userId', 'rating']].groupby(['userId'])['rating'].mean().round(2)

    genre_ratings = pd.concat([genre_ratings, avg_genre_votes_per_user], axis=1)


In [43]:
genre_movies

Unnamed: 0,movieId,title,genres
23,24,Powder (1995),Drama|Sci-Fi
28,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
65,66,Lawnmower Man 2: Beyond Cyberspace (1996),Action|Sci-Fi|Thriller
75,76,Screamers (1995),Action|Sci-Fi|Thriller
...,...,...,...
62124,207998,Hyperspace (1984),Comedy|Sci-Fi
62150,208082,Dark Encounter (2019),Horror|Sci-Fi
62158,208104,Portals (2019),Horror|Sci-Fi
62198,208239,S.S. Doomtrooper (2006),Action|Horror|Sci-Fi


In [26]:
genre_ratings.head()

Unnamed: 0,rating,rating.1,rating.2
1,4.17,3.7,
2,3.16,4.02,
3,3.53,3.7,
4,3.45,3.16,
5,3.55,4.09,


In [23]:
ratings[ratings['movieId'].isin(genre_movies['movieId'])].loc[:, ['userId', 'rating']]

Unnamed: 0,userId,rating
11,1,4.0
12,1,2.5
13,1,2.5
48,1,5.0
66,1,4.5
...,...,...
25000056,162541,4.0
25000060,162541,2.5
25000069,162541,4.0
25000074,162541,2.0
