In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [2]:
# Read the training ratings from disk and preview the first rows.
train_csv = 'unsupervised_data/unsupervised_movie_data/train.csv'
train = pd.read_csv(train_csv)
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [3]:
# Read the movie metadata (movieId, title, genres) and preview the first rows.
movies_csv = 'unsupervised_data/unsupervised_movie_data/movies.csv'
movies = pd.read_csv(movies_csv)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Attach movie metadata to every rating; the default inner join keeps only
# ratings whose movieId also appears in `movies`.
df = train.merge(movies, how='inner', on='movieId')

In [5]:
# Remove the `genres` column; none of the visible steps that follow read it.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' makes
# this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
df = df.drop(columns=['genres'], errors='ignore')

In [9]:
# Discard ratings that have no title.
df = df.dropna(subset=['title'], axis=0)

# Count the non-null ratings each title received, then reshape the result
# into a two-column frame: title, totalRatingCount.
ratings_per_title = df.groupby(by=['title'])['rating'].count()
rating_count = (
    ratings_per_title
    .reset_index()
    .rename(columns={'rating': 'totalRatingCount'})
    .loc[:, ['title', 'totalRatingCount']]
)
rating_count.head()

Unnamed: 0,title,totalRatingCount
0,"""BLOW THE NIGHT!"" Let's Spend the Night Togeth...",1
1,"""Great Performances"" Cats (1998)",67
2,#1 Cheerleader Camp (2010),5
3,#Female Pleasure (2018),1
4,#FollowMe (2019),5


In [None]:
# Left-join the per-title totals onto every rating row so each rating carries
# the popularity of its movie.
rating_with_totalRatingCount = pd.merge(df, rating_count, how='left', on='title')
rating_with_totalRatingCount.head()

In [None]:
def _three_decimals(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Apply the formatter to all pandas float output, then print the summary
# statistics of the per-title rating counts.
pd.set_option('display.float_format', _three_decimals)
total_rating_summary = rating_count['totalRatingCount'].describe()
print(total_rating_summary)

In [12]:
# Minimum number of ratings a title needs in order to be kept.
popularity_threshold = 50

# Keep only the rating rows whose movie reaches the threshold.
meets_threshold = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_movie = rating_with_totalRatingCount[meets_threshold]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,totalRatingCount
0,5163,57669,4.0,1518349992,In Bruges (2008),4253
1,87388,57669,3.5,1237455297,In Bruges (2008),4253
2,137050,57669,4.0,1425631854,In Bruges (2008),4253
3,120490,57669,4.5,1408228517,In Bruges (2008),4253
4,50616,57669,4.5,1446941640,In Bruges (2008),4253


In [13]:
# (rows, columns) of the ratings that survived the popularity filter.
rating_popular_movie.shape

(9744920, 6)

In [14]:
# Build a title-by-user table of ratings. pivot_table aggregates duplicate
# (title, userId) pairs with its default aggregation, and every combination
# without a rating becomes 0 via fillna.
# The result is fully dense; the chain is kept as a single expression so no
# second copy of the table stays referenced.
movie_features_df = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
movie_features_df

userId,1,2,3,4,5,6,7,8,9,10,...,162532,162533,162534,162535,162536,162537,162538,162539,162540,162541
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Great Performances"" Cats (1998)",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
'71 (2014),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
'Round Midnight (1986),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
'Salem's Lot (2004),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
'Til There Was You (1997),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xXx (2002),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
xXx: Return of Xander Cage (2017),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
xXx: State of the Union (2005),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
¡Three Amigos! (1986),0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [15]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Store the title-by-user table in compressed sparse row form.
movie_features_df_matrix = csr_matrix(movie_features_df.values)

# Brute-force nearest-neighbour search using cosine distance between the
# per-movie rating vectors.
knn_settings = {'metric': 'cosine', 'algorithm': 'brute'}
model_knn = NearestNeighbors(**knn_settings)
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [16]:
# (number of titles, number of user columns) of the pivoted ratings table.
movie_features_df.shape

(9566, 162540)

In [17]:
# Pick the query movie with a locally seeded generator so that Restart & Run
# All reproduces the same recommendations; change QUERY_SEED to explore other
# titles. A private RandomState is used so the global NumPy random state is
# left untouched.
QUERY_SEED = 42
query_rng = np.random.RandomState(QUERY_SEED)
query_index = query_rng.choice(movie_features_df.shape[0])
print(query_index)

# kneighbors expects a 2-D input, so the selected row is reshaped to (1, n).
# Six neighbours are requested because the printing cell that follows treats
# the first hit as the query movie itself and lists the remaining five.
query_vector = movie_features_df.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

8112


In [18]:
# Flatten once instead of on every iteration.
flat_distances = distances.flatten()
flat_indices = indices.flatten()
titles = movie_features_df.index

for rank, (neighbour, distance) in enumerate(zip(flat_indices, flat_distances)):
    if rank == 0:
        # The first position is used only to print the header for the query movie.
        print('Recommendations for {0}:\n'.format(titles[query_index]))
        continue
    print('{0}: {1}, with distance of {2}:'.format(rank, titles[neighbour], distance))

Recommendations for Switch, The (2010):

1: Ugly Truth, The (2009), with distance of 0.8606708323871464:
2: Just Go with It (2011), with distance of 0.8693750364744713:
3: Bounty Hunter, The (2010), with distance of 0.87759654328282:
4: No Strings Attached (2011), with distance of 0.8790024369094077:
5: Letters to Juliet (2010), with distance of 0.881869050298792:


In [3]:
# Bind `data` to the same DataFrame object as `train` (no copy is made).
# NOTE(review): `data` is rebound by Dataset.load_from_df in a later cell
# before anything visible reads it, so this alias looks unused — confirm
# before removing.
data = train

In [5]:
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from collections import defaultdict

In [6]:
# Map the column names used for the Surprise dataset onto the matching
# columns of `train`, materialising each column as a plain Python list.
surprise_to_train_column = {'itemID': 'movieId', 'userID': 'userId', 'rating': 'rating'}
ratings_dict = {
    surprise_name: train[train_name].tolist()
    for surprise_name, train_name in surprise_to_train_column.items()
}

# Note: this rebinds `df`, which earlier held the ratings merged with titles.
df = pd.DataFrame(data=ratings_dict)
df.shape

(10000038, 3)

In [7]:

# Declare the rating scale for Surprise. Only one Reader is needed: the data
# comes from an in-memory DataFrame, so no file line format or separator is
# involved.
reader = Reader(rating_scale=(0.5, 5.0))

# The columns are passed in user, item, rating order.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)


In [8]:
from surprise.model_selection import train_test_split

from surprise.model_selection import KFold

# A 10-fold splitter is kept available for cross-validation experiments.
# Nothing in this cell iterates over its folds: the model below is trained on
# the full dataset.
kf = KFold(n_splits=10)

# Fixed seed so the learned factors, and therefore the predictions written to
# the submission, are reproducible from a fresh kernel.
SVD_SEED = 42
algo = SVD(n_factors=35, n_epochs=25, lr_all=0.008, reg_all=0.08,
           random_state=SVD_SEED)

# Train on every available rating (no hold-out set).
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2e90c4f650>

In [9]:
# Read the (userId, movieId) pairs that need a predicted rating and preview them.
test_csv = 'unsupervised_data/unsupervised_movie_data/test.csv'
test = pd.read_csv(test_csv)
test.head()

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [10]:
# Rename the movie identifier column to `itemId` for the prediction steps.
test = test.rename({'movieId': 'itemId'}, axis='columns')

In [11]:
# Pair each userId with its itemId as a (userId, itemId) tuple.
# zip walks the two columns in lockstep, yielding the same tuples as a
# row-wise DataFrame.apply(tuple, axis=1) without constructing a Series for
# every row.
test['Id'] = list(zip(test['userId'], test['itemId']))

In [12]:
def _estimated_rating(user_item):
    """Return the fitted model's estimated rating for a (userId, itemId) pair."""
    # `est` is the fourth field of the Prediction returned by algo.predict.
    return algo.predict(user_item[0], user_item[1]).est


test['rating'] = test['Id'].apply(_estimated_rating)

In [13]:
# Build the submission key "<userId>_<itemId>" from the source columns rather
# than from the current contents of `Id`. Re-running this cell therefore
# always produces the same keys, whereas transforming `Id` in place would
# operate on already-formatted strings on a second run and corrupt them.
test['Id'] = test['userId'].astype(str) + '_' + test['itemId'].astype(str)

In [14]:
# Keep only the two columns written to the submission file.
sub = test.loc[:, ['Id', 'rating']]

In [15]:
# Write the predictions to disk without the positional index column.
submission_csv = 'submission.csv'
sub.to_csv(submission_csv, index=False)