In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
import heapq
import warnings
import math
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/unsuplearnpred/sample_submission.csv
/kaggle/input/unsuplearnpred/movies.csv
/kaggle/input/unsuplearnpred/imdb_data.csv
/kaggle/input/unsuplearnpred/genome_tags.csv
/kaggle/input/unsuplearnpred/genome_scores.csv
/kaggle/input/unsuplearnpred/train.csv
/kaggle/input/unsuplearnpred/test.csv
/kaggle/input/unsuplearnpred/tags.csv
/kaggle/input/unsuplearnpred/links.csv


In [2]:
# Read the training ratings and the test (userId, movieId) pairs.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/unsuplearnpred/train.csv', '../input/unsuplearnpred/test.csv')
)

In [3]:
# Peek at the first rows of the training ratings.
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


# **EDA**

In [4]:
# Top 10 movies, ranked by how many 5-star ratings each movieId received.
TopTen = train_df[train_df["rating"] == 5]
five_star_counts = TopTen["movieId"].value_counts()
print(five_star_counts.head(10))

318     16042
296     12859
260     10358
356     10235
2571    10125
527      9936
593      9832
858      9687
2959     8597
50       8495
Name: movieId, dtype: int64


In [5]:
# Annotate every rating row with the number of ratings submitted by its user.
ratings_per_user = train_df.groupby('userId')['userId'].transform('count')
count_df = train_df.assign(Ratings_per_user=ratings_per_user)
count_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,Ratings_per_user
0,5163,57669,4.0,1518349992,22
1,106343,5,4.5,1206238739,160
2,146790,5459,5.0,1076215539,39
3,106362,32296,2.0,1423042565,721
4,9041,366,3.0,833375837,42


# **Data Preprocessing**

In [6]:
# The timestamp column is not used anywhere below. Rebinding the name (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run after
# the column has already been removed.
train_df = train_df.drop(columns=['timestamp'], errors='ignore')

In [7]:
# Draw a reproducible (random_state=42) sample of 5,000 rating rows from the TRAINING data;
# the user/movie matrices built below come from this sample only.
sample_df = train_df.sample(n=5000, random_state = 42)

In [8]:
# User x movie rating matrix for the sampled rows (0 where the pair has no rating),
# and its movie x user transpose.
pvt_table = sample_df.pivot_table(
    index='userId',
    columns='movieId',
    values='rating',
    fill_value=0,
)
pvt_table_trans = pvt_table.transpose()

In [9]:
# Display the user x movie matrix (rows: userId, columns: movieId).
pvt_table

movieId,1,2,3,4,5,6,7,10,11,12,...,195159,196417,196915,197199,197879,198789,201242,203334,203881,205425
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
12,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
80,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
120,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
123,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162271,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
162368,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
162457,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0
162465,0,0.0,0.0,0,0,0,0.0,0.0,0,0,...,0,0,0.0,0,0,0,0,0,0,0


In [10]:
# Display the transposed movie x user matrix (rows: movieId, columns: userId).
pvt_table_trans

userId,2,12,80,120,123,141,175,296,301,399,...,161995,162083,162148,162152,162188,162271,162368,162457,162465,162512
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Pairwise cosine similarity between the rows of pvt_table (one row per sampled user).
# NOTE(review): the displayed corners form an identity matrix, which suggests that in a
# 5,000-row sample most users share no rated movie with each other — confirm before
# relying on the nearest-neighbour predictions below.
user_similarity = cosine_similarity(pvt_table)
user_similarity

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [12]:
# Label both axes of the similarity matrix with the sampled user ids so that
# neighbours can be looked up by userId.
sampled_user_ids = pvt_table_trans.columns
user_sim_df = pd.DataFrame(user_similarity, index=sampled_user_ids, columns=sampled_user_ids)
user_sim_df

userId,2,12,80,120,123,141,175,296,301,399,...,161995,162083,162148,162152,162188,162271,162368,162457,162465,162512
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
123,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
162368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
162457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
162465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
def movie_rating_test(reference_user, reference_movie, k):
    """Predict one user's rating of one movie from the k most similar sampled users.

    The prediction is the similarity-weighted mean of the ratings that the
    user's k nearest neighbours (according to ``user_sim_df``) gave
    ``reference_movie``, rounded to a multiple of 0.5.

    Parameters
    ----------
    reference_user : userId; must be a label of ``user_sim_df``.
    reference_movie : movieId; must be a row label of ``pvt_table_trans``.
    k : maximum number of neighbours to consider.

    Returns
    -------
    float
        Predicted rating in steps of 0.5, or NaN when none of the k neighbours
        has both a non-zero similarity and a rating for the movie.

    Raises
    ------
    KeyError
        If the user or the movie is not present in the sampled matrices.
    """
    # Rank every sampled user by similarity to the reference user, then remove the
    # reference user itself by label (it always has similarity 1 to itself).
    top_k = (
        user_sim_df[[reference_user]]
        .sort_values(by=reference_user, ascending=False)
        .drop(index=reference_user)
        .iloc[:k]
    )
    sims = top_k[reference_user]

    # Ratings of the reference movie by those neighbours, aligned on userId.
    ratings = pvt_table_trans.loc[reference_movie, sims.index]

    # The pivot table stores 0 for "no rating"; such neighbours must not be averaged
    # in as if they had given the movie zero stars.
    rated = ratings > 0
    weight_total = sims[rated].sum()
    if weight_total == 0:
        # No usable neighbour: report "no prediction" instead of dividing 0 by 0.
        return float('nan')

    weighted_mean = (sims[rated] * ratings[rated]).sum() / weight_total
    return round(weighted_mean * 2) / 2

In [14]:
# Smoke test: predict user 104339's rating of movie 356 using k=10 neighbours.
movie_rating_test(104339,356,10)

4.0

In [15]:
def avg_rating_user(reference_user):
    """Return the mean rating given by ``reference_user`` in ``train_df``, rounded to 0.5.

    Only this user's rows are averaged; the previous version recomputed a
    groupby-mean over the whole training frame on every call.

    Parameters
    ----------
    reference_user : userId to look up in ``train_df['userId']``.

    Returns
    -------
    float
        Mean rating rounded to a multiple of 0.5 (Python's ``round`` sends exact
        halves to the nearest even integer before the division by 2).

    Raises
    ------
    KeyError
        If the user has no rows in ``train_df``.
    """
    user_ratings = train_df.loc[train_df['userId'] == reference_user, 'rating']
    if user_ratings.empty:
        # Keep the exception type the old `.loc[reference_user, 'rating']` lookup raised.
        raise KeyError(reference_user)
    return round(user_ratings.mean() * 2) / 2

In [16]:
# Example: rounded mean rating of user 21 over the full training data.
avg_rating_user(21)

4.5

In [17]:
def avg_rating_movie(reference_movie):
    """Return the mean rating received by ``reference_movie`` in ``train_df``, rounded to 0.5.

    Only this movie's rows are averaged; the previous version recomputed a
    groupby-mean over the whole training frame on every call.

    Parameters
    ----------
    reference_movie : movieId to look up in ``train_df['movieId']``.

    Returns
    -------
    float
        Mean rating rounded to a multiple of 0.5 (Python's ``round`` sends exact
        halves to the nearest even integer before the division by 2).

    Raises
    ------
    KeyError
        If the movie has no rows in ``train_df``.
    """
    movie_ratings = train_df.loc[train_df['movieId'] == reference_movie, 'rating']
    if movie_ratings.empty:
        # Keep the exception type the old `.loc[reference_movie, 'rating']` lookup raised.
        raise KeyError(reference_movie)
    return round(movie_ratings.mean() * 2) / 2

In [18]:
# Example: rounded mean rating of movie 204698 over the full training data.
avg_rating_movie(204698)

4.0

In [19]:
def movie_rating(reference_user, reference_movie):
    """Predict ``reference_user``'s rating of ``reference_movie`` with graceful fallbacks.

    "In the sample" below means present in the matrices built from ``sample_df``
    (``user_sim_df`` / ``pvt_table_trans``), not merely present in ``train_df``.

    Decision order:
      1. user and movie both missing from the sample -> constant 3.5
      2. only the movie is in the sample             -> ``avg_rating_movie``
      3. only the user is in the sample              -> ``avg_rating_user``
      4. both are in the sample                      -> similarity-weighted mean of the
         ratings given to the movie by the user's 10 most similar users, rounded to 0.5.
         Falls back to ``avg_rating_movie`` when every neighbour has zero similarity,
         and to ``avg_rating_user`` when no similar neighbour actually rated the movie.

    Parameters
    ----------
    reference_user : userId to predict for.
    reference_movie : movieId to predict.

    Returns
    -------
    float
        Predicted rating in steps of 0.5.
    """
    k = 10  # neighbourhood size

    # Index membership is a hash lookup; no need to build helper DataFrames per call.
    user_in_sample = reference_user in user_sim_df.index
    movie_in_sample = reference_movie in pvt_table_trans.index

    if not user_in_sample and not movie_in_sample:
        return 3.5  # default used when nothing is known from the sample
    if not user_in_sample:
        return avg_rating_movie(reference_movie)
    if not movie_in_sample:
        return avg_rating_user(reference_user)

    # Rank every sampled user by similarity to the reference user, then remove the
    # reference user itself by label (it always has similarity 1 to itself).
    top_k = (
        user_sim_df[[reference_user]]
        .sort_values(by=reference_user, ascending=False)
        .drop(index=reference_user)
        .iloc[:k]
    )
    sims = top_k[reference_user]
    if sims.sum() == 0:
        # Nobody in the neighbourhood resembles this user at all.
        return avg_rating_movie(reference_movie)

    # Ratings of the reference movie by those neighbours, aligned on userId.
    ratings = pvt_table_trans.loc[reference_movie, sims.index]

    # The pivot table stores 0 for "no rating"; such neighbours must not be averaged
    # in as if they had given the movie zero stars.
    rated = ratings > 0
    weight_total = sims[rated].sum()
    if weight_total == 0:
        # Similar users exist, but none of them rated this movie.
        return avg_rating_user(reference_user)

    user_rating = round((sims[rated] * ratings[rated]).sum() / weight_total * 2) / 2
    if user_rating == 0:
        # Never emit a 0 prediction; keep the original fallback for that case.
        return avg_rating_user(reference_user)
    return user_rating

In [20]:
# Intended case: user NOT in the sample (false), movie in the sample (true).
movie_rating(3,778) # false true

4.0

In [21]:
# Intended case: neither the user nor the movie is in the sample -> constant default.
movie_rating(3,207404) # false false

3.5

In [22]:
# Intended case: user in the sample (true), movie not (false).
# NOTE(review): the recorded result 3.5 differs from avg_rating_user(21) == 4.5 shown above,
# so user 21 is evidently NOT in the 5,000-row sample and the default (false, false) branch ran.
movie_rating(21,207404) # true false

3.5

In [23]:
# Intended case: both the user and the movie are in the sample -> neighbour-based prediction.
movie_rating(104339,356) # true true

4.0

In [24]:
# Keep the first 10 rows of the test set for a quick end-to-end check.
test_sample = test_df.iloc[:10]
test_sample.head()

Unnamed: 0,userId,movieId
0,5,788
1,68,7438
2,336,40412
3,803,3822
4,547,903


In [25]:
# Predict a rating for every (userId, movieId) pair in the small test slice.
# The columns are read by name: the previous `.iloc[x][0]` fed index LABELS into a
# positional indexer (correct only for a default 0..n-1 index) and relied on
# positional integer access into a labelled row.
rating_list = [
    movie_rating(reference_user, reference_movie)
    for reference_user, reference_movie in zip(test_sample['userId'], test_sample['movieId'])
]

In [26]:
# Inspect the predictions for the 10-row test slice.
rating_list

[3.0, 4.0, 3.5, 3.5, 4.0, 3.5, 3.5, 3.5, 3.5, 4.0]

In [27]:
from IPython.display import clear_output

In [28]:
# Predict a rating for every (userId, movieId) pair in the full test set.
# Columns are read by name instead of `.iloc[x][0]`, which mixed index labels with
# positions. Progress is refreshed roughly once per percent (and on the last row)
# rather than clearing and re-printing the cell output on every single row.
rating_list = []
total_rows = len(test_df)
progress_step = max(1, total_rows // 100)

for row_number, (reference_user, reference_movie) in enumerate(
        zip(test_df['userId'], test_df['movieId']), start=1):
    rating_list.append(movie_rating(reference_user, reference_movie))

    if row_number % progress_step == 0 or row_number == total_rows:
        clear_output(wait=True)
        print("Current Progress:",np.round(row_number/total_rows*100, 2),"%")

Current Progress: 99.99 %


In [29]:
# Build the submission frame: Id is "<userId>_<movieId>", rating is the prediction.
# Both columns are strings, matching the earlier astype(str) conversion.
submission_ids = test_df['userId'].astype(str) + '_' + test_df['movieId'].astype(str)
submission_ratings = pd.Series(rating_list, index=test_df.index).astype(str)
submit_all = pd.DataFrame({'Id': submission_ids, 'rating': submission_ratings})
submit_all

Unnamed: 0,Id,rating
0,5_788,3.0
1,68_7438,4.0
2,336_40412,3.5
3,803_3822,3.5
4,547_903,4.0
...,...,...
14305,294_30707,4.0
14306,803_780,3.5
14307,519_912,3.5
14308,628_6764,3.5


In [30]:
# Write the (Id, rating) submission file to the working directory, without the index column.
submit_all.to_csv("./submission.csv", index=False)
