In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd# data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import Packages**

In [2]:
import scipy as sp
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor as KNN

# Libraries used during sorting procedures.
import operator # <-- Convenient item retrieval during iteration
import heapq # <-- Efficient sorting of large lists

# Imported for our sanity
# NOTE(review): 'ignore' with no category silences every warning for the rest of
# the notebook, including deprecation notices from the libraries used below —
# consider narrowing the filter to the specific warnings that are known noise.
import warnings
warnings.filterwarnings('ignore')

**Read Data**

In [3]:
# Movie catalogue; later cells read its `movieId`, `title` and `genres` columns.
movies = pd.read_csv('/kaggle/input/ul-predict-dataset/movies.csv')
movies.head()

In [4]:
# IMDb metadata; the EDA below uses its `director` column.
imdb = pd.read_csv('/kaggle/input/ul-predict-dataset/imdb_data.csv')
imdb.head()

In [5]:
# Training ratings; later cells rely on `userId`, `movieId`, `rating` and `timestamp`.
train = pd.read_csv("/kaggle/input/ul-predict-dataset/train.csv")
train.head()

In [6]:
# (userId, movieId) pairs that need a predicted rating for the submission.
test = pd.read_csv('/kaggle/input/ul-predict-dataset/test.csv')
test.shape

In [7]:
# Attach the movie columns (title, genres) to every training rating.
# The join key is named once; listing "movieId" twice in `on` was redundant.
data = pd.merge(train, movies, on="movieId")
# Keep only ratings whose movie carries a title.
data = data.dropna(axis=0, subset=['title'])
# Bare expression -> rich table display instead of a plain-text print.
data.head()

# **EDA: Exploring Data in Datasets**

In [8]:
# Row counts of the three input tables (one row per movie / per rating).
print (f'Number of movies in movies data: {movies.shape[0]}')
print (f'Number of movies in imdb data: {imdb.shape[0]}')
print (f'Number of ratings in train data: {train.shape[0]}')

In [9]:
# Global mean over every rating in the training set; reused later as the
# fallback prediction when neither the user nor the movie is known.
avg_rating_in_dataset = train["rating"].mean()
print('Average rating in dataset: ', avg_rating_in_dataset)

In [10]:
# Distribution of Ratings
# value_counts() yields one entry per distinct rating value (index) with the
# number of times it occurs (values).
ratings = data["rating"].value_counts()
numbers = ratings.index
quantity = ratings.values

# The bars are drawn from the aggregated arrays only, so the full `data` frame
# is not handed to plotly. Title and axis labels let the figure stand alone.
fig = px.bar(
    x=numbers,
    y=quantity,
    labels={'x': 'Rating', 'y': 'Number of ratings'},
    title='Distribution of ratings',
)
fig.show()

In [11]:
# Top 10 movies: the titles that collected the most 5-star ratings.
TopTen = data.loc[data["rating"] == 5]
print(TopTen["title"].value_counts().iloc[:10])

In [12]:
# Calculate mean rating of all movies and show the five highest means.
# NOTE(review): the mean ignores how many ratings a title has, so titles with a
# single 5-star rating can top this list.
data.groupby('title')['rating'].mean().sort_values(ascending=False).head()

In [13]:
# Calculate number of ratings for all movies and show the five most-rated titles.
data.groupby('title')['rating'].count().sort_values(ascending=False).head()

In [14]:
# Count of ratings per user
# Work on a copy so the helper columns added below never touch `train`.
count_df = train.copy()
# transform('count') broadcasts each user's total back onto every one of their rows.
count_df['Ratings_per_user'] = count_df.groupby('userId')['userId'].transform('count')
count_df.head()

In [15]:
# Count of ratings per movie, broadcast onto every rating row of that movie.
count_df['Ratings_per_movie'] = count_df.groupby('movieId')['movieId'].transform('count')
# Spot check: show the rows belonging to movieId 5.
count_df[count_df['movieId'] == 5]

In [16]:
# Average rating per movie, broadcast onto every rating row of that movie.
count_df['avg_rating'] = count_df.groupby('movieId')['rating'].transform('mean')
count_df.head()

In [17]:
# Average rating per user, broadcast onto every rating row of that user.
count_df['avg_rating_user'] = count_df.groupby('userId')['rating'].transform('mean')
count_df.head()

In [18]:
# One row per movie, indexed by movieId, carrying the per-movie statistics.
# The surviving 'rating' column is simply the rating from the first row seen
# for each movie (drop_duplicates keeps the first occurrence).
movie_count_df = (
    count_df
    .drop(['userId', 'timestamp', 'Ratings_per_user', 'avg_rating_user'], axis=1)
    .drop_duplicates(subset='movieId')
    .set_index('movieId')
)
movie_count_df.head()

In [19]:
# One row per user, indexed by userId, carrying the per-user statistics.
user_count_df = (
    count_df
    .drop(['movieId', 'timestamp', 'Ratings_per_movie', 'avg_rating', 'rating'], axis=1)
    .drop_duplicates(subset='userId')
    .set_index("userId")
)
user_count_df.head()

In [20]:
# Count of genres
# Split the pipe-separated genre string, stack the pieces into one long Series
# and one-hot encode it: the resulting columns are the distinct genre labels.
genre_parts = movies.genres.str.split('|', expand=True).stack()
genre_dum = pd.get_dummies(genre_parts)
genre_count = list(genre_dum.columns)
print('Number of genres: ', len(genre_count))

In [21]:
# Count of movies per director
# Copy first: `dir_count = imdb` would only alias the frame, so the helper
# column below would silently appear on `imdb` as well.
dir_count = imdb.copy()
dir_count['movies_per_dir'] = dir_count.groupby('director')['director'].transform('count')

# One entry per director. The summary statistics are taken over directors:
# averaging the row-level 'movies_per_dir' column would count a director with
# n movies n times and overstate the mean.
movies_per_director = dir_count['director'].value_counts()
print("Number of directors: ", dir_count['director'].nunique())
print("Average number of movies per director: ", movies_per_director.mean())
print("Min number of movies per director: ", movies_per_director.min())
print("Max number of movies per director: ", movies_per_director.max())

fig, ax = plt.subplots()
ax.hist(movies_per_director, bins=15)
ax.set(
    title='Movies per director',
    xlabel='Number of movies',
    ylabel='Number of directors',
)
plt.show()

In [22]:
# TODO: average rating per genre (not implemented yet)

In [23]:
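# TODO: count of genres per director (not implemented yet)

# TODO: count of movies per genre. The draft below will not work as written:
# `genre_dum` holds one indicator column per genre, so there is no 'title' column to group on.
# genre_dum['movies_per_gen'] = genre_dum.groupby('title')['title'].transform('count')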

# **Collaborative Filtering**

In [24]:
# Sampling from dataset
# Fixed-seed random subset of 5000 ratings; every similarity matrix below is
# built from this sample only (presumably to keep the pivot table small).
train_samp = train.sample(n=5000, random_state=12)
train_samp.head()

In [25]:
# Determine how many users in test are not in sample
# `value in series` tests the Series *index labels*, not its values, so the
# membership check has to be made against the values explicitly via isin().
user_in_sample = test['userId'].isin(train_samp['userId'])
# One entry per test row (duplicates kept), as downstream cells expect a list.
missing_uid = test.loc[~user_in_sample, 'userId'].tolist()

print(len(set(missing_uid)))

In [26]:
# Determine how many movies in test are not in sample
# Compare against the sample's movieId *values*: the previous check looked at
# the 'userId' column, and `in` on a Series only inspects its index labels.
movie_in_sample = test['movieId'].isin(train_samp['movieId'])
# One entry per test row (duplicates kept), as downstream cells expect a list.
missing_mid = test.loc[~movie_in_sample, 'movieId'].tolist()

print(len(set(missing_mid)))

In [27]:
# Collect every training rating made by a user listed in `missing_uid`
# (these rows are appended to the sample in a later cell).
# NOTE(review): the bare `missing_df` below renders the whole frame; `.head()`
# or `.shape` would be a lighter check.
missing_df = train[train['userId'].isin(missing_uid)]
missing_df

In [28]:
# Training ratings for movies listed in `missing_mid`.
missing_mdf = train[train['movieId'].isin(missing_mid)]

# Test users that DO occur in the sample. isin() compares against the userId
# values; `userid in train_samp['userId']` only looked at the index labels.
samp_uid = test.loc[test['userId'].isin(train_samp['userId']), 'userId'].tolist()

# Ratings of the missing movies made by users who are already in the sample.
missing_movies_only = missing_mdf[missing_mdf['userId'].isin(samp_uid)]
missing_movies_only

In [29]:
# Stack the sample and the ratings of the missing users into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, with ignore_index=True, renumbers the rows the same way.
all_df = pd.concat([train_samp, missing_df], ignore_index=True)
all_df.shape

In [30]:
# all_pivot = all_df.pivot_table(index=['userId'], columns=['movieId'], values='rating')
# all_pivot.head()

In [31]:
# User x movie utility matrix built from the sample: rows are userIds, columns
# are movieIds, cells hold the rating (NaN where the user did not rate the movie).
train_pivot = pd.pivot_table(
    train_samp,
    values='rating',
    index=['userId'],
    columns=['movieId'],
)
train_pivot.head()

In [32]:
# Show the matrix with everything outside the non-null mask blanked to NaN.
train_pivot.where(train_pivot.notnull())

In [33]:
# Visualise Sparse Matrix: heatmap of the first 100 user rows of the pivot.
fig, ax = plt.subplots(figsize=(15, 5))

sns.heatmap(train_pivot.iloc[:100], annot=False, ax=ax)
_ = ax.set_title('Movie Ratings Matrix')

In [34]:
# Neutralising the data
# Mean rating of each user (row-wise over the movies they rated).
avg_ratings = train_pivot.mean(axis='columns')

# Subtract every user's own mean so their ratings are centred on 0.
train_pivot_centered = train_pivot.subtract(avg_ratings, axis='index')

# Unrated cells become 0, i.e. indistinguishable from "exactly average".
train_pivot_norm = train_pivot_centered.fillna(value=0)

# Independent copy used for the user-based similarity further down.
user_ratings_table = train_pivot_norm.copy()

**Item-Based Collaborative Filtering**

In [35]:
# Flip the centred matrix so that movies are rows and users are columns.
movie_ratings_table = train_pivot_norm.transpose()
movie_ratings_table.head()

In [36]:
# Keep only the user columns holding at least one non-zero centred rating.
# Because unrated cells were filled with 0 after centring, this also removes
# users whose ratings all equal their own mean (e.g. a single rating).
movie_ratings_table = movie_ratings_table.loc[:, movie_ratings_table.ne(0).any(axis=0)]
movie_ratings_table.head()

In [37]:
# Store the movie x user matrix in compressed sparse row (CSR) form.
movie_ratings_sparse = sp.sparse.csr_matrix(movie_ratings_table.to_numpy())

In [38]:
# Pairwise cosine similarity between movie rows, labelled by movieId on both axes.
item_sim = cosine_similarity(movie_ratings_sparse)
item_sim_df = pd.DataFrame(
    data=item_sim,
    index=movie_ratings_table.index,
    columns=movie_ratings_table.index,
)
item_sim_df.head()

**User-Based Recommendations**

In [39]:
# Put users on the columns (movies x users).
# Note: the table is reassigned from itself, so re-running this cell flips the
# orientation back again.
user_ratings_table = user_ratings_table.transpose()

# Keep only the user columns holding at least one non-zero centred rating.
user_ratings_table = user_ratings_table.loc[:, user_ratings_table.ne(0).any(axis=0)]

# Store the utility matrix in compressed sparse row (CSR) form.
user_ratings_sparse = sp.sparse.csr_matrix(user_ratings_table.to_numpy())

In [40]:
# Pairwise cosine similarity between users (transposed back so users are rows),
# labelled by userId on both axes.
user_sim = cosine_similarity(user_ratings_table.transpose())
user_sim_df = pd.DataFrame(
    data=user_sim,
    index=user_ratings_table.columns,
    columns=user_ratings_table.columns,
)
user_sim_df.iloc[:5]

**Calculate Predictions**

In [41]:
# Scenario 1 Predictions
#   Case 0: userId and movieId in sample --> pred=similarity-weighted mean of the
#           ratings given to the movie by the k most similar users who rated it
#   Case 1: userId not in sample --> pred=average_rating_for_movie_in_train
#   Case 2: movieId not in sample --> pred=average_rating_from_user_overall
#   Case 3: neither userId nor movieId in sample --> pred=average_rating_in_train
def predict_ratings(user_id, movie_id, k=3):
    """Predict the rating `user_id` would give `movie_id`.

    Relies on notebook-level tables built in earlier cells: `user_sim_df`,
    `item_sim_df`, `train_pivot`, `movie_count_df`, `user_count_df` and
    `avg_rating_in_dataset`.

    Parameters
    ----------
    user_id : user identifier as found in the `userId` column.
    movie_id : movie identifier as found in the `movieId` column.
    k : maximum number of neighbouring users that contribute (default 3).

    Returns
    -------
    A single (unrounded) rating estimate.
    """
    user_known = user_id in user_sim_df.columns
    movie_known = movie_id in item_sim_df.columns

    if not user_known and not movie_known:
        # Case 3: nothing to go on, use the global mean.
        return avg_rating_in_dataset  # could consider average rating per genre?
    if not user_known:
        # Case 1: unknown user, known movie.
        return movie_count_df.loc[movie_id, 'avg_rating']
    if not movie_known:
        # Case 2: known user, unknown movie.
        return user_count_df.loc[user_id, 'avg_rating_user']

    # Case 0: both are known.
    # Only users who actually rated this movie can contribute. Previously the
    # top-k users were chosen first and unrated (NaN) entries were dropped from
    # the numerator while their similarity still inflated the denominator,
    # dragging predictions toward 0.
    movie_ratings = train_pivot[movie_id].dropna()
    raters = movie_ratings.index.intersection(user_sim_df.index)
    # The target user is trivially the most similar to themselves; leave them out.
    raters = raters.difference([user_id])

    similarities = user_sim_df.loc[raters, user_id]
    # Keep positively similar neighbours only, so the weights form a proper
    # weighted average (no zero/negative denominators, result stays in range).
    neighbours = similarities[similarities > 0].nlargest(k)

    if neighbours.empty:
        # No usable neighbour rated the movie: fall back to the movie's mean.
        return movie_count_df.loc[movie_id, 'avg_rating']

    weighted_sum = (neighbours * movie_ratings.loc[neighbours.index]).sum()
    return weighted_sum / neighbours.sum()

predict_ratings(44587, 4993)

In [42]:
# Create Sample of Test to Test Algorithm
test_samp = test.sample(n=1000, random_state=12)

# Test on sample
# Iterate the two id columns directly instead of a row-wise apply: no per-row
# Series is built and the ids keep their original dtype.
sample_preds = pd.Series(
    [predict_ratings(user_id, movie_id)
     for user_id, movie_id in zip(test_samp['userId'], test_samp['movieId'])],
    index=test_samp.index,
)

test_samp = (
    test_samp
    .assign(
        # Round to the nearest half point.
        rating=(sample_preds * 2).round() / 2,
        # Vectorised "<userId>_<movieId>" key.
        Id=test_samp['userId'].astype('int64').astype(str)
        + '_'
        + test_samp['movieId'].astype('int64').astype(str),
    )[['Id', 'rating']]
    .set_index('Id')
)

test_samp

In [43]:
# Apply Algorithm1 to Test Set
submission_df = test.copy()

# Score every (userId, movieId) pair; iterating the two columns directly
# avoids building a row Series per rating as a row-wise apply would.
submission_df['pred'] = [
    predict_ratings(user_id, movie_id)
    for user_id, movie_id in zip(submission_df['userId'], submission_df['movieId'])
]

# Guard the submission: a missing prediction falls back to the global mean, and
# the rounded value is kept inside the range of ratings observed in train.
rating_min, rating_max = train['rating'].min(), train['rating'].max()
submission_df['rating'] = (
    (submission_df['pred'].fillna(avg_rating_in_dataset) * 2).round() / 2
).clip(lower=rating_min, upper=rating_max)

# Vectorised "<userId>_<movieId>" key.
submission_df['Id'] = (
    submission_df['userId'].astype('int64').astype(str)
    + '_'
    + submission_df['movieId'].astype('int64').astype(str)
)

# 'Id' stays a regular column (not the index) so it is written by to_csv(index=False).
submission_df = submission_df[['Id', 'rating']]

submission_df.head()

In [44]:
# Sanity check: the row count should match `test` (submission_df starts as a copy of it).
submission_df.shape

In [45]:
# Write the submission (columns Id, rating) to the current directory without the DataFrame index.
submission_df.to_csv('JO_Sub_7.csv', index=False)