## Baseline Method for Movie Recommendation System

In [1]:
import pandas as pd
import numpy as np

### Load Dataset

In [2]:
ratings = pd.read_csv('./data/ml-small/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
ratings = ratings.drop(['timestamp'],axis=1)

In [4]:
movies = pd.read_csv('processed_data/movies_new.csv')
movies.head()

Unnamed: 0,movieId,title,genres,num_ratings,avg_rating,year,weighted_avg
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.92093,1995.0,3.787796
1,2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818,1995.0,3.465027
2,3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615,1995.0,3.418787
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143,1995.0,3.426689
4,5,Father of the Bride Part II (1995),Comedy,49.0,3.071429,1995.0,3.360105


In [5]:
# Stats of ratings by each user.
ratings.groupby('userId').size().describe()

count     610.000000
mean      165.304918
std       269.480584
min        20.000000
25%        35.000000
50%        70.500000
75%       168.000000
max      2698.000000
dtype: float64

### Data Partitioning

#### Method 1: Leave-one-out
Leave-one-out is withhold one rated item from each user for evaluation. This train/test split is better used to evaluate users who are already in the system.

In [6]:
def leave_one_out(df): # ratings dataframe, cols: userId, movieId, rating
    user_names = df.userId.unique()
    test = pd.DataFrame()
    for user in user_names:
        test = test.append(df[df['userId']==user].sample(n=1))
    train = df.loc[~df.index.isin(test.index)]
    return train,test

# Uncomment below if use this method
# train,test = leave_one_out(ratings)
# print(test.info())

#### Method 2: Holdout: m-fold cross validation
Suggested number of fold is 10. This technique is suitable to evaluate capability of the model dealing with new users.

In [7]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 10, shuffle = True, random_state=1234)
for train_idx,test_idx in kf.split(ratings):
    train = ratings.iloc[train_idx]
    test = ratings.iloc[test_idx]

### Baseline Method
The baseline method used here is basic linear regression. Features (X) used are user average rating and weighted movie rating. And predictions(y) are user's rating for each movie.

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#User Rating Average Matrix
user_avg = ratings.groupby('userId').mean().rating.values
#Movie Rating Average Matrix
movie_avg = movies[['movieId','weighted_avg']]

def data_split(idx):
    data = ratings.iloc[idx]
    X = data[['userId','movieId']]
    X['u_bias'] = X.apply(lambda x: user_avg[x['userId']-1], axis=1)
    X = X.drop(['userId'],axis=1)
    X = pd.merge(X,movie_avg,on='movieId')
    X = X.drop(['movieId'],axis=1).values
    y = data.rating.values
    return (X,y)

RMSEs = []
for train_idx,test_idx in kf.split(ratings):
    train_X,train_y = data_split(train_idx)
    test_X, test_y = data_split(test_idx)    
    clf = LinearRegression()
    clf.fit(train_X,train_y)
    y_pred = clf.predict(test_X)
    rmse = np.sqrt(mean_squared_error(y_pred,test_y))
    RMSEs.append(rmse)
    print('RMSE: ',rmse)
print('The average of rmse is', np.mean(RMSEs))

RMSE:  1.042592009041935
RMSE:  1.0504471358505316
RMSE:  1.0386264206084166
RMSE:  1.049384149259346
RMSE:  1.0522461872327022
RMSE:  1.0443170119984602
RMSE:  1.024050141009059
RMSE:  1.0385977498056458
RMSE:  1.0399070153938026
RMSE:  1.0448155066265055
The average of rmse is 1.0424983326826405
