In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import datetime
from dateutil.parser import parse
from tqdm import tqdm
from tqdm import trange
import sys
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [47]:
def mae(pred, true):
    return np.mean(np.abs(pred - true))

In [3]:
######################################
#    DOWNLOAD DATA
######################################

# https://drive.google.com/drive/folders/1Xt9EcMG64I_2y_fxu4d5Zr-qo2VlwiOP?usp=sharing

In [5]:
data = pd.read_csv('data/data.csv')

In [11]:
details = pd.read_csv('data/movie_details.csv')

In [12]:
data.head()

Unnamed: 0.1,Unnamed: 0,MovieID,UserID,Rating,Rated Date,Released Date,Title,Released Since,show_id
0,0,30,2173336,5,2004-06-22,2003,Something's Gotta Give,538 days,s8056
1,1,30,2473170,5,2004-05-22,2003,Something's Gotta Give,507 days,s8056
2,2,30,900816,3,2005-07-08,2003,Something's Gotta Give,919 days,s8056
3,3,30,1990901,4,2004-05-24,2003,Something's Gotta Give,509 days,s8056
4,4,30,662337,4,2005-07-08,2003,Something's Gotta Give,919 days,s8056


In [13]:
details.head()

Unnamed: 0.1,Unnamed: 0,show_id,Title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,0,s8056,Something's Gotta Give,Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",United States,"August 1, 2019",2003,PG-13,128 min,"Comedies, Romantic Movies","Still sexy at 60, Harry Sanborn wines and dine..."
1,118413,s6642,Dragonheart,Rob Cohen,"Sean Connery, Dennis Quaid, David Thewlis, Pet...",United States,"January 1, 2020",1996,PG-13,103 min,"Action & Adventure, Sci-Fi & Fantasy",In ancient times when majestic fire-breathers ...
2,135818,s568,Congo,Frank Marshall,"Dylan Walsh, Laura Linney, Ernie Hudson, Tim C...",United States,"July 1, 2021",1995,PG-13,108 min,"Action & Adventure, Thrillers","Eight people, some with ulterior motives, go o..."
3,149416,s601,The Game,David Fincher,"Michael Douglas, Sean Penn, Deborah Kara Unger...",United States,"July 1, 2021",1997,R,129 min,Thrillers,An aloof investment banker's life spirals into...
4,187778,s8111,Stuart Little 2,Rob Minkoff,"Michael J. Fox, Geena Davis, Hugh Laurie, Jona...",United States,"January 1, 2020",2002,PG,78 min,"Children & Family Movies, Comedies",Zany misadventures are in store as lovable cit...


In [23]:
data['Released Since'] = pd.to_datetime(data['Rated Date']) - pd.to_datetime(data['Released Date'], format='%Y')

# * NOTE * 5% (70,5564/14,111,287) of data is used below since Data has 14M rows to reduce runtimes/failures

# Baseline 1

In [15]:
from tqdm import tqdm

In [22]:
def mae(pred, true):
    return np.mean(np.abs(pred - true))
def rmse(pred, true):
    return np.sqrt(((pred - true) ** 2).mean())

In [23]:
from sklearn.model_selection import train_test_split

seed = 20
#working with half to reduce runtimes/failures
small = data.sample(frac=0.05, replace=False, random_state=seed)


# 60 / 20 / 20 split
X_train, X_test, y_train, y_test = train_test_split(small,
                    small.Rating, test_size=0.4, random_state=seed)

X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test,
                                                   test_size=0.5, random_state=seed)

In [18]:
X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape, y_test.shape

((423338, 9), (141113, 9), (141113, 9), (423338,), (141113,), (141113,))

In [25]:
#baseline 1
global_mean = X_train.Rating.mean()
y_pred1 = [global_mean for _ in y_valid]
rmse1 = rmse(y_valid, y_pred1)
mae1 = mae(y_valid, y_pred1)
print("Global mean RMSE:" + str(rmse1))
print("Global mean MAE:" + str(mae1))

Global mean RMSE:1.0549462083557153
Global mean MAE:0.8810598653527241


# Baseline 2

In [26]:
#baseline2
ratings_per_user = defaultdict(list)
ratings_per_movie = defaultdict(list)
movies_per_user= defaultdict(set)
users_per_movie = defaultdict(set)
for ind, row in tqdm(X_train.iterrows(), total=X_train.shape[0]):
    ratings_per_user[row.UserID].append(row.Rating)
    ratings_per_movie[row.MovieID].append(row.Rating)
    movies_per_user[row.UserID].add(row.MovieID)
    users_per_movie[row.MovieID].add(row.UserID)

100%|██████████| 423338/423338 [00:21<00:00, 19244.30it/s]


In [27]:
avg_user_rating = {u: (sum(ratings_per_user[u])/len(ratings_per_user[u]))
    if len(ratings_per_user[u]) > 0 else 0 for u in ratings_per_user}

In [28]:
avg_ratings_per_user = {}
for u in ratings_per_user:
    avg_ratings_per_user[u] = np.mean(ratings_per_user[u])

In [80]:
dev_per_user = {u: (sum(ratings_per_user[u])/len(ratings_per_user[u])) - global_mean
    if len(ratings_per_user[u]) > 0 else 0 for u in ratings_per_user}

dev_per_movie = {m: (sum(ratings_per_movie[m])/len(ratings_per_movie[m])) - global_mean 
    if len(ratings_per_movie[m]) > 0 else 0 for m in ratings_per_movie}

def baseline_2_user(row):
    movie = row.MovieID
    user = row.UserID
    out = global_mean
    if user in dev_per_user.keys():
        out += dev_per_user[user]
    return out

def baseline_2_movie(row):
    movie = row.MovieID
    out = global_mean
    if movie in dev_per_movie.keys():
        out += dev_per_movie[movie]
    return out

y_pred_user = []
y_pred_movie = []
for ind, row in X_valid.iterrows():
    y_pred_user.append(baseline_2_user(row))
    y_pred_movie.append(baseline_2_movie(row))

rmse_user = rmse(y_valid, y_pred_user)
rmse_movie = rmse(y_valid, y_pred_movie)

mae_user = mae(y_valid, y_pred_user)
mae_movie = mae(y_valid, y_pred_movie)


print("Global mean with user devations RMSE:" + str(rmse_user))
print("Global mean with user deviations MAE:" + str(mae_user))
print("Global mean with movie devations RMSE:" + str(rmse_movie))
print("Global mean with movie deviations MAE:" + str(mae_movie))

Global mean with user devations RMSE:1.1616527467847417
Global mean with user deviations MAE:0.9001950895586495
Global mean with movie devations RMSE:1.0003645740883667
Global mean with movie deviations MAE:0.8030538466243965


# Baseline 3

In [31]:
mean_per_movie = {m: (sum(ratings_per_movie[m])/len(ratings_per_movie[m])) 
    if len(ratings_per_movie[m]) > 0 else 0 for m in ratings_per_movie}

mean_per_user = {m: (sum(ratings_per_movie[m])/len(ratings_per_movie[m])) 
    if len(ratings_per_movie[m]) > 0 else 0 for m in ratings_per_movie}



def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

max_sims = []

def predict_rating(row,thresh):
    ratings = []
    sims = []
    max_sim = 0
    movie = row.MovieID
    user = row.UserID
    rating = row.Rating
    for m in movies_per_user[user]:
        if m == movie: continue
        ratings.append(rating - mean_per_movie[m])
        sim = Jaccard(users_per_movie[movie],users_per_movie[m])
        if sim > max_sim:
            max_sim = sim
            
        sims.append(sim)
    max_sims.append(max_sim)
    if (max_sim > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,sims)] #NEW TEMPORAL WEIGHT GOES HERE
        return mean_per_movie[movie]  + sum(weightedRatings) / sum(sims)
    else:
        return baseline_2_movie(row)

#for tresh in np.arange(0, 2, .005):
tresh = 0.000
y_pred3 = [predict_rating(row,tresh) for ind, row in tqdm(X_valid.iterrows(), total=X_valid.shape[0], position=0)]
rmse3 = rmse(y_valid, y_pred3)
mae3 = mae(y_valid, y_pred3)
print(f"User similarity (tresh = {tresh}) baseline RMSE:" + str(rmse3))
print(f"User similarity (tresh = {tresh}) baseline MAE:" + str(mae3))



100%|██████████| 141113/141113 [00:49<00:00, 2842.83it/s]

User similarity (tresh = 0.0) baseline MSE:0.6264050965178989
User similarity (tresh = 0.0) baseline MAE:0.44898433351154887





### 11/28 7:30AM - added
### Temporal Models

# Baseline 5

1. Moving average window by movie.

In [79]:
#trivial linear regressor with day month and year
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
samp = data.sample(n=500000)
train, test = train_test_split(samp, test_size=0.25)
x_train = pd.to_datetime(train['Rated Date'])
x_test = pd.to_datetime(test['Rated Date'])
y_train = train.Rating
y_test = test.Rating

def just_time(date):
    day = date.day
    month = date.month
    year = date.year
    return [day] + [month] + [year]
train_dates = [just_time(date) for date in x_train]
test_dates = [just_time(date) for date in x_test]
model = LinearRegression()
model.fit(train_dates, y_train)
preds = model.predict(test_dates)
print(rmse(preds,y_test))



1.048393112636199


In [34]:
from datetime import datetime
from sklearn.linear_model import LinearRegression
import pickle

In [35]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,MovieID,UserID,Rating,Rated Date,Released Date,Title,Released Since,show_id
6328647,6552913,8669,552382,4,2005-08-08,1996,Executive Decision,3507 days,s6720
5754832,5858377,7509,374423,3,2003-10-22,2002,Die Another Day,659 days,s6610
8914216,9177702,11781,2207196,4,2004-01-02,1984,Indiana Jones and the Temple of Doom,7306 days,s7074
7484300,7747786,10231,2550125,1,2005-11-06,2004,Kung Fu Hustle,675 days,s7240
373631,384234,457,1424596,5,2004-08-23,2004,Kill Bill: Vol. 2,235 days,s7199


In [36]:
ratings_per_user_stamped = defaultdict(list)
ratings_per_movie_stamped = defaultdict(list)
for i, row in X_train.iterrows():
    # rating ID, rated date, rating
    ratings_per_user_stamped[row['UserID']].append([i, datetime.strptime(row['Rated Date'], "%Y-%m-%d"), row.Rating])
    ratings_per_movie_stamped[row['MovieID']].append([i, datetime.strptime(row['Rated Date'], "%Y-%m-%d"), row.Rating])

# sort by rated date
for user, stamp in ratings_per_user_stamped.items():
    stamp.sort(key=lambda val: val[1], reverse=True)
    
for user, stamp in ratings_per_movie_stamped.items():
    stamp.sort(key=lambda val: val[1], reverse=True)

In [40]:
# 1) Predict user rating by previous x timesteps of that movie.
# If previous rating doesn't exist, return mean of that movie.

# keep track of ratings per users with rated date
# to predict rating of the user, look at the timesteps from before of that user.

def movie_window_features(stamps, window_size, mean):
    # stamps: [ [ratingID, ratedDate, Rating]]
    # if not enough window size, use mean of the movie
    features = [mean] * window_size
    for i in range(min(window_size, len(stamps))):
        features[i] = stamps[i][2]
    return features        
    
def predict_rating(ind, row, window_size):
    movie = row.MovieID
    movie_stamps = ratings_per_movie_stamped[movie]
    
    if movie in mean_per_movie.keys():
        movie_mean = mean_per_movie[movie]
    else:
        movie_mean = global_mean

    features = [movie_mean] * window_size
    rated_date = datetime.strptime(row['Rated Date'], "%Y-%m-%d")

    for i, stamp in enumerate(movie_stamps):
        if stamp[1] <= rated_date:
            features = movie_window_features(movie_stamps[i: i+ind], window_size, movie_mean)
            break
    return features
        
window_size = 10

model = LinearRegression()

X_features = []

for ind, row in tqdm(X_train.iterrows(), total=X_train.shape[0]):
    X_features.append(predict_rating(ind, row, window_size))

model.fit(X_features, y_train)

# train score
print("Sliding window training RMSE:", rmse(y_train, model.predict(X_features)))
print("Sliding window training MAE:", mae(y_train, model.predict(X_features)))

100%|██████████| 423338/423338 [02:43<00:00, 2596.64it/s]


Sliding window training RMSE: 0.8595499925022362
Sliding window training MAE: 0.6671385837891767


100%|██████████| 423338/423338 [02:47<00:00, 2534.64it/s]


Sliding window training RMSE: 0.8595155114284472
Sliding window training MAE: 0.6670961340606323


 68%|██████▊   | 286582/423338 [01:54<00:54, 2510.74it/s]


KeyboardInterrupt: 

In [230]:
mse5 = []
mae5 = []
for i, (ind, row) in enumerate(tqdm(X_valid.iterrows(), total=X_valid.shape[0])):
    features = predict_rating(ind, row, window_size)
    pred = model.predict([features])
    mse5.append(mse(pred, y_valid.iloc[[i]]))
    mae5.append(mae(pred, y_valid.iloc[[i]]))

print("Baseline 5 valid MSE:", np.mean(mse5))
print("Baseline 5 valid MAE:", np.mean(mae5))

100%|███████████████████████████████████████████| 141113/141113 [01:52<00:00, 1257.39it/s]

Baseline 5 valid MSE: 1.226954132557433
Baseline 5 valid MAE: 0.8837190580624726





## Results of baseline 5

| window_size | Train MSE | Train MAE | Valid MSE | Valid MAE |
| --- | --- | --- | --- | --- |
| 1 | 0.8358 | 0.7070 | 1.2561 | 0.9045 |
| 3 | 0.7541 | 0.6724 | 1.2536 | 0.8948 |
| 5 | 0.7424 | 0.6685 | 1.2386 | 0.8884 |
| 10 | 0.7390 | 0.6673 | 1.226 | 0.8837 |



In [None]:
    
    
    
#     user = row.UserID
#     user_stamps = ratings_per_user_stamped[user]
#     print(user_stamps)
    
# y_pred5 = [predict_rating(ind, row,tresh) for ind, row in tqdm(X_valid.iterrows(), total=X_valid.shape[0], position=0)]

# mse5 = mse(y_valid, y_pred5)
# mae5 = mae(y_valid, y_pred5)
# print(f"Window size = {window_size}) baseline 5 MSE:" + str(mse5))
# print(f"Window size = {window_size}) baseline 5 MAE:" + str(mae5))

In [148]:
X_train.head()

Unnamed: 0.1,Unnamed: 0,MovieID,UserID,Rating,Rated Date,Released Date,Title,Released Since,show_id,month,day_of_month,year
6328647,6552913,8669,552382,4,2005-08-08,1996,Executive Decision,3507 days,s6720,8,8,2005
5754832,5858377,7509,374423,3,2003-10-22,2002,Die Another Day,659 days,s6610,10,22,2003
8914216,9177702,11781,2207196,4,2004-01-02,1984,Indiana Jones and the Temple of Doom,7306 days,s7074,1,2,2004
7484300,7747786,10231,2550125,1,2005-11-06,2004,Kung Fu Hustle,675 days,s7240,11,6,2005
373631,384234,457,1424596,5,2004-08-23,2004,Kill Bill: Vol. 2,235 days,s7199,8,23,2004


### 11/28 7:30AM - added
### Factorized Machines Models

# Baseline 6

1. Latent factor models
    - 


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

data = pd.read_csv('data/data.csv')
details = pd.read_csv('data/movie_detials.csv')
seed = 20
#working with half to reduce runtimes/failures
small = data.sample(frac=0.05, replace=False, random_state=seed)


# 60 / 20 / 20 split
X_train, X_test, y_train, y_test = train_test_split(small,
                    small.Rating, test_size=0.4, random_state=seed)

X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test,
                                                   test_size=0.5, random_state=seed)


In [33]:
# 1) latent factor model

# ! pip install scikit-surprise
from surprise import SVD, Reader, Dataset
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 5))

dataset = small.reset_index(drop=True)
dataset.rename(columns={'UserID': "userID", "MovieID":"itemID", "Rating":"rating"}, inplace=True)
dataset = Dataset.load_from_df(dataset[['userID', 'itemID', 'rating']], reader)
# trainset, testset = train_test_split(dataset, test_size=0.25)

model = SVD()
# model.fit(trainset)

# train_pred = model.test(trainset)
# mse6_train = mse(np.array([p.est for p in train_pred]), np.array([t[2] for t in trainset]))
# mae6_train = mae(np.array([p.est for p in train_pred]), np.array([t[2] for t in trainset]))

# print("MSE6 train:", mse6)
# print("MAE6 trian:", mae6)

# predictions = model.test(testset)

# mse6_valid = mse(np.array([p.est for p in predictions]), np.array([t[2] for t in testset]))
# mae6_valid = mae(np.array([p.est for p in predictions]), np.array([t[2] for t in testset]))

# print("MSE6 test:", mse6)
# print("MAE6 test:", mae6)

cross_validate(model, dataset, measures=["MSE", "MAE"], cv=5, verbose=True)

Evaluating MSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MSE (testset)     0.9929  0.9934  1.0012  0.9927  0.9946  0.9950  0.0032  
MAE (testset)     0.7942  0.7925  0.7964  0.7930  0.7952  0.7943  0.0014  
Fit time          6.46    6.15    6.37    6.38    6.45    6.36    0.11    
Test time         1.04    0.48    0.48    1.00    0.47    0.69    0.27    


{'test_mse': array([0.99294404, 0.9934367 , 1.00124265, 0.99271232, 0.99463695]),
 'test_mae': array([0.79423931, 0.79250555, 0.79639337, 0.79297743, 0.795157  ]),
 'fit_time': (6.458282470703125,
  6.151134490966797,
  6.373811960220337,
  6.380130290985107,
  6.453195571899414),
 'test_time': (1.0447471141815186,
  0.48354530334472656,
  0.4750077724456787,
  1.0015232563018799,
  0.4651298522949219)}

In [None]:
# Evaluating MSE, MAE of algorithm SVD on 5 split(s).

#                   Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
# MSE (testset)     0.9929  0.9934  1.0012  0.9927  0.9946  0.9950  0.0032  
# MAE (testset)     0.7942  0.7925  0.7964  0.7930  0.7952  0.7943  0.0014  
# Fit time          6.46    6.15    6.37    6.38    6.45    6.36    0.11    
# Test time         1.04    0.48    0.48    1.00    0.47    0.69    0.27    
# {'test_mse': array([0.99294404, 0.9934367 , 1.00124265, 0.99271232, 0.99463695]),
#  'test_mae': array([0.79423931, 0.79250555, 0.79639337, 0.79297743, 0.795157  ]),
#  'fit_time': (6.458282470703125,
#   6.151134490966797,
#   6.373811960220337,
#   6.380130290985107,
#   6.453195571899414),
#  'test_time': (1.0447471141815186,
#   0.48354530334472656,
#   0.4750077724456787,
#   1.0015232563018799,
#   0.4651298522949219)}

In [34]:
# SVD doesn't perform as well, which is expected.
# SVD is not great at cold start problems because it only uses userID, movieID, to predict rating.
# Since our dataset is sparse and additional features are not used, values from SVD for new users and items 
# revert to zero because of the regularizer.

In [107]:
# TODO: show that data is sparse to explain

# Baseline 7

## Factorization machines

### extends latent-factor

- 1) User and Movie ID
- 2) User + MovieID + Genre + release Year

In [81]:
# https://github.com/ibayer/fastFM
from fastFM import als
from scipy.spatial import distance
from scipy.sparse import lil_matrix

In [82]:
# basline 6
userIDs,itemIDs = {},{}

for idx, d in tqdm(small.iterrows(), total=len(small)):
    u,i = d.UserID, d.MovieID
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    
nUsers,nItems = len(userIDs),len(itemIDs)

100%|██████████████████████████████████████████| 705564/705564 [00:21<00:00, 32878.56it/s]


In [110]:
medium.release_year.min()

1942

In [83]:
nUsers, nItems

(259040, 633)

In [37]:
X = lil_matrix((small.shape[0], nUsers+nItems))

print(X.shape)

for i in trange(small.shape[0]):
    user = userIDs[small.iloc[i].UserID]
    item = itemIDs[small.iloc[i].MovieID]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item

(705564, 259673)

In [70]:
seed = 20

y = small.Rating

fm = als.FMRegression(n_iter=500, init_stdev=0.1, rank=5, random_state=seed, l2_reg_w=1, l2_reg_V=1)


# 80 / 10 / 10 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=seed)
fm.fit(X_train, y_train)

In [71]:
y_pred_train = fm.predict(X_train)
print("Baseline train 7 FM: RMSE:", np.sqrt(mse(y_pred_train, y_train)))
print("Baseline train 7 FM: MAE:", mae(y_pred_train, y_train))

y_pred_valid = fm.predict(X_valid)
print("Baseline valid 7 FM: RMSE:", np.sqrt(mse(y_pred_valid, y_valid)))
print("Baseline valid 7 FM: MAE:", mae(y_pred_valid, y_valid))

Baseline train 7 FM: RMSE: 0.19859783123430458
Baseline train 7 FM: MAE: 0.0962419354018597
Baseline valid 7 FM: RMSE: 1.3598246989410048
Baseline valid 7 FM: MAE: 1.0321521189169733


In [190]:
# Baseline train 7 FM: RMSE: 0.19859783123430458
# Baseline train 7 FM: MAE: 0.0962419354018597
# Baseline valid 7 FM: RMSE: 1.3598246989410048
# Baseline valid 7 FM: MAE: 1.0321521189169733

2) User ID + MovieID + Genre

In [111]:
# baseline 7b
medium = small.merge(details, on="show_id")

In [145]:
small
show_id = 's562'
details[details['show_id'] == 's562']

Unnamed: 0.1,Unnamed: 0,show_id,Title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
222,5539372,s562,Austin Powers in Goldmember,Jay Roach,"Mike Myers, Beyoncé Knowles-Carter, Seth Green...",United States,"July 1, 2021",2002,PG-13,94 min,"Action & Adventure, Comedies",The world's most shagadelic spy continues his ...


In [150]:
len(genres_series)

19

In [187]:
minYear = details.release_year.min()
maxYear = details.release_year.max()
nYears = maxYear - minYear + 1

genres_series = pd.DataFrame(details.listed_in.str.split(',').tolist()).stack().str.strip().value_counts()
nGenres = len(genres)

X = lil_matrix((medium.shape[0], nUsers+nItems+nYears+nGenres))

for i, row in tqdm(medium.iterrows(), total=medium.shape[0]):
    user = userIDs[row.UserID]
    item = itemIDs[row.MovieID]
    year = row.release_year - minYear
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers+item + year] = 1
    
    genres = row.listed_in.split(',')
    for g in genres:
        g_idx = genres_series.index == g.strip()
        X[i, nUsers+nItems+g_idx] = 1

100%|███████████████████████████████████████████| 705564/705564 [02:15<00:00, 5212.59it/s]


In [188]:
seed = 20

y = small.Rating

fm = als.FMRegression(n_iter=500, init_stdev=0.1, rank=5, random_state=seed, l2_reg_w=1, l2_reg_V=1)


# 80 / 10 / 10 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=seed)
fm.fit(X_train, y_train)

In [189]:
y_pred_train = fm.predict(X_train)
print("Baseline train 7b FM: RMSE:", np.sqrt(mse(y_pred_train, y_train)))
print("Baseline train 7b FM: MAE:", mae(y_pred_train, y_train))

y_pred_valid = fm.predict(X_valid)
print("Baseline valid 7b FM: RMSE:", np.sqrt(mse(y_pred_valid, y_valid)))
print("Baseline valid 7b FM: MAE:", mae(y_pred_valid, y_valid))

Baseline train 7b FM: RMSE: 0.24434819514982983
Baseline train 7b FM: MAE: 0.11038932534199308
Baseline valid 7b FM: RMSE: 1.5388583476656417
Baseline valid 7b FM: MAE: 1.1750044285470544


In [None]:
# Baseline train 7b FM: RMSE: 0.24434819514982983
# Baseline train 7b FM: MAE: 0.11038932534199308
# Baseline valid 7b FM: RMSE: 1.5388583476656417
# Baseline valid 7b FM: MAE: 1.1750044285470544

# Testset metrics

In [None]:
# Plot "cold start for users"
