In [1]:
import numpy as np
from numba import jit
import pandas as pd
from pyclustering.utils.metric import type_metric, distance_metric
from pyclustering.cluster.kmedoids import kmedoids
from tqdm import tqdm
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

## Data Manipulation Result

`n_c`: number of customers in the dataset.  
`n_m`: number of movies in the dataset.  
`n`: number of ratings.   
`target`: dataset with size $n_c\times n_m$.  
`df_probe`: validation dataset with size $100\times 3$.  
`df_target`: training dataset with size $n\times 3$.  
`df_movie`: movie dataset containing information about genre, director, country etc.

In [2]:
target = pd.read_csv('./Data/target.csv').set_index('CustomerID')
# change the index of target.T from str to float
target_T = target.T.copy()
target_T.index = target_T.index.astype(float)
target = target_T.T.copy()
df_probe = pd.read_csv('./Data/df_probe.csv')
df_target = pd.read_csv('./Data/df_target.csv')
df_movie = pd.read_csv('./Data/df_movie.csv')

## Clustering Methods

### Important functions

self-defined metric

In [3]:
@jit
def metric_array(x,y):
    z = np.count_nonzero(~(np.isnan(x) + np.isnan(y)))
    d1 = np.nansum(abs(x - y))/z
    d2 = 1 - z/np.count_nonzero(~(np.isnan(x) & np.isnan(y)))
    d = d1 + d2*4
    if np.isnan(d):
        return 8
    else:
        return d

k-medoids based on self-defined metric

In [4]:
def kmedoids_self(data, centers, k, itermax = 100):
    metric = distance_metric(type_metric.USER_DEFINED, func = metric_array)
    kmedoids_instance = kmedoids(np.array(data), centers, metric = metric, itermax = itermax)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()
    return [medoids, clusters]

calculate clustering MSE

In [5]:
# clustering MSE
def MSE_clustering(target = target, cluster = None, labels = None, bi = False, df_probe = df_probe):
    mse = 0
    for ind in range(df_probe.shape[0]):
        CID = df_probe.iloc[ind,0]
        MID = df_probe.iloc[ind,1]
        movie_index = list(target.T.index).index(MID)
        customer_index = list(target.index).index(CID)
        if labels is not None:
            if bi:
                rls = labels[0]
                cls = labels[1]
                rating_predict = np.mean(target_cc[rls == rls[customer_index],:][:,cls == cls[movie_index]])
            else:
                rating_predict = np.mean(target.loc[labels == labels[customer_index], :].iloc[:, movie_index])
        if cluster is not None:
            customer_list = (np.array(cluster)[[customer_index in i for i in cluster]]).tolist()[0]
            rating_predict = np.mean(target.iloc[customer_list, movie_index])
            if np.isnan(rating_predict):
                rating_predict = impute_cm[customer_index, movie_index]
        mse += (float(rating_predict) - list(df_probe.Rating)[ind])**2
    return mse/len(df_probe)

### Important datasets

`impute_cm`: a dataset whose value is produced by customer mean.  
`impute_cc`: a dataset whose value is produced by customer clustering using self-defined metric (which has the best performance).  
`target_cm`: impute the missing values of `target` dataset with customer mean.  
`target_cc`: impute the missing values of `target` dataset with customer clustering result using self-defined metric.

#### impute with customer mean dataset

In [6]:
impute_cm = np.zeros(target.shape)
for i in range(target.shape[0]):
    impute_cm[i,:] = np.mean(target.iloc[i,:])

In [7]:
mask = np.isnan(target)
target_cm = np.array(target.copy())
target_cm[mask] = impute_cm[mask]

#### impute with customer clustering dataset

In [8]:
k = 35
np.random.seed(4)
initial_centers = np.random.choice(target.shape[0], k)
medoid, cluster = kmedoids_self(target, initial_centers, k)

In [9]:
impute_cc = np.zeros(target.shape)
with tqdm(total = target.shape[1]) as pbar:
    for i in range(target.shape[1]):
        for j in cluster:
            fill = np.mean(target.iloc[j, i])
            if np.isnan(fill):
                fill = np.mean(target.iloc[:, i])
            impute_cc[j,i] = fill
        pbar.update(1)

100%|██████████| 2784/2784 [00:53<00:00, 52.34it/s]


In [10]:
# target_cc will be a matrix with missing element imputed by Customer Clustering result
mask = np.isnan(target)
target_cc = np.array(target.copy())
target_cc[mask] = impute_cc[mask]

### Customer Mean

MSE

In [11]:
mse_cm = 0
Rating_probe = list(df_probe.Rating)
customer_mean_pred = []
for i,j in enumerate(df_probe.CustomerID):
    rating_predict = np.mean(target.iloc[target.index == j,:], axis = 1)
    customer_mean_pred.append(float(rating_predict))
    mse_cm += (float(rating_predict) - Rating_probe[i])**2

mse_cm/len(df_probe)

0.7439401388642582

### Movie Mean

MSE

In [12]:
mse_mm = 0
movie_mean_pred = []
for i,j in enumerate(df_probe.MovieID):
    rating_predict = np.mean(target.T.iloc[target.T.index == j,:], axis = 1)
    movie_mean_pred.append(float(rating_predict))
    mse_mm += (float(rating_predict) - Rating_probe[i])**2
    
mse_mm/len(df_probe)

0.8058984788400783

### Customer Clustering

MSE

In [13]:
MSE_clustering(cluster = cluster)

0.6226860328316228

#### cosine similarity (imputed with customer mean)

In [14]:
# cosine similarity
clustering = KMedoids(n_clusters = 35, metric = 'cosine', init = 'k-medoids++', random_state = 0).fit(target_cm)
labels = clustering.labels_

MSE

In [15]:
MSE_clustering(labels = labels)

0.753725527005934

#### Euclidean distance (imputed with customer mean)

In [16]:
# euclidean distance
kmeans = KMeans(n_clusters = 35, init = 'k-means++', random_state = 0).fit(target_cm)
labels = kmeans.labels_

MSE

In [17]:
MSE_clustering(labels = labels)

0.6926533889317608

### Movie Clustering

In [18]:
genre_set = []
for gen in set(df_movie.genre):
    genre_set.append(list(df_movie[df_movie.genre == gen].MovieID))

MSE

In [19]:
mse_mc = 0
num = 0
for ind in range(df_probe.shape[0]):
    CID = df_probe.iloc[ind,0]
    MID = df_probe.iloc[ind,1]
    movie_list = np.array((np.array(genre_set)[[MID in i for i in genre_set]]).tolist()[0])
    rate_series = target.T.loc[movie_list[[i in target.T.index for i in movie_list]],:].loc[:,CID]
    if np.sum(~np.isnan(rate_series)) == 1:
        rating_predict = customer_mean_pred[ind]
        num += 1
    else:
        rating_predict = np.mean(rate_series)
    mse_mc += (rating_predict - list(df_probe.Rating)[ind])**2

mse_mc/len(df_probe)

0.7158278417272456

## Memory-based Collaborative Filtering

https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/

In [20]:
from sklearn.metrics import mean_squared_error

#### impute with customer mean

In [21]:
target_cf_impute_train = np.array(target_cm)
target_cf_impute_train[[list(target.index).index(CID) for CID in df_probe['CustomerID']],\
                       [list(target.T.index).index(MID) for MID in df_probe['MovieID']]] = impute_cm[[list(target.index).index(CID) for CID in df_probe['CustomerID']],\
                                                                                                    [list(target.T.index).index(MID) for MID in df_probe['MovieID']]]
target_cf_impute_test = np.zeros(target.shape)
target_cf_impute_test[[list(target.index).index(CID) for CID in df_probe['CustomerID']],\
                      [list(target.T.index).index(MID) for MID in df_probe['MovieID']]] = df_probe['Rating']

In [22]:
# these functions are based on https://www.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon: small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

def predict_fast_simple(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    
def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [23]:
user_similarity = fast_similarity(target_cf_impute_train, kind='user')
item_similarity = fast_similarity(target_cf_impute_train, kind='item')

item_prediction = predict_fast_simple(target_cf_impute_train, item_similarity, kind='item')
user_prediction = predict_fast_simple(target_cf_impute_train, user_similarity, kind='user')

print('User-based CF MSE:', get_mse(user_prediction, target_cf_impute_test))
print('Item-based CF MSE:', get_mse(item_prediction, target_cf_impute_test))

User-based CF MSE: 0.8167100804744857
Item-based CF MSE: 0.7444649163013036


## Matrix Decomposition Methods

In [24]:
from surprise import Reader, Dataset
from surprise import SVD, NMF, accuracy
from surprise.prediction_algorithms.co_clustering import CoClustering
from surprise.model_selection import train_test_split

### Data Preparation

In [25]:
reader = Reader()
df_merged = pd.concat([df_target, df_probe])

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df_merged, reader)
df_train, df_test = train_test_split(data, train_size = df_target.shape[0], test_size = df_probe.shape[0], shuffle = False)

### SVD

In [26]:
algo = SVD(random_state = 0)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(df_train)
predictions = algo.test(df_test)

# Then compute MSE
accuracy.mse(predictions)

MSE: 0.4290


0.4289562606062728

### NMF (Non-negative matrix factorization)

In [27]:
algo = NMF(random_state = 0)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(df_train)
predictions = algo.test(df_test)

# Then compute MSE
accuracy.mse(predictions)

MSE: 0.6499


0.6499286475834373

## Co-clustering

In [28]:
algo = CoClustering(35, 40, random_state = 0)

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(df_train)
predictions = algo.test(df_test)

# Then compute MSE
accuracy.mse(predictions)

MSE: 0.5917


0.59169903719255

## funk-svd
https://github.com/gbolmier/funk-svd

In [29]:
from funk_svd.dataset import fetch_ml_ratings
from funk_svd import SVD
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [30]:
# match the input column names
df_target_rename = df_target.copy()
df_probe_rename = df_probe.copy()
df_probe_rename.columns = ['u_id', 'i_id', 'rating']
df_target_rename.columns = ['u_id', 'i_id', 'rating']

In [31]:
svd = SVD(learning_rate=0.001, regularization=0.005, n_epochs=100,
          n_factors=15, min_rating=1, max_rating=5)
svd.fit(X = df_target_rename, early_stopping = True, shuffle = False)

Preprocessing data...

Epoch 1/100  | took 0.7 sec
Epoch 2/100  | took 0.0 sec
Epoch 3/100  | took 0.0 sec
Epoch 4/100  | took 0.0 sec
Epoch 5/100  | took 0.0 sec
Epoch 6/100  | took 0.0 sec
Epoch 7/100  | took 0.0 sec
Epoch 8/100  | took 0.0 sec
Epoch 9/100  | took 0.0 sec
Epoch 10/100 | took 0.0 sec
Epoch 11/100 | took 0.0 sec
Epoch 12/100 | took 0.1 sec
Epoch 13/100 | took 0.0 sec
Epoch 14/100 | took 0.0 sec
Epoch 15/100 | took 0.0 sec
Epoch 16/100 | took 0.0 sec
Epoch 17/100 | took 0.0 sec
Epoch 18/100 | took 0.0 sec
Epoch 19/100 | took 0.0 sec
Epoch 20/100 | took 0.0 sec
Epoch 21/100 | took 0.0 sec
Epoch 22/100 | took 0.0 sec
Epoch 23/100 | took 0.0 sec
Epoch 24/100 | took 0.0 sec
Epoch 25/100 | took 0.0 sec
Epoch 26/100 | took 0.0 sec
Epoch 27/100 | took 0.0 sec
Epoch 28/100 | took 0.0 sec
Epoch 29/100 | took 0.0 sec
Epoch 30/100 | took 0.0 sec
Epoch 31/100 | took 0.0 sec
Epoch 32/100 | took 0.0 sec
Epoch 33/100 | took 0.0 sec
Epoch 34/100 | took 0.0 sec
Epoch 35/100 | took 0.0 s

<funk_svd.svd.SVD at 0x106ba7d10>

MSE

In [32]:
mean_squared_error(svd.predict(df_probe_rename), df_probe_rename['rating'])

0.6159361076078264