# -------------------------------------------------------------------------#

# 1. Predict ratings using Spectral Clustering
https://towardsdatascience.com/unsupervised-machine-learning-spectral-clustering-algorithm-implemented-from-scratch-in-python-205c87271045

https://medium.com/@amelie_yeh/singular-value-decomposition-low-rank-approximation-5d867bf67404

# -------------------------------------------------------------------------#

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds
from scipy.linalg import svd
import matplotlib.pyplot as plt
import time

from numpy import linalg as LA
from scipy.sparse import linalg
from scipy.linalg import eig as LAeig

from scipy.sparse.linalg import eigsh

from sklearn.metrics.pairwise import sigmoid_kernel, cosine_similarity
from sklearn.cluster import SpectralClustering, KMeans, MiniBatchKMeans


In [5]:
class arguments():
    # Set Path
    DATAPATH = '../../datasets/'
    RESULTPATH = '../results/'
    metadata = 'False'
    fillnan='mean_col'
    sim_method='cosine_similarity'
    norm_laplacian_k=5
    normalize_laplacian='False'
    kmeans_k=5
    n_epochs=10
    test_prc=0.25
    graph_nodes='M'
    
args = arguments()

In [9]:
"""
Reading and preprocessing data
"""
import numpy as np
import pandas as pd
import pickle
import time

def read_preprocss_data(args):
    time_start=time.time()
    # args.DATAPATH = '../datasets/'
    train = pd.read_csv(args.DATAPATH + 'train.csv')
    test = pd.read_csv(args.DATAPATH + 'test.csv')

    train.columns = ['movie_id', 'customer_id', 'rating', 'date']
    test.columns  = ['movie_id', 'customer_id', 'rating', 'date']

    df = train.pivot_table(index='customer_id', \
                               columns='movie_id', values='rating', aggfunc=np.mean).fillna(0)
    A_fill_zeros = df.to_numpy().copy()

    if args.fillnan=='mean_col':
        df = train.pivot_table(index='customer_id', \
                               columns='movie_id', values='rating', aggfunc=np.mean)
        A = df.to_numpy().copy()
        # column mean
        col_mean = np.nanmean(A, axis = 0)
        col_mean = np.ceil(col_mean)
        print(col_mean.shape)
        col_mean[col_mean>5]=5
        # find indices where nan value is present
        inds = np.where(np.isnan(A))
        # replace inds with avg of column
        A[inds] = np.take(col_mean, inds[1])
    elif args.fillnan=='mean_row':
        df = train.pivot_table(index='customer_id', \
                               columns='movie_id', values='rating', aggfunc=np.mean)
        A = df.to_numpy().copy()
        # row mean
        row_mean = np.nanmean(A, axis = 1)
        row_mean = np.ceil(row_mean)
        # find indices where nan value is present
        inds = np.where(np.isnan(A))
        # replace inds with avg of column
        A[inds] = np.take(row_mean, inds[1])

    print('Reading time elapsed: {} sec'.format(time.time()-time_start))
    print('Reading is done, the shape of the data is:', A.shape)
    

    return df, A, A_fill_zeros





"""
Generating similarity matrix
"""

import numpy as np
import pickle
from sklearn.metrics.pairwise import sigmoid_kernel, cosine_similarity


def gen_similarity(args, X):

    if args.sim_method=='sigmoid_kernel':
        sim_UXU=sigmoid_kernel(X=X, Y=None, gamma=None, coef0=1)
        sim_MXM=sigmoid_kernel(X=X.T, Y=None, gamma=None, coef0=1)
    elif args.sim_method=='cosine_similarity':
        sim_UXU=cosine_similarity(X=X, Y=None)
        sim_MXM=cosine_similarity(X=X.T, Y=None)
    ## =====================================================================
#     # Save similarity matrix
#     fn_str = args.RESULTPATH + 'sim_%s_UXU.npy' %(args.sim_method)
#     with open(fn_str, 'wb') as f:
#         pickle.dump(sim_UXU, f)

#     fn_str = args.RESULTPATH + 'sim_%s_MXM.npy' %(args.sim_method)
#     with open(fn_str, 'wb') as f:
#         pickle.dump(sim_MXM, f)
#     print('saving similarity matrix is done!')
    ## =====================================================================
    return sim_UXU, sim_MXM




"""
Calculating the Laplacian matrix
"""

import numpy as np
import time

def calc_laplacian(args, Ws):
    t1 = time.time()
    # degree matrix
    D = np.diag(np.sum(np.array(Ws), axis=1))
    print('degree matrix:')
    print(D.shape)
    # laplacian matrix
    L = D - Ws
    print('laplacian matrix:')
    print(L.shape)
    elapsed_time = time.time() - t1
    print('Elapsed time is {} seconds: '.format(elapsed_time))

    return L, D




"""
Calculate eigen vectors and values of the input
"""

import numpy as np
import time

# from numpy import linalg as LA
# from scipy.sparse import linalg
# from scipy.linalg import eig as LAeig
# from scipy import linalg
from scipy.sparse.linalg import eigsh


def calc_eig(args, L, Ws, kk):
    t1 = time.time()
    D=np.diag(np.sum(Ws, axis=0))
    vol=np.sum(np.diag(D))

    vals, vecs = eigsh(L, k=kk, which="SM")  # Largest 5 eigenvalues/vectors
    vecs = vecs.real

#     vals, vecs = np.linalg.eig(L)
#     vecs = vecs.real
#     vals = vals[np.argsort(vals)]
#     vals = vals[1:]
#     vecs = vecs[:,np.argsort(vals)]

    print('the first 10 eigen values are:')
    print(vals[:10])
    print('\n')

    if (vals[0]==0):
        if vals[1] > 0:
            print('OOOPS the first eigen value was zero')
            vals = vals[1:]
            vecs = vecs[:,1:]
    if (vals[0]<1e-10):
        print('OOOPS the first eigen value was so small')
        vals = vals[1:]
        vecs = vecs[:,1:]

    #caluclate eigen gap
    e1 = np.zeros([vals.shape[0]+1])
    e2 = np.zeros([vals.shape[0]+1])
    print(e1.shape)
    e1[1:] = vals.copy()
    e2[:-1] = vals.copy()
    print('eigen gap is:')
    eigengap=(e2-e1)
    print(eigengap)
    print('the first 10 eigen values are:')
    print(vals[:10])
    print('\n')
    #


    # eigenvalues
    print('eigenvalues shape is:')
    print(vals.shape)
    # eigenvectors
    print('eigenvectors shape is :')
    print(vecs.shape)
    if args.normalize_laplacian:
        print('do the normalization')
        Y = np.sort(vals)
        I = np.argsort(vals)
        v_norm = vecs[:,I[:args.norm_laplacian_k]] \
            / LA.norm(vecs[:,I[:args.norm_laplacian_k]])*vol**(1/2)
    else:
        v_norm = []
    elapsed_time = time.time() - t1
    print('Elapsed time is {} seconds: '.format(elapsed_time))
    print('calc eigen vectors and values done!')
    return vals, vecs, v_norm, eigengap



In [10]:
df, A, A_fill_zeros = read_preprocss_data(args)
print('done reading the data')

data = A.copy()
data_fill_zeros = A_fill_zeros.copy()
print('data shape is:', data.shape)
print('data fill zero shape is:', data_fill_zeros.shape)
#===========================================================================
#=======================================================================
test = pd.read_csv(args.DATAPATH + 'test.csv')
test.columns  = ['movie_id', 'customer_id', 'rating', 'date']
test_np = test.to_numpy().copy()

train = pd.read_csv(args.DATAPATH + 'train.csv')
train.columns  = ['movie_id', 'customer_id', 'rating', 'date']
train_np = train.to_numpy().copy()

train_data = data.copy()

(16142,)
Reading time elapsed: 17.44917106628418 sec
Reading is done, the shape of the data is: (5905, 16142)
done reading the data
data shape is: (5905, 16142)
data fill zero shape is: (5905, 16142)


In [12]:
movie_to_index = {movie:indx for indx, movie in enumerate(df.columns.values)}
index_to_movie = {indx:movie for indx, movie in enumerate(df.columns.values)}


In [13]:
# #===========================================================================
# # use a subset of data just for testing everything first
# nu=10 # number of users
# ni=20 # number of items
# A_temp = A.copy()
# data = A_temp[:nu,:ni] # small 10 X 20 submatrix
# print(data.shape)

# A_temp = A_fill_zeros.copy()
# data_fill_zeros = A_temp[:nu,:ni] # small 10 X 20 submatrix

# train_np = train_np[:nu,:ni]
# test_np = test_np[:nu,:ni]

# train_data = data.copy()

# test_np.shape

In [14]:
#===========================================================================
# STEP 4 - Using the k smallest eigenvector as input,
# train a k-means model and use it to classify the data
#===========================================================================
if args.graph_nodes=='M':
    n_k = [10, 50, 100]
elif args.graph_nodes=='U':
    n_k = [10, 50, 100]
#=======================================================================
final_k = 5
#=======================================================================
# STEP 1 - Calculate similarity
sim_UXU, sim_MXM = gen_similarity(args, train_data)
print('gen similarity is done')

# STEP 2 - computing the laplacian
if args.graph_nodes=='M':
    Ws = sim_MXM.copy()
elif args.graph_nodes=='U':
    Ws = sim_UXU.copy()
L, D = calc_laplacian(args, Ws)
print('calc laplacian is done')

gen similarity is done
degree matrix:
(16142, 16142)
laplacian matrix:
(16142, 16142)
Elapsed time is 12.831299066543579 seconds: 
calc laplacian is done


In [15]:
# STEP 3 - Compute the eigenvectors of the matrix L
vals, vecs, v_norm, eigengap = calc_eig(args, L, Ws, final_k)

# STEP 5 - using k centers to predict data
U = np.array(vecs)
print('U array eigenvectors shape:', U.shape)

t1=time.time()
km = MiniBatchKMeans(n_clusters=final_k,
                     random_state=0,
                     batch_size=100,
                     max_iter=100)
print('MiniBatchKMeans time elapsed: {} sec'.format(time.time()-t1))
km.fit(U)
print('MiniBatchKMeans Fit time elapsed: {} sec'.format(time.time()-t1))

the first 10 eigen values are:
[-1.03364969e-11  1.58044353e+04  1.58622461e+04  1.58665676e+04
  1.58736045e+04]


OOOPS the first eigen value was so small
(5,)
eigen gap is:
[ 1.58044353e+04  5.78107852e+01  4.32158062e+00  7.03690276e+00
 -1.58736045e+04]
the first 10 eigen values are:
[15804.43527071 15862.24605587 15866.56763649 15873.60453925]


eigenvalues shape is:
(4,)
eigenvectors shape is :
(16142, 4)
do the normalization
Elapsed time is 12.0967698097229 seconds: 
calc eigen vectors and values done!
U array eigenvectors shape: (16142, 4)
MiniBatchKMeans time elapsed: 0.01572394371032715 sec
MiniBatchKMeans Fit time elapsed: 0.23460006713867188 sec


In [18]:
print(test_np.shape)

if args.graph_nodes=='M': # menas the sim is MXM
    labels = np.zeros([final_k])
    pred_ratings = np.zeros(train_data.shape[1])
    t0=time.time()
    for il, lbl in enumerate(range(final_k)):
        dfz=data_fill_zeros[:,km.labels_==lbl].copy()

        # find user that rated at least one of the movies
        goodU= np.mean(dfz, axis=1)
        if goodU.shape[0] > 0:
            # index for users that rate at least one of
            # the movies in that clustr
            indxgu=np.where(goodU > 0)
            trdata = train_data[:, km.labels_==lbl]
            trdata = trdata[indxgu[0], :]
        else:
            trdata = train_data[:, km.labels_==lbl]

        trdata = np.mean(trdata,axis=0)
        pr = np.ceil(np.mean(trdata,axis=0))
         
        if pr > 5:
                labels[il] = 5
        else:
                labels[il] = pr
    
    existed=0
    time_start=time.time()
    labels2=labels.copy()
    for ic in range(len(test_np)):    
        mvid   = test_np[ic, 0]
        custid = test_np[ic, 1]
        if mvid not in movie_to_index.keys():
            test_np[ic,2] = -1
            continue
        existed_rate = train[(train["movie_id"]==mvid)&(train["customer_id"]==custid)]
        if (existed_rate.empty):
            indx = movie_to_index[mvid]
            ctst = km.labels_[indx]
            test_np[ic,2] = labels[ctst]
        else:
            existed+=1
            test_np[ic,2] = existed_rate
        if ic%100000==0:
            print('ic:', ic)
            print(test_np)
            # Save movie titles
            fn_str = args.RESULTPATH + 'test_np_spectralClustring2'
            with open(fn_str, 'wb') as f:
                pickle.dump(test_np, f)
            print('Creating movie titles time elapsed: {} sec'.format(time.time()-time_start))
    print('existed:', existed)
   

(249999, 4)
ic: 0
[[11279 5858 5.0 '2005-05-07']
 [12140 5202 '?' '2004-01-24']
 [12191 18818 '?' '2005-03-27']
 ...
 [16707 32275 '?' '2004-07-29']
 [17339 10728 '?' '2005-07-15']
 [17580 5600 '?' '2005-06-18']]
Creating movie titles time elapsed: 0.48276686668395996 sec
ic: 100000
[[11279 5858 5.0 '2005-05-07']
 [12140 5202 4.0 '2004-01-24']
 [12191 18818 4.0 '2005-03-27']
 ...
 [16707 32275 '?' '2004-07-29']
 [17339 10728 '?' '2005-07-15']
 [17580 5600 '?' '2005-06-18']]
Creating movie titles time elapsed: 537.4410078525543 sec
ic: 200000
[[11279 5858 5.0 '2005-05-07']
 [12140 5202 4.0 '2004-01-24']
 [12191 18818 4.0 '2005-03-27']
 ...
 [16707 32275 '?' '2004-07-29']
 [17339 10728 '?' '2005-07-15']
 [17580 5600 '?' '2005-06-18']]
Creating movie titles time elapsed: 1018.030720949173 sec
existed: 119


In [22]:
test_df = pd.DataFrame(data=test_np, columns=['movie_id', 'customer_id', 'rating', 'date'])
fn_str = args.RESULTPATH + 'MaryZolfaghar_preds_clustering_k5.csv'
with open(fn_str, 'wb') as f:
    pickle.dump(test_df, f)
print('Creating movie titles time elapsed: {} sec'.format(time.time()-time_start))

Creating movie titles time elapsed: 2278.0017879009247 sec


In [None]:
# # test_df = pd.DataFrame(data=test_np, columns=['movie_id', 'customer_id', 'rating', 'date'])
# fn_str = args.RESULTPATH + 'test_np_spectralClustring_df_4k.csv'
# with open(fn_str, 'rb') as f:
#     test_ans = pickle.load( f)
# test_ans

#### visulization 

In [None]:
km.labels_.shape

In [None]:
train_data2 = data.copy()

if args.graph_nodes=='M': # menas the sim is MXM
    labels = np.zeros([final_k])
    pred_ratings = np.zeros(train_data.shape[1])
    t0=time.time()
    for il, lbl in enumerate(range(final_k)):
        dfz=data_fill_zeros[:,km.labels_==lbl].copy()

        # find user that rated at least one of the movies
        goodU= np.mean(dfz, axis=1)
        if goodU.shape[0] > 0:
            # index for users that rate at least one of
            # the movies in that clustr
            indxgu=np.where(goodU > 0)
            trdata = train_data[:, km.labels_==lbl]
            trdata = trdata[indxgu[0], :]
        else:
            trdata = train_data[:, km.labels_==lbl]

        trdata = np.mean(trdata,axis=0)
#         labels[il] = np.ceil(np.mean(trdata,axis=0))
#         
        pr = np.ceil(np.mean(trdata,axis=0))
         
        if pr > 5:
                labels[il] = 5
            else:
                labels[il] = pr
    
    existed=0
    time_start=time.time()
    labels2=labels.copy()
    
    time_start=time.time()
    for ic in range(train_data.shape[1]):
        ctst = km.labels_[ic]
        labels2=labels.copy()
        pred_ratings[ic] = labels2[ctst]
        train_data2[:,ic] = labels2[ctst]

        if ic%5000==0:
            print('ic:', ic)
            print(train_data2)
            print('\n')
            print(pred_ratings)
            print('\n')
            fn_str = args.RESULTPATH + 'train_data2_filled_spectralClustring'
            with open(fn_str, 'wb') as f:
                pickle.dump(train_data2, f)
            print('Creating movie titles time elapsed: {} sec'.format(time.time()-time_start))
        

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns

import time


def plot_clusters(data_plot, labels, labels_txt, final_k):
    time_start = time.time()
    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    tsne_results = tsne.fit_transform(data_plot)

    xx = tsne_results[:,0]
    yy = tsne_results[:,1]

    plt.figure(figsize=(14,10))
    sns.scatterplot(
        x=xx, y=yy,
        hue=labels,
        palette=sns.color_palette("hls", final_k),
        legend="full",
        alpha=0.3)
    fig, ax = plt.subplots()
    ax.scatter(z, y)

    for i, txt in enumerate(n):
        ax.annotate(txt, (z[i], y[i]))

In [None]:
t1=[]; t2=[]; t3=[]
with open(args.DATAPATH + 'movie_titles.txt', 'r',encoding="latin-1") as reading:
    for line in reading.readlines():
        tokens = line.split(",")
        t1.append(tokens[0])
        t2.append(tokens[1])
        t33 = tokens[2].split('\n')
        t3.append(t33[0])

t1=np.asarray(t1)
t1=t1[1:]
t2=np.asarray(t2)
t2=t2[1:]
t3=np.asarray(t3)
t3=t3[1:]

titles = pd.read_fwf(args.DATAPATH + 'movie_titles.txt', delimiter= ',', \
                           names = ["movie_id", "year_produced", "title"], encoding="ISO-8859-1")


movie_titles = pd.DataFrame(titles[1:], columns=["movie_id", "year_produced", "title"])

movie_titles['movie_id'] = t1
movie_titles['year_produced'] = t2
movie_titles['title'] = t3

movie_titles

movieid_to_title = {movie:title for in enumerate(movie_titles)}
movieid_to_title = {movie:year for in enumerate(movie_titles)}


In [None]:
data_plot = U.copy()
km_labels = km.labels_
labels_txt=''
print('data_plot shape:', labels.shape)
 
plot_clusters(data_plot, labels, labels_txt, final_k)