In [1]:
# Import library
import pandas as pd
import numpy as np
import time
import os
from math import sqrt
from numpy import random
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Clear terminal & ignore "RuntimeWarning: invalid value encountered in true_divide"
os.system('cls||clear')
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [3]:
# Reading data and define columns
read_start_time = time.time()
df = pd.read_csv(
    r'C:\laragon\www\web-seminkuy\app\Http\Controllers\Python\rating-test-case.csv', sep=',')
read_end_time = time.time()

In [4]:
df

Unnamed: 0,users_id,events_id,rating
0,2,1,4
1,2,3,5
2,2,6,4
3,2,8,2
4,3,1,5
5,3,2,5
6,3,3,4
7,3,10,5
8,4,1,3
9,4,2,4


In [5]:
# Total users and travel packages
n_users = df.users_id.max()
n_events = df.events_id.max()

In [6]:
n_events

10

In [7]:
# Converting data to matrix form
mat_start = time.time()
rating = np.zeros((n_users, n_events))
for row in df.itertuples():
    rating[row[1]-1, row[2]-1] = row[3]
mat_end = time.time()

In [8]:
# Train_test_splitting
trainset = np.copy(rating)
testset = np.zeros((n_users, n_events))

In [9]:
# Z for iterate over each row
train_test = time.time()
z          = 0

for row in rating:
    nz_in  = np.nonzero(row)
    per_20 = int(len(nz_in[0]) * 0.2)
    rand   = random.choice(nz_in[0], per_20, replace=False)
    for i in range(per_20):
        testset[z, rand[i]]  = rating[z, rand[i]]
        trainset[z, rand[i]] = 0
    z = z + 1
train_test_end = time.time()

In [10]:
# Adjusted cosine similarity calculation
def adjusted_cosine_similarity(train_data):
    start        = time.time()
    u_m          = train_data.sum(axis=1) / (train_data != 0).sum(axis=1)
    rating_m_sub = np.where((train_data!=0), train_data-u_m[:, None], train_data)
    similarity   = np.zeros((n_events, n_events))

    for i in range(n_events):
        print(i)
        for j in range(i, n_events):
            num     = 0
            dem1    = 0
            dem2    = 0
            set_c_u = np.where((train_data[:, i] != 0) * (train_data[:, j]))[0]
            for k in set_c_u:
                num              = num+rating_m_sub[k][i] * rating_m_sub[k][j]
                dem1             = dem1 + rating_m_sub[k][i]**2
                dem2             = dem2 + rating_m_sub[k][j]**2
                similarity[i, j] = num/sqrt(dem1*dem2 + 10**-12)

    end = time.time() - start

    print("similarity time = ", end)
    return similarity

In [11]:
# Copying below diagonal of similarity matrix
similarity  = adjusted_cosine_similarity(trainset)
upp_tr      = np.triu(similarity, k=1)
upp_tr      = upp_tr.T
similarity  = similarity + upp_tr
similarity  = np.where((similarity < 0), 0, similarity)

0
1
2
3
4
5
6
7
8
9
similarity time =  0.01598978042602539


In [13]:
# Prediction
mul = trainset.dot(similarity)
div = np.zeros((n_users, n_events))
stt = time.time()

for i in range(n_users):
    nzi = np.nonzero(trainset[i])
    for j in range(n_events):
        sm        = (similarity[j, nzi]).sum()
        div[i, j] = sm
        
    endd = time.time() - stt
    print(endd)


np.nan_to_num(div, copy=False)
prediction = mul/div
np.nan_to_num(prediction, copy=False)

0.0009970664978027344
0.001995086669921875
0.0029947757720947266
0.003993988037109375
0.00499272346496582
0.005995035171508789


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [4.16331582, 4.        , 4.49421657, 0.        , 0.        ,
        4.3859183 , 0.        , 2.        , 0.        , 4.        ],
       [4.90240295, 5.        , 4.1951941 , 0.        , 5.        ,
        4.23698903, 0.        , 0.        , 0.        , 5.        ],
       [3.        , 3.8048059 , 2.23698903, 0.        , 0.        ,
        2.1951941 , 0.        , 0.        , 0.        , 3.1951941 ],
       [4.5       , 0.        , 4.56152368, 0.        , 4.        ,
        4.43847632, 3.        , 0.        , 0.        , 4.        ],
       [2.1951941 , 2.8048059 , 0.        , 0.        , 2.        ,
        0.        , 0.        , 0.        , 5.        , 2.10815241]])

In [15]:
# save similarity matrix to similarity.csv
np.savetxt(r'C:\laragon\www\web-seminkuy\app\Http\Controllers\Python\similarity-test-case.csv',similarity, fmt='%.4f', delimiter='|')


# save prediction matrix to prediction.csv
np.savetxt(r'C:\laragon\www\web-seminkuy\app\Http\Controllers\Python\prediction-test-case.csv',prediction, fmt='%.4f', delimiter='|')