# CF notebook

Note: Despite the web service is implemented on Python 3. This notebook has only been tested on Python 2.7.

In [1]:
import time
import math

import numpy as np
from matplotlib import pyplot as plt

from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances

In [3]:
# convert some old courses to corresponded new courses
course_convert_id = {} #{"TH618":"CTT305","TH619":"CTT401","TH616":"CTT336","TH162":"CTT308","TH160":"CTT992","TH006":"CTT009","TH313":"CTT310","TH019":"CTT101","TH501":"CTT604","TH153":"CTT337","TH018":"CTT008","TH016":"CTT105","TH302":"CTT302","TH303":"CTT310","TH301":"CTT537","TH306":"CTT402","TH307":"CTT405","TH304":"CTT307","TH305":"CTT406","TH309":"CTT323","TH027":"CTT006","TH026":"CTT101","TH029":"CTT337","TH028":"CTT104","TH140":"CTT628","TH138":"CTT408","TH404":"CTT503","TH407":"CTT403","TH202":"CTT203","TH207":"CTT505","TH213":"CTT204","TH115":"CTT327","TH114":"CTT005","TH112":"CTT303","TH110":"CTT502","TH119":"CTT103","TH131":"CTT331","TH134":"CTT501","TH107":"CTT102","TH109":"CTT304","TH609":"CTT301","TH608":"CTT404","TH606":"CTT524","TH605":"CTT634","TH604":"CTT621","TH602":"CTT522","TH144":"CTT309"}

In [4]:
dataValues = np.load("../data_raw_05_15.npy" , encoding = "latin1")
print(dataValues)

[['0512001'
          0                                    1   2    3      4     5
0   TH301                     Đặc tả hình thức NaN    1      V   0.0
1   TH900                  Luận văn tốt nghiệp NaN   TN    9.9   9.9
2   TH160                     Thực tập thực tế NaN   T1     10  10.0
3   TH604                         An toàn mạng NaN    T   10.0  10.0
4   TH407          Mã hóa thông tin & ứng dụng NaN    1   10.0  10.0
5   TH304                            Nhận dạng NaN    1    8.0   8.0
6   TH401          Xây dựng PM hướng đối tượng NaN    1   10.0  10.0
7   TH138                    Xử lý tín hiệu số NaN    1    6.0   6.0
8   AN115                            Anh văn 5 NaN   25   8.30   8.5
9   TH608             Nhập môn mã hóa & mật mã NaN   T1   10.0  10.0
10  TH405  Phân tích, thiết kế hướng đối tượng NaN   T1    9.0   9.0
11  TH117               Quản lý đồ án phần mềm NaN   T1   10.0  10.0
12  TR040                 Tư tưởng Hồ Chí Minh NaN    2    8.0   8.0
13  TH303           

## Pre-processing

In [10]:
# This block inits data of these 3 dicts
course_namedict = {} # Convert from course id to course name
course_ids = {} # convert from course id to index in range [0, n_items)
user_ids = {} # convert from student id to index in range [0, n_users)

uid = 0
for student in dataValues:
    studentMssv = student[0]
    student_table_score = student[1]
    for scoreidx in range(0, len(student_table_score)):
        course_id = student_table_score[0][scoreidx]
        if course_id in course_convert_id:
            course_id = course_convert_id[course_id]
        course_namedict[course_id] = student_table_score[1][scoreidx]
    if np.sum(student_table_score[5]) != 0:
        user_ids[studentMssv] = uid
        uid += 1

cid = 0
for course_id in course_namedict.keys():
    course_ids[course_id] = cid
    cid += 1
    
print(user_ids, course_ids)



{'0512001': 0, '0512002': 1, '0512003': 2, '0512004': 3, '0512005': 4, '0512006': 5, '0512007': 6, '0512008': 7, '0512009': 8, '0512010': 9, '0512011': 10, '0512012': 11, '0512013': 12, '0512014': 13, '0512015': 14, '0512016': 15, '0512017': 16, '0512018': 17, '0512019': 18, '0512020': 19, '0512021': 20, '0512022': 21, '0512023': 22, '0512024': 23, '0512025': 24, '0512026': 25, '0512027': 26, '0512029': 27, '0512030': 28, '0512031': 29, '0512032': 30, '0512033': 31, '0512034': 32, '0512035': 33, '0512036': 34, '0512037': 35, '0512038': 36, '0512039': 37, '0512040': 38, '0512041': 39, '0512042': 40, '0512043': 41, '0512044': 42, '0512045': 43, '0512046': 44, '0512047': 45, '0512048': 46, '0512050': 47, '0512051': 48, '0512052': 49, '0512053': 50, '0512054': 51, '0512055': 52, '0512056': 53, '0512057': 54, '0512058': 55, '0512059': 56, '0512060': 57, '0512061': 58, '0512062': 59, '0512063': 60, '0512064': 61, '0512065': 62, '0512066': 63, '0512067': 64, '0512068': 65, '0512069': 66, '051

In [7]:

nusers = len(user_ids)
nitems = len(course_ids)
print ("Number of users:", nusers)
print ("Number of items:", nitems)

Number of users: 5551
Number of items: 278


In [11]:
# This block creates a score matrix which is the input of CF models
matrix_data = np.zeros((nusers, nitems))
i=0
for user in dataValues:
    [mssv, scoretable] = user
    if mssv not in user_ids:
        continue    
    for user_courseidx in range(0, len(scoretable)):
        course_id = scoretable[0][user_courseidx]
        course_id = course_convert_id[course_id] if course_id in course_convert_id else course_id
        user_coursescore = scoretable[5][user_courseidx]
        if user_coursescore != 0 and not math.isnan(user_coursescore):
            matrix_data[user_ids[mssv], course_ids[course_id]] = user_coursescore
            i+=1
print(i)
print (matrix_data)

201186
[[ 0.   9.9 10.  ...  0.   0.   0. ]
 [ 0.   0.   0.  ...  0.   0.   0. ]
 [ 0.   0.   0.  ...  0.   0.   0. ]
 ...
 [ 0.   0.   0.  ...  7.5  0.   0. ]
 [ 0.   0.   0.  ...  0.   0.   0. ]
 [ 0.   0.   0.  ...  4.   0.   0. ]]


## Data Details

## Memory-based

In [14]:
def similarity_compute(ratings):
    # Cosine distance is defined as 1.0 minus the cosine similarity.
    return 1 - pairwise_distances(ratings, metric='cosine')

def compute_predict(ratings, similarity):
    mean_userscore = np.true_divide(ratings.sum(1),(ratings!=0).sum(1))
    ratingsdiff = np.subtract(ratings, mean_userscore[:, np.newaxis], where=ratings!=0)
    aa  = similarity.dot(ratingsdiff) / np.array([np.abs(similarity).sum(axis=1)]).T
    return mean_userscore[:, np.newaxis] + aa

In [15]:
user_similarity = similarity_compute(matrix_data)

In [17]:
prediction = compute_predict(matrix_data, user_similarity)
print(prediction)

[[8.62702461 8.9364805  8.61799187 ... 8.45714286 8.45714286 8.45714286]
 [7.31794732 7.52053893 7.30328016 ... 7.25       7.25       7.25      ]
 [4.87531272 5.04905367 4.83513717 ... 4.75111752 4.76315789 4.76286936]
 ...
 [7.59106994 7.59142325 7.59122135 ... 7.32266848 7.59090909 7.58932142]
 [6.50027139 6.5004805  6.5005979  ... 6.25742135 6.5        6.49869358]
 [3.81267973 3.81278999 3.81259433 ... 3.52990346 3.8125     3.81068141]]


#### Evaluation

In [18]:
def test_train_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        nonzeros_user = ratings[user, :].nonzero()[0]
        if len(nonzeros_user) > 15:
            testsize = 10
        else:
            testsize = 0
        testratings = np.random.choice(nonzeros_user, 
                                        size=testsize, 
                                        replace=False)
        train[user, testratings] = 0.
        test[user, testratings] = ratings[user, testratings]
    return train, test

def rmse_compute(pred, actual):    
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return np.sqrt(mean_squared_error(pred, actual))

In [19]:
train_matrix, test_matrix = test_train_split(matrix_data)

In [20]:
train_prediction = compute_predict(train_matrix, user_similarity)
print(train_prediction)

[[8.57599474 8.84431495 8.58742924 ... 8.46153846 8.46153846 8.46153846]
 [7.29855105 7.46540612 7.28962635 ... 7.25       7.25       7.25      ]
 [4.70241206 4.85567986 4.68096929 ... 4.61662614 4.625      4.625     ]
 ...
 [7.59107943 7.5913028  7.59114641 ... 7.34635364 7.59090909 7.59090909]
 [6.50027772 6.50046956 6.50027772 ... 6.27766504 6.5        6.5       ]
 [3.81270478 3.81268877 3.81259924 ... 3.55397722 3.8125     3.8125    ]]


In [21]:
print ('Rooted mean squared error: ' + str(rmse_compute(train_prediction, test_matrix)))

Rooted mean squared error: 1.6721448563770709


## Model-based

In [22]:
mean_userscore = np.true_divide(train_matrix.sum(1),(train_matrix!=0).sum(1))
u, s, vt = svds(np.subtract(train_matrix, mean_userscore[:, np.newaxis], where=train_matrix!=0), k = 9)
s_matrix=np.diag(s)
train_prediction = np.dot(np.dot(u, s_matrix), vt) + mean_userscore[:, np.newaxis]
print ('Root Mean Squared Error: ' + str(rmse_compute(train_prediction, test_matrix)))
train_prediction_0 = np.dot(np.dot(u[0, :], s_matrix), vt)
print(train_prediction_0 )

Root Mean Squared Error: 1.6533213598214342
[ 5.79907262e-02  1.40629135e-01  3.44401752e-02  1.33471258e-03
  4.56213305e-02  9.89082238e-03  5.47278737e-02  1.24359633e-03
 -1.65550348e-01  1.14601632e-02  4.98164577e-03  1.75804255e-01
 -2.65017466e-02  6.76491616e-03 -1.42821635e-01  3.64906557e-03
  1.72707017e-01  2.13033449e-01 -9.05480924e-02  2.70726571e-02
  3.36872241e-02  1.03479501e-01 -1.04411314e-01 -6.63628600e-02
 -1.53312374e-01 -2.06370488e-01  9.59595596e-02 -2.75644104e-03
  5.54256562e-02 -9.23484941e-02 -2.94467699e-02 -1.70144931e-01
 -1.70221641e-02 -2.81984014e-01  8.51398774e-02 -4.43949669e-01
 -9.50909866e-03 -5.43411389e-03 -1.78806449e-01 -1.84702813e-01
 -5.17918136e-02 -2.17229323e-01  2.28462931e-02 -2.24064141e-01
 -1.12896395e-01 -7.51211692e-02 -7.60412526e-02 -1.26114871e-01
  3.69253713e-02 -7.87607359e-03 -4.67651470e-02  2.78876716e-02
 -3.62857769e-02 -1.40612800e-02 -5.38263896e-03 -8.66649680e-02
  2.66000971e-02 -1.55850633e-03  5.39976297e-

## Baseline model

In [23]:
# This is a trivial model which always predicts score equal to student's GPA
train_prediction = np.array(train_matrix)
for i in range(0, len(mean_userscore)):
    train_prediction[i][train_prediction[i] == 0] = mean_userscore[i]

In [24]:
print ('Root Mean Squared Error: ' + str(rmse_compute(train_prediction, test_matrix)))

Root Mean Squared Error: 1.759978103356725
