In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mplcursors # Use this is for creating a cursor-interactive plot with "%matplotlib notebook"
from sklearn.decomposition import NMF # Use this for training Non-negative Matrix Factorization
from sklearn.utils.extmath import randomized_svd # Use this for training Singular Value Decomposition
from sklearn.manifold import TSNE # Use this for training t-sne manifolding

plt.style.use('ggplot') # You can also use different style

# just for plot checking, use this option
# %matplotlib inline

# for interactive plot
# If you use this option, plot will appear at first-drawn position
%matplotlib notebook

warnings.filterwarnings('ignore')

In [2]:
dir = './remap/'
df_ratings = pd.read_csv(dir + 'userId_problemId_remap.csv', usecols=['userId', 'problemId'])
df_ratings = df_ratings.assign(solve=1)
df_ratings.drop_duplicates(inplace = True)

FileNotFoundError: [Errno 2] No such file or directory: './remap/userId_problemId_remap.csv'

In [None]:
# 고유 사용자, 고유 문제 갯수 확인
n_users = len(df_ratings['userId'].unique())
n_problems = len(df_ratings['problemId'].unique())

n_users, n_problems

In [None]:
# ratings의 기술통계량 확인
df_ratings['solve'].describe()

In [None]:
A = df_ratings.pivot(index = 'problemId', columns = 'userId', values = 'solve').fillna(0).to_numpy()

In [None]:
pd.DataFrame(A)

## SVD Model

In [None]:
k = 100
U, Sigma, VT = randomized_svd(A, n_components=k, n_iter='auto')

# 분해된 행렬이 올바른 형태로 생성되었는지 확인
print(U.shape, Sigma.shape, VT.shape)

A_approx_svd = np.dot(np.dot(U, np.diag(Sigma)), VT)

# 근사 행렬이 올바른 형태로 생성되었는지 확인
print(A_approx_svd.shape)

In [None]:
pd.DataFrame(A_approx_svd)

## NMF Model

In [None]:
k = 100
model_nmf = NMF(n_components = k, init='random', random_state=30, max_iter=100, l1_ratio=0.2).fit(A)
W = model_nmf.transform(A)
H = model_nmf.components_

# 분해된 행렬이 올바른 형태로 생성되었는지 확인
print(W.shape, H.shape)

A_approx_nmf = np.dot(W, H)

# 근사 행렬이 올바른 형태로 생성되었는지 확인
print(A_approx_nmf.shape)

In [None]:
pd.DataFrame(A_approx_nmf)

## Compute loss

In [None]:
def compute_error(actual, prediction):
    # 매개변수로 입력받은 actual 행렬 안의 0값을 갖는 원소들은 오차 계산에서 제외합니다.
    prediction = prediction[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    
    sse = np.sum(np.square(np.subtract(actual, prediction)))
    rmse = np.sqrt(np.square(np.subtract(actual, prediction)).mean())
    
    return sse, rmse

def compute_error_all(actual, prediction):
    # actual 행렬 안의 0값을 갖는 원소들도 포함해서 오차를 계산합니다.
    sse = np.sum(np.square(np.subtract(actual, prediction)))
    rmse = np.sqrt(np.square(np.subtract(actual, prediction)).mean())
                   
    return sse, rmse

In [None]:
print(f"SVD Error(ignoring zero values): SSE = {compute_error(A, A_approx_svd)[0]}, RMSE = {compute_error(A, A_approx_svd)[1]}")
print(f"NMF Error(ignoring zero values): SSE = {compute_error(A, A_approx_nmf)[0]}, RMSE = {compute_error(A, A_approx_nmf)[1]}")

print('\n')

print(f"SVD Error(including all zero values): SSE = {compute_error_all(A, A_approx_svd)[0]}, RMSE = {compute_error_all(A, A_approx_svd)[1]}")
print(f"NMF Error(including all zero values): SSE = {compute_error_all(A, A_approx_nmf)[0]}, RMSE = {compute_error_all(A, A_approx_nmf)[1]}")

### grid search

In [16]:
svd_rlt = dict()
nmf_rlt = dict()
for k in [50, 100, 150, 200]:
    print("grid search:", k)
    U, Sigma, VT = randomized_svd(A, n_components=k, n_iter='auto')
    A_approx_svd = np.dot(np.dot(U, np.diag(Sigma)), VT)
    
    model_nmf = NMF(n_components = k, init='random', random_state=30, max_iter=100, l1_ratio=0.2).fit(A)
    W = model_nmf.transform(A)
    H = model_nmf.components_
    A_approx_nmf = np.dot(W, H)
    
    c_svd = compute_error(A, A_approx_svd)[1]
    c_nmf = compute_error(A, A_approx_nmf)[1]
    
    svd_rlt[k] = c_svd
    nmf_rlt[k] = c_nmf

grid search: 50
grid search: 100
grid search: 150
grid search: 200


In [17]:
svd_rlt

{50: 0.487408669093533,
 100: 0.4448330669232854,
 150: 0.4101553898643726,
 200: 0.38098933426351533}

In [18]:
nmf_rlt

{50: 0.5067141914358917,
 100: 0.48101584643565326,
 150: 0.4620809186384405,
 200: 0.44641222876533365}

In [19]:
U, Sigma, VT = randomized_svd(A, n_components=200, n_iter='auto')
A_approx_svd = np.dot(np.dot(U, np.diag(Sigma)), VT)