In [30]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mplcursors # Use this is for creating a cursor-interactive plot with "%matplotlib notebook"
from sklearn.decomposition import NMF # Use this for training Non-negative Matrix Factorization
from sklearn.utils.extmath import randomized_svd # Use this for training Singular Value Decomposition
from sklearn.manifold import TSNE # Use this for training t-sne manifolding

plt.style.use('ggplot') # You can also use different style

# just for plot checking, use this option
# %matplotlib inline

# for interactive plot
# If you use this option, plot will appear at first-drawn position
%matplotlib notebook

warnings.filterwarnings('ignore')

In [31]:
dir = './remap/'
df_ratings = pd.read_csv(dir + 'userId_problemId_remap.csv', usecols=['userId', 'problemId'])
df_ratings = df_ratings.assign(solve=1)
df_ratings.drop_duplicates(inplace = True)

In [3]:
# 고유 사용자, 고유 문제 갯수 확인
n_users = len(df_ratings['userId'].unique())
n_problems = len(df_ratings['problemId'].unique())

n_users, n_problems

(2000, 5723)

In [4]:
# ratings의 기술통계량 확인
df_ratings['solve'].describe()

count    833388.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: solve, dtype: float64

In [5]:
A = df_ratings.pivot(index = 'problemId', columns = 'userId', values = 'solve').fillna(0).to_numpy()

In [6]:
pd.DataFrame(A)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
2,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5720,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## SVD Model

In [7]:
k = 100
U, Sigma, VT = randomized_svd(A, n_components=k, n_iter='auto')

# 분해된 행렬이 올바른 형태로 생성되었는지 확인
print(U.shape, Sigma.shape, VT.shape)

A_approx_svd = np.dot(np.dot(U, np.diag(Sigma)), VT)

# 근사 행렬이 올바른 형태로 생성되었는지 확인
print(A_approx_svd.shape)

(5723, 100) (100,) (100, 2000)
(5723, 2000)


In [8]:
pd.DataFrame(A_approx_svd)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,1.282908,0.982457,1.012734,0.210409,0.799908,0.700536,1.020860,1.270019,1.183272,0.844995,...,1.020233,0.804079,0.999843,0.234471,0.409834,1.155394,1.001244,0.989859,1.040696,0.609728
1,1.162254,0.917811,0.874251,0.090846,0.934184,0.608458,1.034947,1.072972,0.995421,0.465680,...,0.966441,0.806583,1.027665,0.142309,0.270925,1.219290,1.049448,0.939293,1.055428,0.337898
2,0.782888,0.705934,0.639680,0.076158,0.964215,0.940499,1.047680,0.241833,0.166008,0.090939,...,0.818318,0.086393,0.530721,-0.020026,0.034672,0.086814,0.903771,0.028993,0.428280,-0.007616
3,0.901320,0.959503,0.943408,0.196422,0.936846,0.721684,1.002532,0.748526,1.039079,0.874505,...,0.815739,0.533975,0.502565,0.316679,0.743295,0.690068,1.029553,0.634880,0.953322,-0.068640
4,0.427167,0.765989,0.526817,0.042648,0.503224,1.071760,0.456225,0.130979,0.365263,-0.006430,...,0.709526,0.039322,0.483935,-0.066048,-0.131062,-0.077609,0.016371,0.088309,0.220896,0.000723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5718,0.051441,-0.001073,0.017357,0.005076,0.033870,0.000758,0.019908,-0.009021,0.017157,0.043919,...,0.025259,-0.016810,0.003443,0.024440,-0.000163,-0.045330,-0.005817,0.009730,-0.019410,0.007210
5719,0.023259,-0.003485,0.023122,-0.001141,0.010055,0.028461,0.012798,-0.017134,0.028003,-0.009291,...,-0.026556,0.024879,0.007238,0.009504,0.015687,-0.018428,-0.004872,0.030737,0.000136,-0.010049
5720,0.002039,-0.001532,-0.000214,0.002539,-0.000398,0.013730,0.003184,-0.004793,-0.009660,0.006475,...,-0.004732,0.004789,0.002894,0.000946,0.000853,-0.008926,0.007824,-0.001526,-0.002758,-0.006158
5721,-0.007944,0.000567,0.000433,0.003146,-0.012067,0.008116,0.008291,-0.011202,-0.009539,-0.003058,...,0.005308,-0.000183,0.007754,0.001780,-0.003955,0.000941,0.008443,-0.002872,-0.003231,0.002076


## NMF Model

In [9]:
k = 100
model_nmf = NMF(n_components = k, init='random', random_state=30, max_iter=100, l1_ratio=0.2).fit(A)
W = model_nmf.transform(A)
H = model_nmf.components_

# 분해된 행렬이 올바른 형태로 생성되었는지 확인
print(W.shape, H.shape)

A_approx_nmf = np.dot(W, H)

# 근사 행렬이 올바른 형태로 생성되었는지 확인
print(A_approx_nmf.shape)

(5723, 100) (100, 2000)
(5723, 2000)


In [10]:
pd.DataFrame(A_approx_nmf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,1.234905,1.009064,0.991519,0.227058,0.884733,0.708635,1.032982,1.264788,1.054260,0.620640,...,1.021434,0.863812,0.999083,0.273475,0.355025,1.083067,0.948448,0.949483,1.125569,0.558851
1,1.165478,1.029897,0.901042,0.117921,0.972153,0.716850,0.991197,1.044329,1.000846,0.331002,...,1.006412,0.859558,0.918994,0.139096,0.215827,1.192518,0.969082,0.815060,1.069942,0.269923
2,0.650699,0.906714,0.575303,0.060192,0.778371,0.912777,0.747917,0.261604,0.288642,0.094584,...,0.931594,0.167705,0.417756,0.031347,0.037876,0.202895,0.909500,0.062311,0.503359,0.008342
3,0.821452,0.936255,0.886502,0.165875,0.856148,0.781922,0.957815,0.823398,0.930054,0.803716,...,0.977258,0.619978,0.536405,0.400778,0.630744,0.897387,1.068478,0.673548,1.002710,0.120582
4,0.264714,0.620971,0.762412,0.025695,0.437070,1.357928,0.216951,0.068693,0.213221,0.045271,...,0.471023,0.149658,0.326184,0.012156,0.032437,0.059962,0.081512,0.000876,0.158456,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5718,0.009520,0.005785,0.026968,0.002345,0.002619,0.001614,0.022429,0.005318,0.022007,0.025987,...,0.007032,0.002132,0.003300,0.006352,0.003591,0.000608,0.012989,0.004667,0.001107,0.004408
5719,0.007226,0.000301,0.010274,0.000135,0.000161,0.000020,0.015418,0.002266,0.036186,0.018779,...,0.000033,0.003704,0.000665,0.002906,0.000284,0.000151,0.006510,0.005481,0.002249,0.000503
5720,0.001143,0.000496,0.005727,0.000005,0.001820,0.006173,0.002546,0.001614,0.001141,0.000286,...,0.000391,0.000195,0.001052,0.001169,0.000206,0.000130,0.001407,0.000000,0.002681,0.000376
5721,0.000644,0.002460,0.003805,0.000251,0.000370,0.001555,0.010912,0.000863,0.004367,0.008304,...,0.001666,0.001947,0.000272,0.000184,0.000695,0.000833,0.006671,0.000000,0.000832,0.000276


## Compute loss

In [11]:
def compute_error(actual, prediction):
    # 매개변수로 입력받은 actual 행렬 안의 0값을 갖는 원소들은 오차 계산에서 제외합니다.
    prediction = prediction[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    
    sse = np.sum(np.square(np.subtract(actual, prediction)))
    rmse = np.sqrt(np.square(np.subtract(actual, prediction)).mean())
    
    return sse, rmse

def compute_error_all(actual, prediction):
    # actual 행렬 안의 0값을 갖는 원소들도 포함해서 오차를 계산합니다.
    sse = np.sum(np.square(np.subtract(actual, prediction)))
    rmse = np.sqrt(np.square(np.subtract(actual, prediction)).mean())
                   
    return sse, rmse

In [12]:
print(f"SVD Error(ignoring zero values): SSE = {compute_error(A, A_approx_svd)[0]}, RMSE = {compute_error(A, A_approx_svd)[1]}")
print(f"NMF Error(ignoring zero values): SSE = {compute_error(A, A_approx_nmf)[0]}, RMSE = {compute_error(A, A_approx_nmf)[1]}")

print('\n')

print(f"SVD Error(including all zero values): SSE = {compute_error_all(A, A_approx_svd)[0]}, RMSE = {compute_error_all(A, A_approx_svd)[1]}")
print(f"NMF Error(including all zero values): SSE = {compute_error_all(A, A_approx_nmf)[0]}, RMSE = {compute_error_all(A, A_approx_nmf)[1]}")

SVD Error(ignoring zero values): SSE = 164907.8651033195, RMSE = 0.4448330669232854
NMF Error(ignoring zero values): SSE = 192826.18566987387, RMSE = 0.48101584643565326


SVD Error(including all zero values): SSE = 249651.0079229659, RMSE = 0.1476861574812087
NMF Error(including all zero values): SSE = 278869.4204541576, RMSE = 0.15608945932633245


## grid search

In [20]:
from easydict import EasyDict
from itertools import product

ranges = {
    "n_components" : [50, 100, 150, 200],
    "n_oversamples" : [5, 10, 15],
    "n_iter" : ['auto', 4, 5, 6, 7]
}

min_rmse = float('inf')
best_comb = []
parameter_combinations = [dict(zip(ranges, v)) for v in product(*ranges.values())]

for i, architecture_parameters in enumerate(parameter_combinations):
    print('============ Grid Search {:} / {:} ============'.format(i + 1, len(parameter_combinations)))
    U, Sigma, VT = randomized_svd(A, n_components=architecture_parameters['n_components'], 
                                  n_oversamples=architecture_parameters['n_oversamples'], 
                                  n_iter=architecture_parameters['n_iter'],
                                 random_state = 13)
    A_approx_svd = np.dot(np.dot(U, np.diag(Sigma)), VT)
    c_svd = compute_error(A, A_approx_svd)[1]
    if min_rmse > c_svd:
        min_rmse = c_svd
        best_comb = architecture_parameters
    log = f'RMSE: {c_svd} using_params=> {architecture_parameters}'
    print(log)

RMSE: 0.4873916268361904 using_params=> {'n_components': 50, 'n_oversamples': 5, 'n_iter': 'auto'}
RMSE: 0.4879827089236577 using_params=> {'n_components': 50, 'n_oversamples': 5, 'n_iter': 4}
RMSE: 0.4876571451230847 using_params=> {'n_components': 50, 'n_oversamples': 5, 'n_iter': 5}
RMSE: 0.48748590711242 using_params=> {'n_components': 50, 'n_oversamples': 5, 'n_iter': 6}
RMSE: 0.4873916268361904 using_params=> {'n_components': 50, 'n_oversamples': 5, 'n_iter': 7}
RMSE: 0.4873311862284604 using_params=> {'n_components': 50, 'n_oversamples': 10, 'n_iter': 'auto'}
RMSE: 0.48778463887636453 using_params=> {'n_components': 50, 'n_oversamples': 10, 'n_iter': 4}
RMSE: 0.4875248684852378 using_params=> {'n_components': 50, 'n_oversamples': 10, 'n_iter': 5}
RMSE: 0.48740051275511165 using_params=> {'n_components': 50, 'n_oversamples': 10, 'n_iter': 6}
RMSE: 0.4873311862284604 using_params=> {'n_components': 50, 'n_oversamples': 10, 'n_iter': 7}
RMSE: 0.48728224796543035 using_params=> {'n_

RMSE: 0.3797251088226212 using_params=> {'n_components': 200, 'n_oversamples': 15, 'n_iter': 6}
RMSE: 0.3794829157735911 using_params=> {'n_components': 200, 'n_oversamples': 15, 'n_iter': 7}


In [21]:
print(f'BEST SVD Model :::: RMSE: {min_rmse} using_params=> {best_comb}')

BEST SVD Model :::: RMSE: 0.3794829157735911 using_params=> {'n_components': 200, 'n_oversamples': 15, 'n_iter': 7}


In [27]:
ranges = {
    "n_components" : [100, 150, 200],
    "init" : ['random', 'nndsvd', 'nndsvda'],
    "solver": ['mu', "cd"],
    "max_iter" : [100, 150, 200],
    "l1_ratio" : [0.001, 0.01, 0.1]
}

min_rmse = float('inf')
best_comb = []
parameter_combinations = [dict(zip(ranges, v)) for v in product(*ranges.values())]

for i, architecture_parameters in enumerate(parameter_combinations):
    print('============ Grid Search {:} / {:} ============'.format(i + 1, len(parameter_combinations)))
    model_nmf = NMF(n_components = architecture_parameters['n_components'],
                    init=architecture_parameters['init'], 
                    random_state=13, 
                    max_iter=architecture_parameters['max_iter'], 
                    l1_ratio=architecture_parameters['l1_ratio']).fit(A)
    W = model_nmf.transform(A)
    H = model_nmf.components_
    A_approx_nmf = np.dot(W, H)
    c_nmf = compute_error(A, A_approx_nmf)[1]
    if min_rmse > c_nmf:
        min_rmse = c_nmf
        best_comb = architecture_parameters
    log = f'RMSE: {c_nmf} using_params=> {architecture_parameters}'
    print(log)

RMSE: 0.4813474022230581 using_params=> {'n_components': 100, 'init': 'random', 'solver': 'mu', 'max_iter': 100, 'l1_ratio': 0.001}
RMSE: 0.4813474022230581 using_params=> {'n_components': 100, 'init': 'random', 'solver': 'mu', 'max_iter': 100, 'l1_ratio': 0.01}
RMSE: 0.4813474022230581 using_params=> {'n_components': 100, 'init': 'random', 'solver': 'mu', 'max_iter': 100, 'l1_ratio': 0.1}
RMSE: 0.48135220168943443 using_params=> {'n_components': 100, 'init': 'random', 'solver': 'mu', 'max_iter': 150, 'l1_ratio': 0.001}
RMSE: 0.48135220168943443 using_params=> {'n_components': 100, 'init': 'random', 'solver': 'mu', 'max_iter': 150, 'l1_ratio': 0.01}
RMSE: 0.48135220168943443 using_params=> {'n_components': 100, 'init': 'random', 'solver': 'mu', 'max_iter': 150, 'l1_ratio': 0.1}
RMSE: 0.4813571120295308 using_params=> {'n_components': 100, 'init': 'random', 'solver': 'mu', 'max_iter': 200, 'l1_ratio': 0.001}
RMSE: 0.4813571120295308 using_params=> {'n_components': 100, 'init': 'random',

RMSE: 0.48155787660973476 using_params=> {'n_components': 100, 'init': 'nndsvda', 'solver': 'cd', 'max_iter': 100, 'l1_ratio': 0.01}
RMSE: 0.48155787660973476 using_params=> {'n_components': 100, 'init': 'nndsvda', 'solver': 'cd', 'max_iter': 100, 'l1_ratio': 0.1}
RMSE: 0.4814536540689702 using_params=> {'n_components': 100, 'init': 'nndsvda', 'solver': 'cd', 'max_iter': 150, 'l1_ratio': 0.001}
RMSE: 0.4814536540689702 using_params=> {'n_components': 100, 'init': 'nndsvda', 'solver': 'cd', 'max_iter': 150, 'l1_ratio': 0.01}
RMSE: 0.4814536540689702 using_params=> {'n_components': 100, 'init': 'nndsvda', 'solver': 'cd', 'max_iter': 150, 'l1_ratio': 0.1}
RMSE: 0.48140191619187717 using_params=> {'n_components': 100, 'init': 'nndsvda', 'solver': 'cd', 'max_iter': 200, 'l1_ratio': 0.001}
RMSE: 0.48140191619187717 using_params=> {'n_components': 100, 'init': 'nndsvda', 'solver': 'cd', 'max_iter': 200, 'l1_ratio': 0.01}
RMSE: 0.48140191619187717 using_params=> {'n_components': 100, 'init': '

RMSE: 0.4646932988808793 using_params=> {'n_components': 150, 'init': 'nndsvda', 'solver': 'mu', 'max_iter': 100, 'l1_ratio': 0.1}
RMSE: 0.4642533240897834 using_params=> {'n_components': 150, 'init': 'nndsvda', 'solver': 'mu', 'max_iter': 150, 'l1_ratio': 0.001}
RMSE: 0.4642533240897834 using_params=> {'n_components': 150, 'init': 'nndsvda', 'solver': 'mu', 'max_iter': 150, 'l1_ratio': 0.01}
RMSE: 0.4642533240897834 using_params=> {'n_components': 150, 'init': 'nndsvda', 'solver': 'mu', 'max_iter': 150, 'l1_ratio': 0.1}
RMSE: 0.46424209672735517 using_params=> {'n_components': 150, 'init': 'nndsvda', 'solver': 'mu', 'max_iter': 200, 'l1_ratio': 0.001}
RMSE: 0.46424209672735517 using_params=> {'n_components': 150, 'init': 'nndsvda', 'solver': 'mu', 'max_iter': 200, 'l1_ratio': 0.01}
RMSE: 0.46424209672735517 using_params=> {'n_components': 150, 'init': 'nndsvda', 'solver': 'mu', 'max_iter': 200, 'l1_ratio': 0.1}
RMSE: 0.4646932988808793 using_params=> {'n_components': 150, 'init': 'nnd

RMSE: 0.4470211237448343 using_params=> {'n_components': 200, 'init': 'nndsvd', 'solver': 'cd', 'max_iter': 150, 'l1_ratio': 0.001}
RMSE: 0.4470211237448343 using_params=> {'n_components': 200, 'init': 'nndsvd', 'solver': 'cd', 'max_iter': 150, 'l1_ratio': 0.01}
RMSE: 0.4470211237448343 using_params=> {'n_components': 200, 'init': 'nndsvd', 'solver': 'cd', 'max_iter': 150, 'l1_ratio': 0.1}
RMSE: 0.44701054719324795 using_params=> {'n_components': 200, 'init': 'nndsvd', 'solver': 'cd', 'max_iter': 200, 'l1_ratio': 0.001}
RMSE: 0.44701054719324795 using_params=> {'n_components': 200, 'init': 'nndsvd', 'solver': 'cd', 'max_iter': 200, 'l1_ratio': 0.01}
RMSE: 0.44701054719324795 using_params=> {'n_components': 200, 'init': 'nndsvd', 'solver': 'cd', 'max_iter': 200, 'l1_ratio': 0.1}
RMSE: 0.4497707228268728 using_params=> {'n_components': 200, 'init': 'nndsvda', 'solver': 'mu', 'max_iter': 100, 'l1_ratio': 0.001}
RMSE: 0.4497707228268728 using_params=> {'n_components': 200, 'init': 'nndsvda

In [28]:
print(f'BEST NMF Model :::: RMSE: {min_rmse} using_params=> {best_comb}')

BEST NMF Model :::: RMSE: 0.44607421393438035 using_params=> {'n_components': 200, 'init': 'random', 'solver': 'mu', 'max_iter': 150, 'l1_ratio': 0.001}


### grid search 결과 SVD 모델('n_components': 200, 'n_oversamples': 15, 'n_iter': 7)이 가장 좋은 성능을 보임 (RMSE: 0.3794829157735911)

In [32]:
U, Sigma, VT = randomized_svd(A, n_components=200, n_oversamples= 15, n_iter= 7, random_state = 13)
# 분해된 행렬이 올바른 형태로 생성되었는지 확인
print(U.shape, Sigma.shape, VT.shape)

A_approx_svd = np.dot(np.dot(U, np.diag(Sigma)), VT)

# 근사 행렬이 올바른 형태로 생성되었는지 확인
print(A_approx_svd.shape)

(5723, 200) (200,) (200, 2000)
(5723, 2000)


## 추천 결과 확인

In [392]:
problem_list = pd.read_csv(dir + 'problem_list.csv', usecols=['problemId', 'remap_id'])
problem_category_remap_list = pd.read_csv(dir + 'problem_category_remap.csv', usecols=['problemId', 'category'])
tag_list = pd.read_csv(dir + 'tag_list.csv', usecols=['tag_name', 'remap_id'])
print(problem_list)
print(problem_category_remap_list)
print(tag_list)

      problemId  remap_id
0          1000         0
1          1001         1
2          1002         2
3          1003         3
4          1004         4
...         ...       ...
5822      24954      5769
5823      24956      5770
5824      25024      5771
5825       9023      5772
5826      17248      5825

[5827 rows x 2 columns]
       problemId  category
0              0        16
1              0         1
2              0         0
3              1        16
4              1         1
...          ...       ...
14040       5769         1
14041       5770        20
14042       5770         2
14043       5770         0
14044       5771         1

[14045 rows x 2 columns]
                       tag_name  remap_id
0                          math       0.0
1                implementation       1.0
2                            dp       2.0
3                        graphs       3.0
4               data_structures       4.0
..                          ...       ...
183                

In [393]:
problem_list['category_tag'] = ""
print(problem_list)

      problemId  remap_id category_tag
0          1000         0             
1          1001         1             
2          1002         2             
3          1003         3             
4          1004         4             
...         ...       ...          ...
5822      24954      5769             
5823      24956      5770             
5824      25024      5771             
5825       9023      5772             
5826      17248      5825             

[5827 rows x 3 columns]


In [394]:
for idx in range(len(problem_category_remap_list)):
    tag = tag_list[tag_list['remap_id'] == problem_category_remap_list.iloc[idx]['category']]['tag_name'].item()
    tag += "| "
    problem_list.loc[problem_list['remap_id']==problem_category_remap_list.iloc[idx]['problemId'],'category_tag']+=tag

In [395]:
problem_list

Unnamed: 0,problemId,remap_id,category_tag
0,1000,0,arithmetic| implementation| math|
1,1001,1,arithmetic| implementation| math|
2,1002,2,geometry| math|
3,1003,3,dp|
4,1004,4,geometry|
...,...,...,...
5822,24954,5769,bruteforcing| implementation|
5823,24956,5770,combinatorics| dp| math|
5824,25024,5771,implementation|
5825,9023,5772,bfs| graph_traversal| graphs| implementation|


In [396]:
problem_list[problem_list['category_tag']==""]

Unnamed: 0,problemId,remap_id,category_tag
283,1390,282,
454,1666,453,
507,1745,506,
858,2337,852,
990,2540,984,
...,...,...,...
5701,23889,5648,
5711,24024,5658,
5717,24043,5664,
5743,24271,5690,


In [397]:
problem_list = problem_list[problem_list['remap_id'].isin(df_ratings['problemId'].unique())].reset_index(drop=True)

In [398]:
problem_list = problem_list.drop_duplicates(['remap_id'], keep='first').reset_index(drop=True)

In [399]:
problem_list

Unnamed: 0,problemId,remap_id,category_tag
0,1000,0,arithmetic| implementation| math|
1,1001,1,arithmetic| implementation| math|
2,1002,2,geometry| math|
3,1003,3,dp|
4,1004,4,geometry|
...,...,...,...
5718,24937,5768,string|
5719,24954,5769,bruteforcing| implementation|
5720,24956,5770,combinatorics| dp| math|
5721,25024,5771,implementation|


In [401]:
userId_problemId = pd.read_csv(dir + 'userId_problemId.csv', usecols=['userId', 'problemId', 'level', 'title'])
userId_problemId.drop(['userId'], axis=1, inplace=True)
userId_problemId = userId_problemId.drop_duplicates(['problemId'])
userId_problemId = userId_problemId[userId_problemId['problemId'].isin(problem_list['problemId'])]
userId_problemId = userId_problemId.reset_index(drop=True)
# userId_problemId[userId_problemId['problemId']==16234]
userId_problemId

Unnamed: 0,problemId,level,title
0,1000,1,A+B
1,1001,1,A-B
2,1002,7,터렛
3,1003,8,피보나치 함수
4,1005,13,ACM Craft
...,...,...,...
5718,10286,5,Runway Planning
5719,11447,15,Colby’s Costly Collectibles
5720,17304,20,변호사들
5721,18192,15,보고 정렬


In [403]:
problem_list['title'] = ""
problem_list['level'] = 0
for i in range(len(problem_list)):
    title = userId_problemId[userId_problemId['problemId']==problem_list.iloc[i]['problemId']]['title'].item()
    level = userId_problemId[userId_problemId['problemId']==problem_list.iloc[i]['problemId']]['level'].item()
    problem_list.loc[i, 'title'] = title
    problem_list.loc[i, 'level'] = level

In [404]:
problem_list[problem_list['problemId']==16234]

Unnamed: 0,problemId,remap_id,category_tag,title,level
4228,16234,4257,bfs| graph_traversal| graphs| implementation| ...,인구 이동,11


In [405]:
f_ratings = pd.read_csv(dir + 'userId_problemId_remap.csv', usecols=['userId', 'problemId'])
f_ratings = f_ratings.assign(solve=1)
f_ratings.drop_duplicates(inplace = True)
f_ratings = f_ratings.pivot(index = 'problemId', columns = 'userId', values = 'solve').fillna(0)

# len()

In [435]:
def makePredictions(actual, pred, user):
    actual_user_solve = pd.DataFrame(actual).iloc[:, user]
    pred_user_solve = pd.DataFrame(pred).iloc[:, user]
    total_df = pd.concat([actual_user_solve, pred_user_solve, pd.DataFrame(f_ratings.index)], axis = 1)
    total_df.columns = ['solving', 'pred_solving', 'problemId']
    total_df = total_df.merge(problem_list, on='problemId')
    
    solved = total_df[total_df['solving']==1]
    solved = solved.sort_values(by='pred_solving', ascending=False)
    
    not_solved = total_df[total_df['solving']==0]
    not_solved = not_solved.sort_values(by='pred_solving', ascending=False)
    
    total_df = total.sort_values(by='pred_solving', ascending=False)
    return total_df, solved, not_solved

def findMiddle(dataframe):
    mid = len(dataframe)//2
    return dataframe.iloc[mid-5:mid+5]

In [504]:
userNumber = 944

total, prediction_with_solved, prediction_with_notsolved = makePredictions(A, A_approx_svd, userNumber)

In [505]:
prediction_with_solved.head(10)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
229,1.0,1.642394,1309,230,dp|,동물원,10
40,1.0,1.465567,1042,41,dp| parsing| string|,움,20
110,1.0,1.46057,1124,111,math| number_theory| primality_test| sieve|,언더프라임,8
167,1.0,1.342465,1222,168,math| number_theory|,홍준 프로그래밍 대회,14
3,1.0,1.333278,1003,3,dp|,피보나치 함수,8
1227,1.0,1.319161,2929,1248,implementation| parsing| regex| string|,머신 코드,5
776,1.0,1.315728,2216,783,dp|,문자열과 점수,13
1121,1.0,1.307284,2756,1139,geometry| implementation| math| pythagoras|,다트,4
1244,1.0,1.301677,2959,1265,arithmetic| math|,거북이,3
24,1.0,1.283715,1024,24,math|,수열의 합,9


In [431]:
prediction_with_notsolved.head(10)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
1056,0.0,0.822181,2651,1072,dp|,자동차경주대회,12
1055,0.0,0.79803,2650,1071,geometry| implementation|,교차점개수,11
501,0.0,0.649229,1750,510,dp| knapsack| math|,서로소의 개수,14
511,0.0,0.612207,1764,521,data_structures| hash_set| sorting| string|,듣보잡,7
1283,0.0,0.595267,3015,1306,data_structures| stack|,오아시스 재결합,15
903,0.0,0.590874,2450,912,bruteforcing| greedy| implementation|,모양 정돈,14
467,0.0,0.577897,1701,474,kmp| string|,Cubeditor,14
570,0.0,0.564875,1885,580,greedy|,비부분수열,12
1128,0.0,0.513451,2776,1146,binary_search| data_structures| hash_set| sort...,암기왕,7
119,0.0,0.499208,1146,120,dp| math|,지그재그 서기,16


In [417]:
prediction_with_solved.head(10)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
229,1.0,1.642394,1309,230,dp|,동물원,10
40,1.0,1.465567,1042,41,dp| parsing| string|,움,20
110,1.0,1.46057,1124,111,math| number_theory| primality_test| sieve|,언더프라임,8
167,1.0,1.342465,1222,168,math| number_theory|,홍준 프로그래밍 대회,14
3,1.0,1.333278,1003,3,dp|,피보나치 함수,8
1227,1.0,1.319161,2929,1248,implementation| parsing| regex| string|,머신 코드,5
776,1.0,1.315728,2216,783,dp|,문자열과 점수,13
1121,1.0,1.307284,2756,1139,geometry| implementation| math| pythagoras|,다트,4
1244,1.0,1.301677,2959,1265,arithmetic| math|,거북이,3
24,1.0,1.283715,1024,24,math|,수열의 합,9


In [418]:
prediction_with_notsolved.head(10)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
1056,0.0,0.822181,2651,1072,dp|,자동차경주대회,12
1055,0.0,0.79803,2650,1071,geometry| implementation|,교차점개수,11
501,0.0,0.649229,1750,510,dp| knapsack| math|,서로소의 개수,14
511,0.0,0.612207,1764,521,data_structures| hash_set| sorting| string|,듣보잡,7
1283,0.0,0.595267,3015,1306,data_structures| stack|,오아시스 재결합,15
903,0.0,0.590874,2450,912,bruteforcing| greedy| implementation|,모양 정돈,14
467,0.0,0.577897,1701,474,kmp| string|,Cubeditor,14
570,0.0,0.564875,1885,580,greedy|,비부분수열,12
1128,0.0,0.513451,2776,1146,binary_search| data_structures| hash_set| sort...,암기왕,7
119,0.0,0.499208,1146,120,dp| math|,지그재그 서기,16


In [419]:
prediction_with_solved.tail(10)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
600,1.0,0.36239,1933,610,data_structures| priority_queue| sweeping| tre...,스카이라인,15
882,1.0,0.357577,2420,890,arithmetic| implementation| math|,사파리월드,2
1557,1.0,0.348575,4563,1588,math| number_theory|,리벤지 오브 피타고라스,11
1252,1.0,0.34715,2977,1273,binary_search| bruteforcing| parametric_search|,폭탄제조,15
302,1.0,0.344157,1418,305,bruteforcing| math| number_theory| primality_t...,K-세준수,5
420,1.0,0.328259,1613,425,floyd_warshall| graphs|,역사,13
148,1.0,0.32111,1194,149,bfs| bitmask| graph_traversal| graphs|,"달이 차오른다, 가자.",15
1118,1.0,0.285856,2753,1136,implementation| math|,윤년,2
873,1.0,0.284266,2410,881,dp|,2의 멱수의 합,10
320,1.0,0.280147,1446,323,dijkstra| dp| graphs|,지름길,10


In [420]:
prediction_with_notsolved.tail(10)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
1124,0.0,-0.26492,2759,1142,constructive| greedy|,팬케이크 뒤집기,12
1098,0.0,-0.280561,2725,1115,euclidean| math| number_theory| prefix_sum|,보이는 점의 개수,9
162,0.0,-0.290884,1213,163,greedy| implementation| string|,팰린드롬 만들기,7
835,0.0,-0.29527,2310,842,bfs| dfs| graph_traversal| graphs|,어드벤처 게임,11
1116,0.0,-0.315636,2751,1134,sorting|,수 정렬하기 2,6
1097,0.0,-0.334461,2721,1114,implementation| math|,삼각수의 합,3
905,0.0,-0.358688,2452,914,bfs| graph_traversal| graphs|,그리드 게임,20
78,0.0,-0.39162,1085,79,geometry| math|,직사각형에서 탈출,3
332,0.0,-0.445203,1464,335,data_structures| deque| greedy| string|,뒤집기 3,11
128,0.0,-0.458011,1160,129,exponentiation_by_squaring| math|,Random Number Generator,16


In [421]:
findMiddle(prediction_with_solved)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
892,1.0,0.780054,2438,900,implementation|,별 찍기 - 1,3
1247,1.0,0.779947,2963,1268,arbitrary_precision| math|,무한 이진 트리 탐색,13
1114,1.0,0.77946,2749,1132,exponentiation_by_squaring| math|,피보나치 수 3,14
775,1.0,0.775823,2213,782,dp| dp_tree| trees|,트리의 독립집합,15
146,1.0,0.774543,1189,147,backtracking| bruteforcing|,컴백홈,10
171,1.0,0.768836,1230,172,dp|,문자열 거리,15
1661,1.0,0.764348,5218,1698,implementation| math| string|,알파벳 거리,4
518,1.0,0.762173,1774,528,graphs| mst|,우주신과의 교감,13
913,1.0,0.758922,2460,922,arithmetic| implementation| math|,지능형 기차 2,3
72,1.0,0.758811,1078,73,bruteforcing| math|,뒤집음,20


In [422]:
findMiddle(prediction_with_notsolved)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
221,0.0,0.013053,1300,222,binary_search| parametric_search|,K번째 수,14
805,0.0,0.013014,2257,812,data_structures| stack| string|,화학식량,9
681,0.0,0.013008,2064,692,bitmask| implementation|,IP 주소,13
1620,0.0,0.013005,4948,1653,math| number_theory| primality_test| sieve|,베르트랑 공준,9
211,0.0,0.012842,1284,212,arithmetic| implementation| math|,집 주소,3
240,0.0,0.012838,1327,242,bfs| data_structures| graph_traversal| graphs|...,소트 게임,10
89,0.0,0.012779,1097,90,bruteforcing| kmp| string|,마법의 문자열,16
214,0.0,0.012591,1289,215,dp| dp_tree| math| trees|,트리의 가중치,18
1453,0.0,0.012541,3850,1482,bfs| bitmask| dp| dp_bitfield| graph_traversal...,Jumping monkey,20
1627,0.0,0.012527,5000,1660,data_structures| math| segtree|,빵 정렬,19


In [427]:
prediction_with_notsolved[prediction_with_notsolved['level']>=10].head(30)

Unnamed: 0,solving,pred_solving,problemId,remap_id,category_tag,title,level
1056,0.0,0.822181,2651,1072,dp|,자동차경주대회,12
1055,0.0,0.79803,2650,1071,geometry| implementation|,교차점개수,11
501,0.0,0.649229,1750,510,dp| knapsack| math|,서로소의 개수,14
1283,0.0,0.595267,3015,1306,data_structures| stack|,오아시스 재결합,15
903,0.0,0.590874,2450,912,bruteforcing| greedy| implementation|,모양 정돈,14
467,0.0,0.577897,1701,474,kmp| string|,Cubeditor,14
570,0.0,0.564875,1885,580,greedy|,비부분수열,12
119,0.0,0.499208,1146,120,dp| math|,지그재그 서기,16
401,0.0,0.483673,1581,406,bruteforcing| case_work|,락스타 락동호,12
914,0.0,0.480174,2461,923,data_structures| implementation| two_pointer|,대표 선수,14


## 정확도

In [261]:
# predict_solved_correct = len(prediction_with_solved[prediction_with_solved['pred_solving']>=0.5])
# predict_notsolved_correct = len(prediction_with_notsolved[prediction_with_notsolved['pred_solving']<0.5])
# total_correct = predict_solved_correct+predict_notsolved_correct
# total_len = len(prediction_with_solved)+len(prediction_with_notsolved)
# acc = total_correct/total_len
# print(f'유저: {userNumber} 정확도=> {acc}')

유저: 944 정확도=> 0.9748383714834877


In [562]:
def acc(actual, pred, user):
    actual_user_solve = pd.DataFrame(actual).iloc[:, user]
    pred_user_solve = pd.DataFrame(pred).iloc[:, user]
    total_df = pd.concat([actual_user_solve, pred_user_solve, pd.DataFrame(f_ratings.index)], axis = 1)
    total_df.columns = ['solving', 'pred_solving', 'problemId']
    total_df = total_df.merge(problem_list, on='problemId')
    
    solved = total_df[total_df['solving']==1]
    solved = solved.sort_values(by='pred_solving', ascending=False)
    total_df = total_df.sort_values(by='pred_solving', ascending=False)
    total_df = total_df[total_df['level']>=10][:100]
    
    sol = len(total_df[total_df['solving']==1]) # 상위 100개 중 유저가 푼 문제 수 100
    cnt = min(100, len(solved)) # 유저가 푼 전체 문제수 250, 500
    return sol/cnt

In [563]:
ac = 0
for userNumber in range(2000):
    ac += acc(A, A_approx_svd, userNumber)
ac = ac/2000

In [564]:
print(ac)

0.7198674477844279
