In [None]:
!pip install bottleneck

In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import bottleneck as bn

In [2]:
DATA_DIR = '/content' # Write your own directory
RANDOM_SEED = 98765
FOLD_SIZE = 3000
NUM_TOP_PROBLEMS = 10
np.random.seed(98765)

In [3]:
raw_data = pd.read_csv(os.path.join(DATA_DIR, 'problems_solved.csv'), header=0)
raw_data

Unnamed: 0,id,handle,problems
0,1,koosaga,"1000,1001,1002,1003,1004,1005,1006,1007,1008,1..."
1,2,cki86201,"1000,1001,1002,1003,1004,1005,1006,1007,1008,1..."
2,3,mitnegativeinfinity,"1000,1001,1019,1056,1067,1144,1150,1311,1372,1..."
3,4,ainta,"1000,1001,1002,1003,1004,1005,1007,1008,1009,1..."
4,5,yclock,"1000,1001,1002,1003,1004,1005,1007,1008,1009,1..."
...,...,...,...
63871,63872,ohjcms1,25571071815596
63872,63873,steven1010,2557
63873,63874,wwowwww,"1000,1001,1008,1330,2438,2439,2557,2588,2739,2..."
63874,63875,yijh061016,


In [4]:
df_user_problems = raw_data[['handle', 'problems']]
df_user_problems

Unnamed: 0,handle,problems
0,koosaga,"1000,1001,1002,1003,1004,1005,1006,1007,1008,1..."
1,cki86201,"1000,1001,1002,1003,1004,1005,1006,1007,1008,1..."
2,mitnegativeinfinity,"1000,1001,1019,1056,1067,1144,1150,1311,1372,1..."
3,ainta,"1000,1001,1002,1003,1004,1005,1007,1008,1009,1..."
4,yclock,"1000,1001,1002,1003,1004,1005,1007,1008,1009,1..."
...,...,...
63871,ohjcms1,25571071815596
63872,steven1010,2557
63873,wwowwww,"1000,1001,1008,1330,2438,2439,2557,2588,2739,2..."
63874,yijh061016,


In [5]:
df_user_problems.problems = df_user_problems.problems.str.split(',')
df_user_problems

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user_problems.problems = df_user_problems.problems.str.split(',')


Unnamed: 0,handle,problems
0,koosaga,"[1000, 1001, 1002, 1003, 1004, 1005, 1006, 100..."
1,cki86201,"[1000, 1001, 1002, 1003, 1004, 1005, 1006, 100..."
2,mitnegativeinfinity,"[1000, 1001, 1019, 1056, 1067, 1144, 1150, 131..."
3,ainta,"[1000, 1001, 1002, 1003, 1004, 1005, 1007, 100..."
4,yclock,"[1000, 1001, 1002, 1003, 1004, 1005, 1007, 100..."
...,...,...
63871,ohjcms1,"[2557, 10718, 15596]"
63872,steven1010,[2557]
63873,wwowwww,"[1000, 1001, 1008, 1330, 2438, 2439, 2557, 258..."
63874,yijh061016,


In [6]:
df_user_problems = df_user_problems.explode('problems').reset_index(drop=True)
df_user_problems = df_user_problems.dropna(axis=0)
df_user_problems

Unnamed: 0,handle,problems
0,koosaga,1000
1,koosaga,1001
2,koosaga,1002
3,koosaga,1003
4,koosaga,1004
...,...,...
8509347,wwowwww,10998
8509348,wwowwww,11021
8509349,wwowwww,11022
8509350,wwowwww,14681


In [7]:
df_user_problems = df_user_problems.astype({'handle':'str', 'problems': np.uint16})
df_user_problems

Unnamed: 0,handle,problems
0,koosaga,1000
1,koosaga,1001
2,koosaga,1002
3,koosaga,1003
4,koosaga,1004
...,...,...
8509347,wwowwww,10998
8509348,wwowwww,11021
8509349,wwowwww,11022
8509350,wwowwww,14681


In [8]:
def get_interaction_count(df, id):
    interaction_count_groupby_id = df[[id]].groupby(id, as_index=False)
    grouped_count = interaction_count_groupby_id.size()
    return grouped_count

In [9]:
def filter_data(df, min_user_interaction, min_problem_interaction):
    user_interaction_count = get_interaction_count(df, 'handle')
    problem_interaction_count = get_interaction_count(df, 'problems')

    print(f"Size of Dataframe Before Filtering: {df.size}")

    if min_user_interaction > 0:
        df = df[df['handle'].isin(user_interaction_count[user_interaction_count['size'] >= min_user_interaction]['handle'])]

    print(f"Size of Dataframe After User Filtering: {df.size}")

    if min_problem_interaction > 0:
        df = df[df['problems'].isin(problem_interaction_count[problem_interaction_count['size'] >= min_problem_interaction]['problems'])]

    print(f"Size of Dataframe After Problem Filtering: {df.size}")

    return df, user_interaction_count, problem_interaction_count

In [10]:
df_user_problems_filtered, user_activity, problem_popularity = filter_data(df_user_problems, 5, 10)

Size of Dataframe Before Filtering: 17018064
Size of Dataframe After User Filtering: 17005344
Size of Dataframe After Problem Filtering: 16947992


In [11]:
df_user_problems_filtered

Unnamed: 0,handle,problems
0,koosaga,1000
1,koosaga,1001
2,koosaga,1002
3,koosaga,1003
4,koosaga,1004
...,...,...
8509347,wwowwww,10998
8509348,wwowwww,11021
8509349,wwowwww,11022
8509350,wwowwww,14681


In [12]:
user_activity

Unnamed: 0,handle,size
0,0000000000,734
1,0000064,136
2,0000sb,10
3,0004poppop,145
4,000ian,1
...,...,...
63549,zzzzlll,6
63550,zzzzong123,6
63551,zzzzz9887,27
63552,zzzzzzzz,67


In [13]:
problem_popularity

Unnamed: 0,problems,size
0,1000,52278
1,1001,47009
2,1002,16183
3,1003,22466
4,1004,4686
...,...,...
19716,25070,1
19717,25071,1
19718,25074,1
19719,25075,1


# EASE(Embarrassingly Shallow Autoencoders for Sparse Data)
EASE모델은 validation이 필요가 없으므로 학습 데이터셋과 검증 데이터셋으로 나누지 않는다.

In [14]:
unique_user_id = pd.unique(df_user_problems_filtered['handle'])
index_permutation = np.random.permutation(unique_user_id.size)
unique_user_id_shuffled = unique_user_id[index_permutation]
print(f"Before shuffling\n {unique_user_id}\n")
print(f"After shuffling\n {unique_user_id_shuffled}")

Before shuffling
 ['koosaga' 'cki86201' 'mitnegativeinfinity' ... 'hihimotga' 'number'
 'wwowwww']

After shuffling
 ['goonbamm' 'junqwoopark' 'skwlghddlek' ... 'ogisoo' 'doublej20'
 'afternoon']


In [15]:
unique_problem_id = pd.unique(df_user_problems_filtered['problems'])
print(unique_problem_id)

[1000 1001 1002 ... 6677 2071 1845]


In [16]:
user_to_id = dict((str(user_id), int(id)) for (id, user_id) in enumerate(unique_user_id))
problem_to_id = dict((int(problem_id), int(id)) for (id, problem_id) in enumerate(unique_problem_id))

In [17]:
id_to_user = dict((int(id), str(user_id)) for user_id, id in user_to_id.items())
id_to_problem = dict((int(id), int(problem_id)) for problem_id, id in problem_to_id.items())

In [18]:
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

with open(os.path.join(DATA_DIR, 'user_to_id.json'), 'w', encoding="utf-8") as f:
    json.dump(user_to_id, f, ensure_ascii=False, indent="\t")

with open(os.path.join(DATA_DIR, 'problem_to_id.json'), 'w', encoding="utf-8") as f:
    json.dump(problem_to_id, f, ensure_ascii=False, indent="\t")

In [19]:
num_users = unique_user_id_shuffled.size
num_heldout_users = FOLD_SIZE
print(f"Number of Users: {num_users}")
print(f"Number of Fold Size: {num_heldout_users}")

Number of Users: 61124
Number of Fold Size: 3000


In [20]:
def numerize(df, user_to_id, problem_to_id):
    user_id = df['handle'].apply(lambda x: user_to_id[str(x)])
    problem_id = df['problems'].apply(lambda x: problem_to_id[str(x)])

    return pd.DataFrame(data={'handle': user_id, 'problem': problem_id}, columns=['handle', 'problem'])

In [21]:
def denumerize(df, id_to_user, id_to_problem):
    user_id = df['user_id'].apply(lambda x: id_to_user[x])
    problem_id = df['problem_id'].apply(lambda x: id_to_problem[x])

    return pd.DataFrame(data={'handle': user_id, 'problem': problem_id}, columns=['handle', 'problem'])

In [22]:
with open(os.path.join(DATA_DIR, 'user_to_id.json'), 'r', encoding="utf-8") as f:
    user_to_id = json.load(f)

with open(os.path.join(DATA_DIR, 'problem_to_id.json'), 'r', encoding="utf-8") as f:
    problem_to_id = json.load(f)

In [23]:
df_for_ease = numerize(df_user_problems_filtered, user_to_id, problem_to_id)
df_for_ease

Unnamed: 0,handle,problem
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
8509347,61123,3646
8509348,61123,3661
8509349,61123,3662
8509350,61123,5307


In [24]:
df_for_ease['solve'] = [1] * len(df_for_ease)
df_for_ease

Unnamed: 0,handle,problem,solve
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1
...,...,...,...
8509347,61123,3646,1
8509348,61123,3661,1
8509349,61123,3662,1
8509350,61123,5307,1


In [25]:
pivot_table = df_for_ease.pivot_table(index=["handle"], columns=["problem"], values="solve")
X = pivot_table.to_numpy()
X = np.nan_to_num(X)

In [26]:
print(X)

[[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]]


In [27]:
class EASE:
    """
    Embarrassingly Shallow Autoencoders model class
    """

    def __init__(self, lambda_):
        self.B = None
        self.lambda_ = lambda_

    def train(self, interaction_matrix):
        """
        train pass
        :param interaction_matrix: interaction_matrix
        """
        G = interaction_matrix.T @ interaction_matrix
        diag = list(range(G.shape[0]))
        G[diag, diag] += self.lambda_
        P = np.linalg.inv(G)

        # B = P * (X^T * X − diagMat(γ))
        self.B = P / -np.diag(P)
        min_dim = min(*self.B.shape)
        self.B[range(min_dim), range(min_dim)] = 0

    def forward(self, user_row):
        """
        forward pass
        """
        return user_row @ self.B

In [28]:
ease = EASE(300)
ease.train(X)

In [29]:
result = ease.forward(X[:, :])
print(result)

[[ 1.00727725e+00  1.02440200e+00  1.01068748e+00 ...  1.11855588e-03
   3.12755723e-03 -5.54864549e-03]
 [ 1.00036939e+00  1.01667548e+00  9.88474423e-01 ...  1.31113134e-03
   6.84148040e-03  2.71095626e-03]
 [ 1.08367825e+00  8.31202138e-01  1.67785703e-02 ... -5.30959702e-03
   4.92269876e-03  5.50936932e-03]
 ...
 [-1.71914208e-02  7.86388777e-03  5.43994123e-02 ...  6.76805079e-04
   8.47921342e-04 -3.42467570e-03]
 [-3.50431692e-03 -2.41779668e-02 -6.70369826e-02 ... -1.65399198e-03
  -4.73558723e-03  8.10726865e-03]
 [ 9.85755405e-01  9.85760876e-01  8.15752727e-03 ... -1.13759033e-04
  -3.53057683e-04  7.46113530e-04]]


In [30]:
print(X.nonzero())

(array([    0,     0,     0, ..., 61123, 61123, 61123]), array([   0,    1,    2, ..., 3662, 5307, 5729]))


In [31]:
result[X.nonzero()] = -np.inf
print(result)

[[       -inf        -inf        -inf ...  0.00111856  0.00312756
  -0.00554865]
 [       -inf        -inf        -inf ...  0.00131113  0.00684148
   0.00271096]
 [       -inf        -inf  0.01677857 ... -0.0053096   0.0049227
   0.00550937]
 ...
 [-0.01719142  0.00786389  0.05439941 ...  0.00067681  0.00084792
  -0.00342468]
 [-0.00350432 -0.02417797 -0.06703698 ... -0.00165399 -0.00473559
   0.00810727]
 [       -inf        -inf  0.00815753 ... -0.00011376 -0.00035306
   0.00074611]]


In [32]:
top_problems_by_user = bn.argpartition(-result, NUM_TOP_PROBLEMS, axis=1)[:, :NUM_TOP_PROBLEMS]
print(top_problems_by_user)

[[8844 8783 8807 ... 8725 8912 9010]
 [3158 5573 5979 ... 2593 3220 3442]
 [6323 6268 1835 ... 7783 7030 4395]
 ...
 [8476 8467 8464 ... 3602 5920 8457]
 [7250 3679 3524 ... 8494 3027  755]
 [  87 1012 1008 ... 3490 5768 1352]]


In [33]:
user_result = []
problem_result =[]

for id, top_k in enumerate(top_problems_by_user):
    user_result.extend([id] * NUM_TOP_PROBLEMS)
    problem_result.extend(top_k)

df_user_result = pd.DataFrame(user_result, columns=['user_id'])
df_problem_result = pd.DataFrame(problem_result, columns=['problem_id'])
df_result = pd.concat([df_user_result, df_problem_result], axis=1)

In [34]:
df_result

Unnamed: 0,user_id,problem_id
0,0,8844
1,0,8783
2,0,8807
3,0,8745
4,0,8841
...,...,...
611235,61123,3944
611236,61123,1026
611237,61123,3490
611238,61123,5768


In [35]:
df_infer = denumerize(df_result, id_to_user, id_to_problem)
df_infer.columns = ['handle', 'problem']
df_infer = df_infer.sort_values('handle')
df_infer

Unnamed: 0,handle,problem
7749,0000000000,2010
7748,0000000000,3004
7741,0000000000,2231
7742,0000000000,2743
7743,0000000000,13866
...,...,...
432753,zzzzzzzzu2,2839
432752,zzzzzzzzu2,10809
432751,zzzzzzzzu2,11654
432750,zzzzzzzzu2,1157


In [36]:
df_infer[df_infer['handle'] == 'glanceyes']['problem']

3891     6236
3890    10266
3893    10953
3894     8111
3896     4781
3897     1620
3898    14918
3899     1550
3892    10211
3895     9202
Name: problem, dtype: int64

In [37]:
df_infer.to_csv(os.path.join(DATA_DIR, 'inference.csv'), index=False)

In [39]:
with open(os.path.join(DATA_DIR, "model.txt"), 'wb') as f:
    pickle.dump(ease, f)