In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import joblib
import ast

# Input locations, relative to the notebook's working directory.
# NOTE(review): the top-level folder is spelled 'DataSet' in two paths and 'Dataset'
# in the third; these only point at the same folder on a case-insensitive
# filesystem -- confirm the real directory name.
data_path = '../DataSet/'
lib_path = '../Dataset/lib_dataset/'
encoder_data_path = '../DataSet/Encoder/'


def _load_encoder(file_name):
    """Load one joblib-serialised encoder object from ``encoder_data_path``."""
    # joblib.load unpickles the file, so only point this at trusted artefacts.
    return joblib.load(os.path.join(encoder_data_path, file_name))


# Crawled user / problem tables and the not-yet-encoded lib table.
user_csv = os.path.join(data_path, 'final_Baekjoon_유저_크롤링.csv')
problem_csv = os.path.join(data_path, 'final_Baekjoon_문제_크롤링.csv')
lib_csv = os.path.join(lib_path, 'Non_Encoding_lib_data.csv')

user_df = pd.read_csv(user_csv)
problem_df = pd.read_csv(problem_csv)
lib_df = pd.read_csv(lib_csv)

# Encoders used later to turn user ids, problem numbers and algorithm names into codes.
user_ID_Encoder = _load_encoder('user_ID_Encoder.joblib')
problem_number_Encoder = _load_encoder('problem_number_Encoder.joblib')
problem_algorithm_Encoder = _load_encoder('problem_algorithm_Encoder.joblib')

In [2]:
def get_user_non_problem(user):
    """Build the encoded table of problems that do not appear in `user`'s rows.

    Reads the module-level frames ``user_df``, ``problem_df`` and ``lib_df`` and the
    module-level encoders ``user_ID_Encoder``, ``problem_number_Encoder`` and
    ``problem_algorithm_Encoder``.

    Parameters
    ----------
    user
        A value of the ``user_id`` column of ``user_df`` and ``lib_df``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``problem_df['problem_number']`` that is not among the
        user's ``problem_number`` values in ``user_df`` (presumably the problems the
        user has already solved -- confirm against the crawler). Columns are
        ``user_id``, ``problem_number``, ``user_tier``, ``user_weak_algorithm`` plus
        the ``problem_df`` columns left after the drop below. ``user_id`` and
        ``problem_number`` are passed through their encoders; ``problem_algorithm``
        and ``user_weak_algorithm`` hold, per row, the result of
        ``problem_algorithm_Encoder.transform`` on that row's list.

    Raises
    ------
    KeyError
        If ``user`` has no rows in ``user_df``.
    IndexError
        If ``user`` has no row in ``lib_df``.
    """
    # Select this user's rows directly instead of grouping the whole table on
    # every call just to read one entry out of the resulting dict.
    user_rows = user_df.loc[user_df['user_id'] == user]
    if user_rows.empty:
        # The previous dict lookup raised KeyError for an unknown user; keep that.
        raise KeyError(user)

    # A set makes the "not already present" test below O(1) per problem.
    seen_problems = set(user_rows['problem_number'])
    user_tier = user_rows['user_tier'].iloc[0]

    # The weak-algorithm column stores a list literal as text; only its first
    # five entries are used.
    weak_literal = lib_df.loc[lib_df['user_id'] == user, 'user_weak_algorithm'].iloc[0]
    user_weak_algorithm = ast.literal_eval(weak_literal)[:5]

    candidates = [
        number for number in problem_df['problem_number']
        if number not in seen_problems
    ]
    row_count = len(candidates)

    user_non_problem_df = pd.DataFrame({
        'user_id': [user] * row_count,
        'problem_number': candidates,
        'user_tier': [user_tier] * row_count,
        'user_weak_algorithm': [user_weak_algorithm] * row_count,
    })

    # Problem-level columns that are not carried into the candidate table.
    semi_problem_df = problem_df.drop(columns=[
        'problem_name',
        'problem_time_condition', 'problem_memory_condition',
        'problem_submission_count', 'problem_answer_submission_count',
        'problem_answered_people_count', 'problem_correct_rate',
        'problem_avg_try',
    ])

    user_non_problem_df = user_non_problem_df.merge(
        semi_problem_df, on=['problem_number'], how='left'
    )

    # problem_algorithm is parsed from its text form before it can be encoded.
    user_non_problem_df['problem_algorithm'] = (
        user_non_problem_df['problem_algorithm'].apply(ast.literal_eval)
    )

    user_non_problem_df['user_id'] = user_ID_Encoder.transform(user_non_problem_df['user_id'])
    user_non_problem_df['problem_number'] = problem_number_Encoder.transform(
        user_non_problem_df['problem_number']
    )
    # Both algorithm columns hold lists, so they are encoded one row at a time
    # with the same encoder.
    # NOTE(review): user_weak_algorithm is the same list on every row; encoding it
    # once and reusing the result would avoid one transform call per row.
    for list_column in ('problem_algorithm', 'user_weak_algorithm'):
        user_non_problem_df[list_column] = user_non_problem_df[list_column].apply(
            lambda labels: problem_algorithm_Encoder.transform(labels)
        )

    return user_non_problem_df

In [3]:
# Numbers that create_user_ffm_data interpolates as the last component of each
# colon-separated token it writes (first component: a fixed 0-6 position,
# second: the encoded value, third: one of these constants).

# Identifier columns.
user_id_field = 1
problem_number_field = 1

# Tier columns.
user_tier_field = 0.5
problem_tier_field = 0.5

# List-valued columns: the constant is divided by the list length when written.
problem_algorithm_field = 1
user_weak_algorithm_field = 5

# Written as 0 -- presumably to neutralise this column; confirm before changing.
cat_problem_avg_try_field = 0

In [5]:
def _format_ffm_row(row):
    """Render one row of the candidate table as a single text line (no newline).

    The line is a run of ``a:b:c`` tokens, each followed by one space: five
    scalar tokens (positions 0-4), then one position-5 token per entry of
    ``row['problem_algorithm']`` and one position-6 token per entry of
    ``row['user_weak_algorithm']``. Reads the module-level ``*_field`` constants.
    """
    tokens = [
        f"0:{row['user_id']}:{user_id_field}",
        f"1:{row['user_tier']}:{user_tier_field}",
        f"2:{row['problem_number']}:{problem_number_field}",
        f"3:{row['problem_tier']}:{problem_tier_field}",
        f"4:{row['cat_problem_avg_try']}:{cat_problem_avg_try_field}",
    ]

    # The problem's algorithms share problem_algorithm_field equally.
    problem_algorithms = row['problem_algorithm']
    algorithm_count = len(problem_algorithms)
    tokens.extend(
        f"5:{algorithm_num}:{(problem_algorithm_field / algorithm_count):.2f}"
        for algorithm_num in problem_algorithms
    )

    # Weak algorithms share user_weak_algorithm_field equally, then each later
    # position is scaled down by a further 0.1 (factor 1 - position / 10).
    weak_algorithms = row['user_weak_algorithm']
    weak_count = len(weak_algorithms)
    tokens.extend(
        f"6:{algorithm_num}:{(user_weak_algorithm_field / weak_count * (1 - position / 10)):.2f}"
        for position, algorithm_num in enumerate(weak_algorithms)
    )

    # Every token, including the last one, is followed by a single space.
    return ' '.join(tokens) + ' '


def create_user_ffm_data(user_name, result_df):
    """Write the candidate rows of one user to two text files.

    Creates ``../Dataset/user_libffm_data_folder/<user_name>/`` if needed and
    writes ``<user_name>_libffm_binary.txt`` and ``<user_name>_libffm_reg.txt``
    there. Both files receive exactly the same lines, one per row of
    ``result_df``, in row order.

    Parameters
    ----------
    user_name : str
        Used for the output folder and file names.
    result_df : pandas.DataFrame
        Must provide the columns ``user_id``, ``user_tier``, ``problem_number``,
        ``problem_tier``, ``cat_problem_avg_try``, ``problem_algorithm`` and
        ``user_weak_algorithm``; the last two must be sized iterables per row.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): the lines start directly with the ``0:`` token, i.e. there is
    no leading label column -- confirm that the consumer of these files expects
    unlabeled rows.
    """
    user_libffm_path = f'../Dataset/user_libffm_data_folder/{user_name}/'
    os.makedirs(user_libffm_path, exist_ok=True)

    binary_path = os.path.join(user_libffm_path, f'{user_name}_libffm_binary.txt')
    reg_path = os.path.join(user_libffm_path, f'{user_name}_libffm_reg.txt')

    # The two outputs are identical, so format each row once and write it to
    # both files in a single pass over the frame.
    with open(binary_path, 'w') as binary_file, open(reg_path, 'w') as reg_file:
        for _, row in result_df.iterrows():
            line = _format_ffm_row(row) + '\n'
            binary_file.write(line)
            reg_file.write(line)

In [4]:
# One row per user: keep the first lib_df row of every user_id, restricted to
# the user-level columns, with a fresh 0..n-1 index.
user_columns = ['user_id', 'user_tier', 'user_weak_algorithm']
first_row_per_user = lib_df.drop_duplicates(subset='user_id')
user_table = first_row_per_user[user_columns].reset_index(drop=True)
user_table

Unnamed: 0,user_id,user_tier,user_weak_algorithm
0,teddybear1024,21,"['자료 구조', '그래프 이론', '수학', '다이나믹 프로그래밍', '그리디 알..."
1,vladimir11,21,"['자료 구조', '그래프 이론', '다이나믹 프로그래밍', '수학', '그래프 탐..."
2,minkim3987,21,"['자료 구조', '다이나믹 프로그래밍', '세그먼트 트리', '그래프 이론', '..."
3,movie_jo,21,"['수학', '다이나믹 프로그래밍', '그래프 이론', '자료 구조', '구현', ..."
4,mymasterpark2,21,"['자료 구조', '다이나믹 프로그래밍', '수학', '그래프 이론', '구현', ..."
...,...,...,...
10102,dldkfma9577,15,"['그래프 이론', '그래프 탐색', '구현', '자료 구조', '너비 우선 탐색'..."
10103,goodjm0698,15,"['다이나믹 프로그래밍', '그래프 이론', '그래프 탐색', '수학', '자료 구..."
10104,grace9350,15,"['정렬', '자료 구조', '구현', '그래프 이론', '수학', '문자열', '..."
10105,hjlim7831,15,"['구현', '다이나믹 프로그래밍', '그래프 이론', '수학', '그래프 탐색',..."


In [10]:
# Build the list here: its only other assignment is in the last cell of the
# notebook, so on a fresh kernel this lookup would otherwise hit a NameError.
user_name_list = list(user_table['user_id'])
user_name_list.index('ahngj96')

6342

In [12]:
# Users up to and including this one are skipped by the export loop
# (presumably they were already exported by an earlier run -- confirm).
last_completed_user = 'ahngj96'

# Derive the slice from user_table and the marker user instead of relying on a
# list defined in a later cell and on a position copied from another output.
all_user_names = list(user_table['user_id'])
complete_user_name_list = all_user_names[:all_user_names.index(last_completed_user) + 1]

In [14]:
# Sanity check: display the last entry of the slice of users that will be skipped.
complete_user_name_list[-1]

'ahngj96'

In [15]:
user_name_list = list(user_table['user_id'])

# Set lookup instead of scanning the whole completed list once per user.
completed_users = set(complete_user_name_list)

for user_name in user_name_list:
    if user_name in completed_users:
        continue
    result_df = get_user_non_problem(user_name)
    create_user_ffm_data(user_name, result_df)
    # Progress log: one line per user whose files were written.
    print(user_name)

bababrll
bbb32
bulnabang99
claire9162
cyan4s
dkdud9261
dong0886
dqdq4197
ehddn5252
ekfsla0113
havegood123
hpk23
jenabill
jeonyj0609
jvlover
kse
lightbulb
lwc421
minryul
qorwnsjc
qudrnr217
rabbit64
salyuan
smc3843
t9rres
tkdgus115
whraaqq
wjdgml0078
wkdwoo
wnw1102
wogus230
93hschoi
alswo5419
anacoluthon
as2680
bbwwpark
blacksooooo
dhrdlxl
ebseud6135
egg528
gomharuharu
granularjarl131
jhw919
jieun21124
jjun21004
jsungmin6
jungwoo7250
khk7129
kmellon
leeholeo
music468
paparoni
younprize
alswo96
clownpiece
cosmo8724
cwh73090
dbtntkd456
devjgkim
dkdlel1z
jake0601
josungyuk
kitty1463
kmjn111
lyh4186
magicpython
maum97
nns503
pdh5056
qhrb1997
rinnnt
rjsgh1232
semtax
seowj0710
shb04016
shfur2006
tjdgus3160
uaudgml98
wnsgh2933
wrist0neye
yannoo
zezs3535
zxzxs112
anirun
belluga
beowolf4565
cjhgg2
ckdrb7067
devappmin
dltkdtn56
flrmdhqmwnsgk0062
gloomypotato
hyun0223
iskull
jaehyeon96
jaeworld2002
jin9497young
jinung1055
jjabkk
jjee1212
jjiiyyuu
junge2u
kpg0518
ks7733
parkwc0213
pinkku
qkrdydrjs7
