In [None]:
import os
import glob
import random
import pandas as pd
import tiktoken

import datetime
from collections import Counter
from IPython.display import display, Markdown


# Load the per-user table and merge the 'History' and 'Train' columns into a
# single 'History' string; the 'Train' column is dropped afterwards.
user = pd.read_csv('data/user.tsv', sep='\t', names=['User', 'History', 'Train', 'Question'])
# An empty TSV field is parsed as NaN, and `str + NaN` evaluates to NaN, so a
# user missing either column would silently lose the other one as well.
# Treat missing values as empty strings before concatenating.
# NOTE(review): plain concatenation assumes every serialized entry ends with
# ';' (as the 'Question' column appears to) — confirm against the data.
user['History'] = user['History'].fillna('') + user['Train'].fillna('')
user = user.drop(columns=['Train'])

# Column layout shared by every header-less news TSV read below.
NEWS_COLUMNS = ['News ID', 'Publish', 'Title', 'Click time history', 'Category']


def _read_news(path):
    """Read a header-less news TSV at `path`, parsing 'Publish' as datetimes."""
    return pd.read_csv(path, sep='\t', names=NEWS_COLUMNS, parse_dates=['Publish'])


def _split_category(news):
    """Return a new frame with 'Category' split into 'Category' and 'SubCategory'.

    The raw 'Category' value is split on the first '|' only: the left part
    becomes 'Category' and the remainder becomes 'SubCategory'. Rows without a
    '|' (or with a missing category) get a missing 'SubCategory'. The input
    frame is not modified.
    """
    # n=1 caps the split at two pieces and reindex guarantees exactly the
    # columns 0 and 1, so the result always has two columns regardless of how
    # many '|' characters the data happens to contain.
    parts = news['Category'].str.split('|', n=1, expand=True).reindex(columns=[0, 1])
    # Drop the raw column first so the new columns are appended at the end in
    # the order Category, SubCategory.
    return news.drop(columns='Category').assign(Category=parts[0], SubCategory=parts[1])


# History-period and train-period articles are pooled into one frame; the raw
# train frame stays available under its own name.
history_news = _read_news('data/history/news.tsv')
train_news = _read_news('data/train/news.tsv')
history_news = _split_category(pd.concat([history_news, train_news], ignore_index=True))

# Test-period articles go through the same category split.
question_news = _split_category(_read_news('data/test/news.tsv'))

# Order both news frames by publish time (ascending) and renumber the rows
# from 0 so the index follows the sorted order.
history_news_sorted, question_news_sorted = (
    news_frame.sort_values(by='Publish').reset_index(drop=True)
    for news_frame in (history_news, question_news)
)

# Behavior logs: header-less TSVs for the train and test splits, read with the
# same four column names.
behavior_columns = ['User ID', 'Click time', 'Click history', 'click']
train_behaviors, test_behaviors = (
    pd.read_csv(tsv_path, sep='\t', names=behavior_columns)
    for tsv_path in ('data/train/behaviors.tsv', 'data/test/behaviors.tsv')
)

In [5]:
# Render each frame under its own Markdown heading; display() shows its
# arguments one after another, in the order given.
display(
    Markdown("### train_behaviors 출력<hr/>"),
    train_behaviors,
    Markdown("### history_news 출력<hr/>"),
    history_news,
    Markdown("### user 출력<hr/>"),
    user,
)

### train_behaviors 출력<hr/>

Unnamed: 0,User ID,Click time,Click history,click
0,U21742,2017-01-04 08:00:05,N1057 N904,N1517-1
1,U1133,2017-01-04 08:00:05,N868 N1412 N387 N757 N2 N335 N317 N1344 N318 N...,N1518-1
2,U4912,2017-01-04 08:00:07,N2 N318 N829 N1342 N757 N1152 N1349 N1304 N86 ...,N1519-1
3,U13359,2017-01-04 08:00:08,N1057 N1412 N879 N1432 N1444 N1092 N25 N830 N9...,N1520-1
4,U11615,2017-01-04 08:00:09,N1057 N1464 N1233 N25 N1349 N1419 N317 N329 N1...,N1521-1
...,...,...,...,...
295972,U2190,2017-01-07 07:59:50,N868 N843 N494 N2 N318 N837 N657 N1469 N1449 N...,N3433-1
295973,U2912,2017-01-07 07:59:52,N1057 N868 N757 N933 N1092 N25 N230 N1 N426 N1...,N3504-1
295974,U8298,2017-01-07 07:59:55,N533 N2 N333 N335 N385 N30 N329 N318 N230 N408...,N3482-1
295975,U4757,2017-01-07 07:59:58,N533 N1412 N1202 N25 N335 N329 N1304 N1469 N13...,N3264-1


### history_news 출력<hr/>

Unnamed: 0,News ID,Publish,Title,Click time history,Category,SubCategory
0,N1,2016-12-31 17:13:57,Se lesernes nyttårsbilder,"2017-01-01 08:00:02,2017-01-01 08:00:09,2017-0...",,
1,N2,2016-12-31 18:06:21,Her koker det over for Tønseth. Så stakk han f...,"2017-01-01 08:00:15,2017-01-01 08:00:52,2017-0...",100sport,vintersport
2,N3,2016-12-31 15:48:48,- Det blir fyrverkeri,"2017-01-01 08:00:36,2017-01-01 08:00:47,2017-0...",nyheter,trondheim
3,N4,2016-12-31 10:29:14,Norsk løper reiser hjem fra Tour de Ski,"2017-01-01 08:01:51,2017-01-01 08:15:15,2017-0...",100sport,vintersport
4,N5,2016-12-30 10:26:28,Mann (78) funnet død i en kum ved riksvei i Gj...,"2017-01-01 08:02:15,2017-01-01 08:05:55,2017-0...",nyheter,moreromsdal
...,...,...,...,...,...,...
3531,N3532,2016-01-07 19:24:10,Slik kan ny E6 mellom Kvål og Hovin bli,"2017-01-07 07:52:03,2017-01-07 07:52:40",nyheter,sortrondelag
3532,N3533,2009-10-02 13:09:25,En atombombe over Trondheim,2017-01-07 07:54:30,nyheter,dokument
3533,N3534,2013-09-14 11:08:59,Trøndersk joik åpner Disney-film,2017-01-07 07:54:33,,
3534,N3535,2016-11-23 13:26:29,Lerøy Midt er skuffet over Mattilsynet etter l...,2017-01-07 07:54:47,pluss,nyheter


### user 출력<hr/>

Unnamed: 0,User,History,Question
0,U1,"N1,2016-12-31 17:13:57,2017-01-01 08:00:02;N1,...","N4296,2017-01-07 13:08:20,2017-01-08 01:35:11;"
1,U2,"N1,2016-12-31 17:13:57,2017-01-01 08:00:09;N1,...","N3589,2017-01-06 07:00:00,2017-01-07 18:06:12;..."
2,U3,"N1,2016-12-31 17:13:57,2017-01-01 08:00:09;N1,...","N4296,2017-01-07 13:08:20,2017-01-08 00:53:08;"
3,U4,"N2,2016-12-31 18:06:21,2017-01-01 08:00:15;N2,...","N4475,2017-01-07 18:55:37,2017-01-08 04:28:54;"
4,U5,"N3,2016-12-31 15:48:48,2017-01-01 08:00:36;N3,...","N3539,2017-01-06 22:06:33,2017-01-07 12:08:25;..."
...,...,...,...
21934,U21935,"N1412,2017-01-03 18:24:38,2017-01-04 07:39:56;...","N3890,2017-01-07 07:00:52,2017-01-08 07:43:48;..."
21935,U21936,"N1448,2017-01-03 20:12:15,2017-01-04 07:40:22;...","N4274,2017-01-07 13:51:45,2017-01-08 00:22:07;..."
21936,U21937,"N1379,2017-01-03 16:23:27,2017-01-04 07:44:48;...","N4125,2017-01-07 10:36:22,2017-01-07 20:36:47;..."
21937,U21938,"N1504,2017-01-03 21:56:31,2017-01-04 07:50:25;...","N3556,2017-01-06 18:49:28,2017-01-07 09:45:59;..."
