In [1]:
import pandas as pd
from IPython.display import display, Markdown
import importlib
from config import model_name
from ast import literal_eval
import torch

# <span style="color: #2D3748; background-color:#fff5b1;">__init__</span>

In [2]:
config = getattr(importlib.import_module('config'), f"{model_name}Config")

behaviors_path = 'Adressa_5w/train/behaviors_parsed_ns8_lt36_3_0.tsv'
news_path = 'Adressa_5w/train/news_parsed.tsv'

# 1. behaviors_parsed
behaviors_parsed = pd.read_table(behaviors_path)

# 2. news_parsed
news_parsed = pd.read_table(
        news_path,
        index_col='id',
        usecols=['id'] + config.dataset_attributes['news'],
        converters={
            attribute: literal_eval
            for attribute in set(config.dataset_attributes['news']) & set([
                'title', 'abstract', 'title_entities', 'abstract_entities',"category_word",
                'title_roberta', 'title_mask_roberta', 'abstract_roberta',
                'abstract_mask_roberta'
            ])
        })

# 3. news_id2int
news_id2int = {x: i for i, x in enumerate(news_parsed.index)}

# 4. news2dict
news2dict = news_parsed.to_dict('index')
for key1 in news2dict.keys():
    for key2 in news2dict[key1].keys():
        news2dict[key1][key2] = torch.tensor(
            news2dict[key1][key2])
        
# 5. padding
padding_all = {
    'category': 0,
    'subcategory': 0, "category_word" : [0] * (config.negative_sampling_ratio + 1),
    'title': [0] * config.num_words_title,
    'abstract': [0] * config.num_words_abstract,
    'title_entities': [0] * config.num_words_title,
    'abstract_entities': [0] * config.num_words_abstract,
    'title_roberta': [0] * config.num_words_title,
    'title_mask_roberta': [0] * config.num_words_title,
    'abstract_roberta': [0] * config.num_words_abstract,
    'abstract_mask_roberta': [0] * config.num_words_abstract
}
for key in padding_all.keys():
    padding_all[key] = torch.tensor(padding_all[key])

padding = { # 실제 사용될 패딩 정보만 필터링하여 저장
    k: v
    for k, v in padding_all.items()
    if k in config.dataset_attributes['news']
}

In [3]:
display(Markdown("### 1. behaviors_parsed<hr/>"))
display(behaviors_parsed)
display(Markdown("### 2. news_parsed<hr/>"))
display(news_parsed)
display(Markdown("### 3. news_id2int<hr/>"))
display(news_id2int)
display(Markdown("### 4. news2dict<hr/>"))
display(news2dict)
display(Markdown("### 5. padding<hr/>"))
display(padding)

### 1. behaviors_parsed<hr/>

Unnamed: 0,user,time,clicked_news,candidate_news_current_log_pop,candidate_news_rev_current_log_pop,clicked
0,204423,2017-02-10 02:36:43,,N22324 N22255 N22218 N22232 N22153 N22342 N222...,N22324 N22421 N22408 N22153 N22347 N22250 N222...,1 0 0 0 0 0 0 0 0
1,32616,2017-02-10 02:36:43,N16744 N20349 N16747 N17935 N11132 N13014 N178...,N22419 N22153 N22156 N22185 N22197 N22205 N221...,N22419 N22421 N22324 N22408 N22218 N22321 N222...,1 0 0 0 0 0 0 0 0
2,115972,2017-02-10 02:36:43,N14684 N9010 N17095 N5701 N8055 N10311 N17617 ...,N22419 N22268 N22266 N22232 N22255 N22218 N221...,N22419 N22380 N22304 N22218 N22384 N22363 N223...,1 0 0 0 0 0 0 0 0
3,39838,2017-02-10 02:36:43,N10279 N20336 N18434 N7376 N19903 N7979 N15183...,N22394 N22268 N22324 N22380 N22218 N22332 N221...,N22394 N22399 N22421 N22388 N22368 N22250 N223...,1 0 0 0 0 0 0 0 0
4,2570,2017-02-10 02:36:43,N16 N95 N20430 N17785 N16747 N10900 N18557 N18...,N22231 N22124 N22323 N22382 N22249 N22182 N222...,N22231 N22393 N22421 N22321 N22143 N22408 N223...,1 0 0 0 0 0 0 0 0
...,...,...,...,...,...,...
514383,5165,2017-02-12 07:59:59,N21 N17258 N13517 N4156 N17616 N5161 N19272 N1...,N22852 N22844 N22786 N22863 N22782 N22815 N227...,N22852 N22863 N22807 N22907 N22825 N22890 N228...,1 0 0 0 0 0 0 0 0
514384,229061,2017-02-12 07:59:59,N9403 N18747 N17160,N22891 N22837 N22831 N22844 N22860 N22788 N228...,N22891 N22813 N22831 N22844 N22688 N22755 N228...,1 0 0 0 0 0 0 0 0
514385,88092,2017-02-12 07:59:59,N4399 N8452 N16 N14160 N12359 N10402 N12491 N2...,N22921 N22787 N22814 N22851 N22807 N22802 N227...,N22921 N22784 N22887 N22876 N22874 N22833 N227...,1 0 0 0 0 0 0 0 0
514386,135103,2017-02-12 08:00:00,N14631 N18357 N5917 N17053 N12309 N19143 N9679...,N22921 N22893 N22787 N22813 N22839 N22868 N227...,N22921 N22784 N22782 N22791 N22813 N22874 N228...,1 0 0 0 0 0 0 0 0


### 2. news_parsed<hr/>

Unnamed: 0_level_0,category,subcategory,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
N1,1,2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0..."
N2,1,3,"[262, 47, 263, 264, 265, 266, 267, 0, 0, 0, 0,..."
N3,1,4,"[540, 92, 541, 542, 7, 543, 0, 0, 0, 0, 0, 0, ..."
N4,1,3,"[603, 604, 605, 92, 265, 606, 130, 606, 18, 32..."
N5,5,6,"[719, 720, 721, 47, 722, 47, 723, 0, 0, 0, 0, ..."
...,...,...,...
N24436,5,16,"[4412, 67326, 69, 582, 37, 563, 0, 0, 0, 0, 0,..."
N24437,5,15,"[204, 1070, 4966, 197, 38, 1653, 1844, 47, 840..."
N24438,5,15,"[1271, 496, 4684, 527, 37, 49598, 0, 0, 0, 0, ..."
N24439,5,15,"[143871, 18, 8216, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### 3. news_id2int<hr/>

{'N1': 0,
 'N2': 1,
 'N3': 2,
 'N4': 3,
 'N5': 4,
 'N6': 5,
 'N7': 6,
 'N8': 7,
 'N9': 8,
 'N10': 9,
 'N11': 10,
 'N12': 11,
 'N13': 12,
 'N14': 13,
 'N15': 14,
 'N16': 15,
 'N17': 16,
 'N18': 17,
 'N19': 18,
 'N20': 19,
 'N21': 20,
 'N22': 21,
 'N24': 22,
 'N25': 23,
 'N26': 24,
 'N27': 25,
 'N28': 26,
 'N29': 27,
 'N31': 28,
 'N32': 29,
 'N33': 30,
 'N34': 31,
 'N35': 32,
 'N36': 33,
 'N37': 34,
 'N38': 35,
 'N39': 36,
 'N40': 37,
 'N41': 38,
 'N42': 39,
 'N43': 40,
 'N44': 41,
 'N45': 42,
 'N47': 43,
 'N48': 44,
 'N49': 45,
 'N50': 46,
 'N51': 47,
 'N53': 48,
 'N54': 49,
 'N55': 50,
 'N56': 51,
 'N57': 52,
 'N58': 53,
 'N59': 54,
 'N60': 55,
 'N61': 56,
 'N62': 57,
 'N63': 58,
 'N64': 59,
 'N65': 60,
 'N66': 61,
 'N67': 62,
 'N68': 63,
 'N69': 64,
 'N70': 65,
 'N71': 66,
 'N72': 67,
 'N73': 68,
 'N74': 69,
 'N75': 70,
 'N77': 71,
 'N78': 72,
 'N79': 73,
 'N80': 74,
 'N81': 75,
 'N82': 76,
 'N83': 77,
 'N84': 78,
 'N85': 79,
 'N86': 80,
 'N87': 81,
 'N88': 82,
 'N89': 83,
 'N90': 84,

### 4. news2dict<hr/>

{'N1': {'category': tensor(1),
  'subcategory': tensor(2),
  'title': tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  0,  0,  0,  0,  0,  0,
           0,  0])},
 'N2': {'category': tensor(1),
  'subcategory': tensor(3),
  'title': tensor([262,  47, 263, 264, 265, 266, 267,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0])},
 'N3': {'category': tensor(1),
  'subcategory': tensor(4),
  'title': tensor([540,  92, 541, 542,   7, 543,   0,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0])},
 'N4': {'category': tensor(1),
  'subcategory': tensor(3),
  'title': tensor([603, 604, 605,  92, 265, 606, 130, 606,  18, 321, 149, 418, 607, 293,
          608, 604,   0,   0,   0,   0])},
 'N5': {'category': tensor(5),
  'subcategory': tensor(6),
  'title': tensor([719, 720, 721,  47, 722,  47, 723,   0,   0,   0,   0,   0,   0,   0,
            0,   0,   0,   0,   0,   0])},
 'N6': {'category': tensor(5),
  'subcategory': tensor(7),
  'tit

### 5. padding<hr/>

{'category': tensor(0),
 'subcategory': tensor(0),
 'title': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

# <span style="color: #2D3748; background-color:#fff5b1;">실제로 실험</span>

In [14]:
from dataset import BaseDataset
from torch.utils.data import DataLoader

conf_mode = 'log'

behaviors_path = 'Adressa_5w/train/behaviors_parsed_ns8_lt36_3_0.tsv'
news_path = 'Adressa_5w/train/news_parsed.tsv'

# dataset Load
dataset = BaseDataset(behaviors_path, # behaviors_path
                      news_path,      # news_path
                        f'{config.data}/train/roberta')           # roberta_embedding_dir

print(f"Load training dataset with size {len(dataset)}.")


# dataloader Load
dataloader = iter(
    DataLoader(dataset,
                batch_size=config.batch_size,
                shuffle=True,
                num_workers=config.num_workers,
                drop_last=True,
                pin_memory=True))

Load training dataset with size 514388.


In [16]:
display(Markdown("### 1. _news2dict(self, id)<hr/>"))
display( dataset._news2dict('N1') )

display(Markdown("### 2. \__len__(self)<hr/>"))
display( len(dataset) )

display(Markdown("### 3. \__getitem__(self, idx), 예시 : idx == 1<hr/>"))
display( dataset[1] )

display(Markdown("### 3. \__getitem__(self, idx) - DataLoader 버전<hr/>"))
for batch in dataloader:
    print(batch)
    break

### 1. _news2dict(self, id)<hr/>

{'category': tensor(1),
 'subcategory': tensor(2),
 'title': tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  0,  0,  0,  0,  0,  0,
          0,  0])}

### 2. \__len__(self)<hr/>

514388

### 3. \__getitem__(self, idx), 예시 : idx == 1<hr/>

{'user': 32616,
 'clicked': [1, 0, 0, 0, 0, 0, 0, 0, 0],
 'candidate_news': [{'category': tensor(1),
   'subcategory': tensor(4),
   'title': tensor([  380,    18, 15094,  1152, 13707,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0,     0,     0,     0,     0,     0])},
  {'category': tensor(5),
   'subcategory': tensor(6),
   'title': tensor([  1627,   1641,   4073,     37, 101432,     47,    933,      0,      0,
                0,      0,      0,      0,      0,      0,      0,      0,      0,
                0,      0])},
  {'category': tensor(1),
   'subcategory': tensor(20),
   'title': tensor([314559,     47,   2193,     92,    380,     18,    742,    106,     16,
              321,    209,    534,      0,      0,      0,      0,      0,      0,
                0,      0])},
  {'category': tensor(5),
   'subcategory': tensor(6),
   'title': tensor([ 4897, 41342, 15918,   130,  5219, 30750,     0,     0,     0,     0,
               0,     0,     0

### 3. \__getitem__(self, idx) - DataLoader 버전<hr/>

{'user': tensor([226903,  96831, 103559, 186946,  75990,  61939,   9089,   5327, 111783,
        217909,  19378,  99766, 146654,  43862, 221744,  57201, 218909, 140169,
         17239, 208232, 102433,  46701, 183985,  44161,  32491,   9026,   4780,
           547, 209307,  68235, 204699, 145677,  27106, 107989, 126421,   7765,
         59892, 161809, 126952,  96333, 205662,  46701, 207154, 207390,  79997,
         40179,  98852, 210632,  36100, 130716]), 'clicked': [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]), t

# <span style="color: #2D3748; background-color:#fff5b1;">Others</span>

In [17]:
behaviors_parsed


Unnamed: 0,user,time,clicked_news,candidate_news_current_log_pop,candidate_news_rev_current_log_pop,clicked
0,204423,2017-02-10 02:36:43,,N22324 N22255 N22218 N22232 N22153 N22342 N222...,N22324 N22421 N22408 N22153 N22347 N22250 N222...,1 0 0 0 0 0 0 0 0
1,32616,2017-02-10 02:36:43,N16744 N20349 N16747 N17935 N11132 N13014 N178...,N22419 N22153 N22156 N22185 N22197 N22205 N221...,N22419 N22421 N22324 N22408 N22218 N22321 N222...,1 0 0 0 0 0 0 0 0
2,115972,2017-02-10 02:36:43,N14684 N9010 N17095 N5701 N8055 N10311 N17617 ...,N22419 N22268 N22266 N22232 N22255 N22218 N221...,N22419 N22380 N22304 N22218 N22384 N22363 N223...,1 0 0 0 0 0 0 0 0
3,39838,2017-02-10 02:36:43,N10279 N20336 N18434 N7376 N19903 N7979 N15183...,N22394 N22268 N22324 N22380 N22218 N22332 N221...,N22394 N22399 N22421 N22388 N22368 N22250 N223...,1 0 0 0 0 0 0 0 0
4,2570,2017-02-10 02:36:43,N16 N95 N20430 N17785 N16747 N10900 N18557 N18...,N22231 N22124 N22323 N22382 N22249 N22182 N222...,N22231 N22393 N22421 N22321 N22143 N22408 N223...,1 0 0 0 0 0 0 0 0
...,...,...,...,...,...,...
514383,5165,2017-02-12 07:59:59,N21 N17258 N13517 N4156 N17616 N5161 N19272 N1...,N22852 N22844 N22786 N22863 N22782 N22815 N227...,N22852 N22863 N22807 N22907 N22825 N22890 N228...,1 0 0 0 0 0 0 0 0
514384,229061,2017-02-12 07:59:59,N9403 N18747 N17160,N22891 N22837 N22831 N22844 N22860 N22788 N228...,N22891 N22813 N22831 N22844 N22688 N22755 N228...,1 0 0 0 0 0 0 0 0
514385,88092,2017-02-12 07:59:59,N4399 N8452 N16 N14160 N12359 N10402 N12491 N2...,N22921 N22787 N22814 N22851 N22807 N22802 N227...,N22921 N22784 N22887 N22876 N22874 N22833 N227...,1 0 0 0 0 0 0 0 0
514386,135103,2017-02-12 08:00:00,N14631 N18357 N5917 N17053 N12309 N19143 N9679...,N22921 N22893 N22787 N22813 N22839 N22868 N227...,N22921 N22784 N22782 N22791 N22813 N22874 N228...,1 0 0 0 0 0 0 0 0


In [18]:
# behaviors_parsed 데이터셋에서 user 중복 확인
user_duplicates = behaviors_parsed['user'].duplicated().any()

if user_duplicates:
    print("user 열에 중복값이 있습니다.")
else:
    print("user 열에 중복값이 없습니다.")

user 열에 중복값이 있습니다.


In [19]:
# 중복된 user의 개수
duplicate_count = behaviors_parsed['user'].duplicated().sum()
print(f"중복된 user의 개수: {duplicate_count}")

# 중복된 user 값들
duplicated_users = behaviors_parsed['user'][behaviors_parsed['user'].duplicated()]
print("중복된 user 값:")
print(duplicated_users.value_counts())


중복된 user의 개수: 373787
중복된 user 값:
user
2566      71
218977    69
5584      60
59416     59
16650     57
          ..
178089     1
1281       1
84346      1
128816     1
226987     1
Name: count, Length: 95589, dtype: int64


In [20]:
# user 열의 고유값 개수 확인
unique_user_count = behaviors_parsed['user'].nunique()
print(f"user 열의 고유값 개수: {unique_user_count}")


user 열의 고유값 개수: 140601


In [51]:
behaviors_parsed[behaviors_parsed['user']==226903]

Unnamed: 0,user,time,clicked_news,candidate_news_current_log_pop,candidate_news_rev_current_log_pop,clicked
464113,226903,2017-02-12 03:26:45,N17095 N17127 N12725 N10402,N22868 N22748 N22708 N22805 N22858 N22802 N227...,N22868 N22698 N22732 N22688 N22755 N22864 N228...,1 0 0 0 0 0 0 0 0


In [41]:
display(behaviors_parsed[behaviors_parsed['user']==178089]['candidate_news_current_log_pop'][162023])
display(behaviors_parsed[behaviors_parsed['user']==178089]['candidate_news_rev_current_log_pop'][162023])
display()
display(behaviors_parsed[behaviors_parsed['user']==178089]['candidate_news_current_log_pop'][162290])
display(behaviors_parsed[behaviors_parsed['user']==178089]['candidate_news_rev_current_log_pop'][162290])

'N22488 N22363 N22445 N22503 N22342 N22399 N22502 N22263 N22451'

'N22488 N22278 N22380 N22426 N22368 N22455 N22507 N22393 N22465'

'N22508 N22553 N22363 N22419 N22451 N22266 N22324 N22408 N22277'

'N22508 N22408 N22323 N22543 N22360 N22537 N22431 N22321 N22347'

In [30]:
len(behaviors_parsed[behaviors_parsed['user']==2566]['candidate_news_rev_current_log_pop'][2086])

62

In [34]:
len(behaviors_parsed[behaviors_parsed['user']==2566]['clicked_news'][499507].split(' '))

280

In [46]:
test_news[test_news['News ID']=='N22488']

Unnamed: 0,News ID,Category,Sub-Category,Title,Body,ID,Publish,Click time history
4,N22488,meninger,ordetfritt,"Vi ble stående i villrede, på vei til en begra...",Saken oppdateres. Flere beboere på Leuthenhave...,8f268d0c53750e5683e72238465400cdf9e6b7fd,2017-02-09 21:07:48,"2017-02-12 08:00:05,2017-02-12 08:02:30,2017-0..."


In [49]:
train_news[train_news['News ID']=='N22488']

Unnamed: 0,News ID,Category,Sub-Category,Title,Body,ID,Publish,Click time history
6293,N22488,meninger,ordetfritt,"Vi ble stående i villrede, på vei til en begra...",Saken oppdateres. Flere beboere på Leuthenhave...,8f268d0c53750e5683e72238465400cdf9e6b7fd,2017-02-09 21:07:48,"2017-02-10 06:10:46,2017-02-10 06:10:56,2017-0..."
