In [1]:
import sys
sys.path.append("../src")
import pandas as pd
from dataLoad import dataLoad
from datetime import datetime

path= "../dataset/"
item_names_table, train_user_seq_log, test_user_label, test_user_seq_log = dataLoad(path)

1. 요일(0~6)
2. 정수 매핑 일자 (0~90)
3. 시간 (0~23)
4. 정수 매핑 주차 (1~13)
5. mp : 아이템 인기도(누적 클릭 횟수)
- 고민해볼 문제 : 클릭이 한번도 안 된 아이템은 학습에서 제거할 것인가? : O    
6. 유저의 아이템 클릭 횟수 (얼마나 많은 로그를 남기는 충성고객인지)
7. 아이템 카테고리 (자연어 처리 필요)
8. 아이템 브랜드 (자연어 처리 필요)

In [2]:
train_timestamp = train_user_seq_log["timestamp"]
train_timestamp

0          2021-02-11 13:03:42
1          2021-02-11 13:03:52
2          2021-02-11 13:04:07
3          2021-02-11 13:05:04
4          2021-02-11 13:06:19
                   ...        
32134194   2021-05-12 13:00:47
32134195   2021-05-12 13:00:42
32134196   2021-05-12 13:01:57
32134197   2021-05-12 13:02:31
32134198   2021-05-12 13:03:22
Name: timestamp, Length: 32134199, dtype: datetime64[ns]

In [3]:
# 1. 요일(1~7)

train_timestamp_df = train_timestamp.to_frame(name="timestamp")
train_user_seq_log["day_of_week"] = train_timestamp_df["timestamp"].dt.dayofweek

In [4]:
# 2. 정수 매핑 일자 (1~31)

reference_date = datetime(2021, 2, 11)
days = (train_timestamp - reference_date).dt.total_seconds() / (24 * 3600)
days_df = days.astype(int).to_frame(name="days")

train_user_seq_log = pd.concat([train_user_seq_log, days_df], axis=1)

In [5]:
# 3. 시간 (1~24)
train_user_seq_log["hour"] = train_timestamp_df["timestamp"].dt.hour

In [6]:
# 4. 정수 매핑 주차 (1~13)

weeks = (days / 7).astype(int) + 1
weeks_df = weeks.to_frame(name="weeks")
train_user_seq_log = pd.concat([train_user_seq_log, weeks_df], axis=1)

In [7]:
# 5. mp : 아이템 인기도(누적 클릭 횟수)
train_user_seq_log = train_user_seq_log.drop_duplicates() 
# 5-1 . 누적집계
# 시퀀셜 데이터이므로 시간에 따른 누적량 증가 확인 가능.
cumcount = train_user_seq_log.groupby("item_id").cumcount().to_frame(name="cumcount")
cumcount["cumcount"] = cumcount["cumcount"] + 1
train_user_seq_log = pd.concat([train_user_seq_log, cumcount], axis=1)

In [8]:
# 5-2 . 90일간의 총 누적집계 (mp ranking)
train_user_seq_log_df = train_user_seq_log.copy()
mp_item = train_user_seq_log_df.item_id.value_counts().reset_index().rename(columns={"index":"item_id","item_id":"click_count"})
### normalize column
min_val = mp_item["click_count"].min()
range_val = mp_item["click_count"].max() - mp_item["click_count"].min()
mp_item["click_count_normalized"] = (mp_item["click_count"] - min_val) / range_val
train_user_seq_log = pd.merge(train_user_seq_log, mp_item, on="item_id", how="inner")

In [9]:
# 6. 유저 충성도
train_user_seq_log_df = train_user_seq_log.copy()
user_click = train_user_seq_log_df.user.value_counts().reset_index().rename(columns={"index":"user","user":"user_click_count"})
### normalize column
min_val = user_click["user_click_count"].min()
range_val = user_click["user_click_count"].max() - user_click["user_click_count"].min()
user_click["user_click_count_normalized"] = (user_click["user_click_count"] - min_val) / range_val
train_user_seq_log = pd.merge(train_user_seq_log, user_click, on="user", how="inner")

In [10]:
# 7. 아이템 카테고리 (자연어 처리 필요)

# 일단은 여기까지 하고, 만든 임베딩을 넣어 모델 돌리기 시도

In [11]:
# 8. 아이템 브랜드 (자연어 처리 필요)

In [12]:
train_user_seq_log

Unnamed: 0,user,item_id,timestamp,day_of_week,days,hour,weeks,cumcount,click_count,click_count_normalized,user_click_count,user_click_count_normalized
0,0,805696,2021-02-11 13:03:42,3,0,13,1,1,3435,0.099660,1714,0.053422
1,0,386903,2021-02-11 13:03:52,3,0,13,1,1,561,0.016252,1714,0.053422
2,0,386903,2021-02-12 13:41:36,4,1,13,1,2,561,0.016252,1714,0.053422
3,0,3832,2021-02-11 13:04:07,3,0,13,1,1,15941,0.462606,1714,0.053422
4,0,3832,2021-02-12 08:33:29,4,1,8,1,2,15941,0.462606,1714,0.053422
...,...,...,...,...,...,...,...,...,...,...,...,...
32134191,829201,792871,2021-05-12 11:57:58,2,90,11,13,1,1,0.000000,4,0.000062
32134192,829201,100146,2021-05-12 12:05:06,2,90,12,13,1,1,0.000000,4,0.000062
32134193,829201,305387,2021-05-12 12:07:02,2,90,12,13,1,1,0.000000,4,0.000062
32134194,829356,353736,2021-05-12 12:46:11,2,90,12,13,1,1,0.000000,2,0.000000


In [13]:
train_user_seq_log.to_parquet("../dataset/fe_train_user_seq_log.parquet")

In [14]:
# featureEngineering.py에 위 코드 정리 완료. 실행 시 fe_ 버전으로 train 데이터셋 반환.
# 다만 notebook들에서는 parquet datatype로 미리 저장해둔 file을 dataLoad.py에서 바로 불러올 수 있도록 함
# 촉박한 시간 관계상 shortcut 활용