In [1]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

In [2]:
# Path configuration.
# The project root can be overridden with the LGUPLUS_PROJECT_ROOT environment
# variable so the notebook also runs outside the original author's machine;
# when the variable is unset, the original absolute location is used.
project_root = os.environ.get('LGUPLUS_PROJECT_ROOT', '/Users/kyoungseo/lgupls_aistage')
data_path = os.path.join(project_root, 'data', '')  # empty last part keeps the trailing separator
saved_path = os.path.join(project_root, 'saved')
output_path = os.path.join(project_root, 'submission')

In [3]:
# Load the raw CSV tables from `data_path`.
def _read_table(file_name):
    """Read one UTF-8 encoded CSV file located under `data_path`."""
    return pd.read_csv(os.path.join(data_path, file_name), encoding='utf-8')


history = _read_table('history_data.csv')
watch = _read_table('watch_e_data.csv')
buy = _read_table('buy_data.csv')
search = _read_table('search_data.csv')
profile = _read_table('profile_data.csv')

In [4]:
# Work on copies so the frames loaded from disk stay untouched.
hd = history.copy()
wd = watch.copy()
bd = buy.copy()
sd = search.copy()
# The profile copy used to be bound to `pd`, which shadowed the pandas module
# alias: after that, `pd.read_csv`-style calls no longer reached pandas.
# It is bound to `pfd` instead; any later cell that referred to the profile
# frame as `pd` must use `pfd`.
pfd = profile.copy()

In [5]:
# Report the shape of each log frame before duplicate removal.
shape_report = (
    ('hd 데이터(중복 제거 전) : ', hd),
    ('wd 데이터(중복 제거 전) : ', wd),
)
for label, frame in shape_report:
    print(label, frame.shape)

hd 데이터(중복 제거 전) :  (1005651, 8)
wd 데이터(중복 제거 전) :  (892794, 8)


#### 1. 중복 데이터 제거

In [6]:
# Inspect duplicated rows
hd_duplicate_mask = hd.duplicated()
hd.loc[hd_duplicate_mask]  # viewing-start data

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,payment,continuous_play,short_trailer
517,5,20220421082427,20220421082957,MKID003,92,,N,N
519,5,20220421082427,20220421085009,MKID003,174,,N,N
521,5,20220421082427,20220421090237,MKID003,201,,N,N
523,5,20220421082427,20220421090332,MKID003,187,,N,N
525,5,20220421082427,20220421090449,MKID003,305,,N,N
...,...,...,...,...,...,...,...,...
1004498,32908,20220420203843,20220420204139,MKID003,6458,,N,N
1005467,33015,20220424194027,20220424194036,MKID003,416,,Y,N
1005469,33015,20220424194027,20220424194091,MKID003,175,,N,Y
1005471,33015,20220424195608,20220424195612,MKID003,57,,N,N


In [7]:
wd.loc[wd.duplicated()]  # viewing-end data

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,watch_time,total_time,continuous_play
507,5,20220421082427,20220421082953,MKID049,298,307,309,1
509,5,20220421082427,20220421085005,MKID049,92,1207,1207,1
511,5,20220421082427,20220421090233,MKID049,174,744,745,1
513,5,20220421082427,20220421090327,MKID049,201,50,50,1
515,5,20220421082427,20220421090443,MKID049,187,70,70,1
...,...,...,...,...,...,...,...,...
892571,33016,20220424194881,20220424195327,MKID049,5873,94,94,1
892573,33016,20220424194881,20220424195543,MKID049,5874,94,94,1
892575,33016,20220424194881,20220424195570,MKID049,5874,50,94,0
892577,33016,20220424194881,20220424195683,MKID049,4598,34,95,0


In [8]:
# Remove fully duplicated rows, keeping the first occurrence of each.
hd = hd.drop_duplicates()
print('hd 데이터(중복 제거 후) : ', hd.shape)

wd = wd.drop_duplicates()
print('wd 데이터(중복 제거 후) : ', wd.shape)

hd 데이터(중복 제거 후) :  (899273, 8)
wd 데이터(중복 제거 후) :  (800740, 8)


#### 2. 이상치 제거

#### 2-1. History_data 내 log_time 이상치 제거

In [9]:
# Preview the first five rows before splitting log_time.
hd.head(n=5)

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,payment,continuous_play,short_trailer
0,3,20220301115653,20220301115719,MKID003,15,,Y,N
1,3,20220301115653,20220301115809,MKID003,16,,Y,N
2,3,20220301115653,20220301115958,MKID003,17,,Y,N
3,3,20220301115653,20220301120118,MKID003,18,,Y,N
4,3,20220301115653,20220301120229,MKID003,19,,Y,N


In [10]:
# Split log_time into a date part and a time part:
# characters 0-7 become `date`, characters 8-13 become `time`.
log_time_text = hd["log_time"].astype(str)
hd = hd.assign(
    log_time=log_time_text,
    date=log_time_text.str.slice(0, 8),
    time=log_time_text.str.slice(8, 14),
)

In [11]:
# Preview the first five rows with the new `date` and `time` columns.
hd.head(n=5)

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,payment,continuous_play,short_trailer,date,time
0,3,20220301115653,20220301115719,MKID003,15,,Y,N,20220301,115719
1,3,20220301115653,20220301115809,MKID003,16,,Y,N,20220301,115809
2,3,20220301115653,20220301115958,MKID003,17,,Y,N,20220301,115958
3,3,20220301115653,20220301120118,MKID003,18,,Y,N,20220301,120118
4,3,20220301115653,20220301120229,MKID003,19,,Y,N,20220301,120229


In [12]:
# Print column dtypes and non-null counts for hd.
hd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 899273 entries, 0 to 1005650
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   profile_id       899273 non-null  int64  
 1   ss_id            899273 non-null  int64  
 2   log_time         899273 non-null  object 
 3   act_target_dtl   899273 non-null  object 
 4   album_id         899273 non-null  int64  
 5   payment          59193 non-null   float64
 6   continuous_play  899273 non-null  object 
 7   short_trailer    899273 non-null  object 
 8   date             899273 non-null  object 
 9   time             899273 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 75.5+ MB


In [13]:
# Month and day parts of the `date` string, sliced once and reused below.
month_part = hd['date'].str[4:6]
day_part = hd['date'].str[6:8]

# Month masks: each is True where the month differs from one of March..July,
# so AND-ing all five marks rows whose month lies outside March..July.
con1 = month_part != '03'
con2 = month_part != '04'
con3 = month_part != '05'
con4 = month_part != '06'
con5 = month_part != '07'

# Day masks: day written as '0' or '00', or numerically 32 and above.
con8 = day_part == '0'
con9 = day_part == '00'
con10 = day_part.astype(int) >= 32

In [14]:
# Count the rows that violate the year / month / day expectations.
bad_year = hd['date'].str[:4] != '2022'
bad_month = con1 & con2 & con3 & con4 & con5
bad_day = con8 | con9 | con10

print('조건에 맞지 않는 연도 수:', int(bad_year.sum()))
print('조건에 맞지 않는 월 수:', int(bad_month.sum()))
print('조건에 맞지 않는 일 수:', int(bad_day.sum()))

조건에 맞지 않는 연도 수: 0
조건에 맞지 않는 월 수: 0
조건에 맞지 않는 일 수: 0


In [15]:
# Count the rows whose hour / minute / second field is out of range.
# The fields are string slices, so they are compared lexicographically.
hour_part = hd['time'].str[:2]
minute_part = hd['time'].str[2:4]
second_part = hd['time'].str[4:6]

print('조건에 맞지 않는 시 수:', int((hour_part >= '24').sum()))
print('조건에 맞지 않는 분 수:', int((minute_part >= '60').sum()))
print('조건에 맞지 않는 초 수:', int((second_part >= '60').sum()))

조건에 맞지 않는 시 수: 0
조건에 맞지 않는 분 수: 0
조건에 맞지 않는 초 수: 73581


In [16]:
# Only the seconds field is out of range.
# 73,581 rows are affected; display them.
invalid_seconds = hd['time'].str[4:6] >= '60'
hd[invalid_seconds]

Unnamed: 0,profile_id,ss_id,log_time,act_target_dtl,album_id,payment,continuous_play,short_trailer,date,time
2975,30,20220305130048,20220305131273,MKID003,1901,,Y,N,20220305,131273
2976,30,20220305130048,20220305132966,MKID003,1902,,Y,N,20220305,132966
2978,30,20220305130048,20220305134092,MKID003,1903,,Y,N,20220305,134092
2983,30,20220320140092,20220320140174,MKID003,446,500.0,N,N,20220320,140174
2988,30,20220320140092,20220320144788,MKID003,1908,500.0,Y,N,20220320,144788
...,...,...,...,...,...,...,...,...,...,...
1005638,33032,20220427151499,20220427154870,MKID003,7105,,N,N,20220427,154870
1005639,33032,20220427151499,20220427155075,MKID003,1725,,Y,N,20220427,155075
1005646,33032,20220427155091,20220427155668,MKID003,381,,Y,N,20220427,155668
1005647,33032,20220427155091,20220427155680,MKID003,381,,Y,N,20220427,155680


In [19]:
# Remove the rows whose seconds field is '60' or greater.
invalid_seconds = hd['time'].str[4:6] >= '60'
hd = hd[~invalid_seconds]
print('이상 데이터 제거 후:', len(hd))

이상 데이터 제거 후: 825692


In [21]:
# Re-check that no row with an out-of-range seconds field remains.
remaining_invalid = (hd['time'].str[4:6] >= '60').sum()
print('조건에 맞지 않는 초 수:', int(remaining_invalid))

조건에 맞지 않는 초 수: 0


#### 3. 결측치 제거

In [22]:
# Check missing values per column.
# Original author's note: wd, bd and pd have no missing values / pd `keyword`
# has missing values (the second "pd" is presumably a typo for sd — TODO confirm).
hd.isna().sum()

profile_id              0
ss_id                   0
log_time                0
act_target_dtl          0
album_id                0
payment            776209
continuous_play         0
short_trailer           0
date                    0
time                    0
dtype: int64

In [23]:
# Drop the rows whose `payment` is missing.
# dropna is restricted to `payment` so it matches that stated intent: a missing
# value in any other column no longer removes the row as a side effect.
# NOTE(review): this discards every row without a payment value (776,209 of the
# 825,692 rows at this point) — confirm that a missing payment really marks rows
# to exclude rather than, e.g., viewing without a payment.
hd = hd.dropna(subset=['payment'])

In [24]:
# Confirm that no missing values remain in hd.
hd.isna().sum()

profile_id         0
ss_id              0
log_time           0
act_target_dtl     0
album_id           0
payment            0
continuous_play    0
short_trailer      0
date               0
time               0
dtype: int64

In [27]:
# The helper columns used for the log_time checks are no longer needed.
hd = hd.drop(columns=['date', 'time'])

In [28]:
# Shape after removing missing values.
# Author's open question: is too much data being removed here?
rows_and_columns = hd.shape
print('hd 데이터 결측치 제거 후 :', rows_and_columns)

hd 데이터 결측치 제거 후 : (49483, 8)
