In [1]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.nn.init import normal_
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F

#import plotnine
#from plotnine import *

In [2]:
# Data directory.
# The location can be overridden with the LGUPLUS_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset the original hard-coded path is used unchanged.
data_path = os.environ.get('LGUPLUS_DATA_PATH', '/Users/kyoungseo/lgupls_aistage/data/')

In [3]:
# Load the data: read 'sample.csv' from the data directory as UTF-8.
sample_csv = os.path.join(data_path, 'sample.csv')
df = pd.read_csv(sample_csv, encoding='utf-8')

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index column written out by
# an earlier to_csv call - TODO confirm).
# errors='ignore' keeps the cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,profile_id,album_id,ss_id,log_time_x,log_time_y,diff,diff_ss,diff_ox,watch_time,total_time,...,search_ox,prefer,sex,age,pr_interest_keyword_cd_1,pr_interest_keyword_cd_2,pr_interest_keyword_cd_3,ch_interest_keyword_cd_1,ch_interest_keyword_cd_2,ch_interest_keyword_cd_3
0,3,15.0,2022-03-01 11:56:53,2022-03-01 11:57:19,2022-03-01 11:58:05,0 days 00:00:46,46.0,True,46.0,46.0,...,0.0,4.0,F,5,P02,P04,P07,K01,K03,K04
1,3,16.0,2022-03-01 11:56:53,2022-03-01 11:58:09,2022-03-01 11:59:54,0 days 00:01:45,105.0,False,104.0,105.0,...,0.0,4.0,F,5,P02,P04,P07,K01,K03,K04
2,3,17.0,2022-03-01 11:56:53,2022-03-01 11:59:58,2022-03-01 12:01:14,0 days 00:01:16,76.0,True,76.0,76.0,...,0.0,4.0,F,5,P02,P04,P07,K01,K03,K04
3,3,18.0,2022-03-01 11:56:53,2022-03-01 12:01:18,2022-03-01 12:02:26,0 days 00:01:08,68.0,False,67.0,68.0,...,0.0,4.0,F,5,P02,P04,P07,K01,K03,K04
4,3,19.0,2022-03-01 11:56:53,2022-03-01 12:02:29,2022-03-01 12:04:00,0 days 00:01:31,91.0,False,90.0,90.0,...,0.0,4.0,F,5,P02,P04,P07,K01,K03,K04


In [5]:
# Show the column labels of the loaded frame as the cell output.
df.columns

Index(['profile_id', 'album_id', 'ss_id', 'log_time_x', 'log_time_y', 'diff',
       'diff_ss', 'diff_ox', 'watch_time', 'total_time', 'continuous_play_x',
       'continuous_play_y', 'act_target_dtl_x', 'act_target_dtl',
       'short_trailer', 'payment', 'time_slot', 'search_ox', 'prefer', 'sex',
       'age', 'pr_interest_keyword_cd_1', 'pr_interest_keyword_cd_2',
       'pr_interest_keyword_cd_3', 'ch_interest_keyword_cd_1',
       'ch_interest_keyword_cd_2', 'ch_interest_keyword_cd_3'],
      dtype='object')

In [6]:
# Split the frame into features (every column except 'album_id') and label.
# drop() returns a new, independent frame. The previous boolean-mask .loc
# selection is flagged by pandas as a copy of a slice, so the column
# assignments made on X further down (X['ss_id'] = ...) raised
# SettingWithCopyWarning, hidden by the global warnings filter.
# NOTE(review): X is captured here, before the cells below that encode
# columns of df; X does not see those changes unless this cell is re-run
# afterwards - confirm the intended cell order.
X = df.drop(columns=['album_id'])  # FEATURE DATA
y = df['album_id']  # LABEL DATA

In [7]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the same rows land in train/test on every
# run; without it each execution produced a different split.
# NOTE(review): later cells reassign columns of X after this split, so
# X_train / X_test keep the pre-conversion columns - confirm the cell order.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=SPLIT_SEED
)

In [8]:
# Render floats with five decimal places in displayed frames.
pd.set_option('display.float_format', '{:.5f}'.format)

In [52]:
%%time

# 전부 int or float형으로 바꾸고 돌려야 함

# extract effective features using variance inflation factor
vif = pd.DataFrame()

# variance_inflation_factor(X, i) : Xi를 x나머지로 회귀분석한 후 VIF값을 구한것. 즉 xi의 vif값. 즉 이값이 높을수록 종속성이 높다는 뜻

vif['VIF_Factor'] = [variance_inflation_factor(X.values, i) 
                     for i in range(X.shape[1])]
vif['Feature'] = X.columns
vif.sort_values(by='VIF_Factor', ascending=True)

Wall time: 1min 49s


Unnamed: 0,VIF_Factor,Feature
0,0.99999,profile_id
11,1.0,search_ox
7,1.00001,continuous_play_y
16,1.00016,pr_interest_keyword_cd_2_encode
9,1.00024,payment
6,1.00075,total_time
10,1.00152,time_slot
14,1.0016,age
17,1.00195,pr_interest_keyword_cd_3_encode
19,1.00241,ch_interest_keyword_cd_2_encode


In [53]:
# Pairwise Pearson correlation between the columns of X (the default method,
# spelled out explicitly).
X.corr(method='pearson')

Unnamed: 0,profile_id,album_id,ss_id,log_time_x,log_time_y,diff_ss,diff_ox,watch_time,total_time,continuous_play_y,...,search_ox,prefer,sex,age,pr_interest_keyword_cd_1_encode,pr_interest_keyword_cd_2_encode,pr_interest_keyword_cd_3_encode,ch_interest_keyword_cd_1_encode,ch_interest_keyword_cd_2_encode,ch_interest_keyword_cd_3_encode
profile_id,1.0,0.01412,-0.0042,-0.00432,-0.00432,-0.00015,0.04314,-0.00119,-0.02353,0.00161,...,-0.00122,-0.05227,,0.02959,-0.06337,-6e-05,-0.04477,-0.04866,-0.04802,-0.05321
album_id,0.01412,1.0,-0.00921,-0.00971,-0.00968,0.00112,-0.00531,-0.02097,0.16895,-0.06114,...,0.00379,-0.13856,,0.16907,-0.01263,-0.03735,-0.01525,-0.05952,-0.06322,-0.05918
ss_id,-0.0042,-0.00921,1.0,0.99886,0.99883,-0.0005,-0.01609,-0.03181,-0.03795,-0.0012,...,0.00352,-0.00125,,-0.05022,0.03214,0.01639,-0.01759,0.03602,0.02151,0.05205
log_time_x,-0.00432,-0.00971,0.99886,1.0,0.99961,-0.01348,-0.01828,-0.02782,-0.03875,-0.00109,...,0.00351,0.00371,,-0.05165,0.03195,0.01678,-0.01787,0.03743,0.02138,0.05331
log_time_y,-0.00432,-0.00968,0.99883,0.99961,1.0,0.01432,-0.01834,-0.02745,-0.03864,-0.00118,...,0.00351,0.00382,,-0.05166,0.03204,0.01686,-0.01774,0.03733,0.02132,0.05325
diff_ss,-0.00015,0.00112,-0.0005,-0.01348,0.01432,1.0,-0.00201,0.0133,0.00379,-0.00321,...,0.0,0.00398,,-0.00025,0.00321,0.00292,0.00435,-0.00371,-0.00221,-0.00234
diff_ox,0.04314,-0.00531,-0.01609,-0.01828,-0.01834,-0.00201,1.0,0.07526,-0.00839,0.00521,...,-0.00138,0.04342,,0.0594,-0.03319,-0.02791,-0.01971,-0.05771,-0.03298,-0.05263
watch_time,-0.00119,-0.02097,-0.03181,-0.02782,-0.02745,0.0133,0.07526,1.0,0.17328,-0.00182,...,0.00234,0.52287,,0.14445,-0.04981,-0.06254,-0.04898,-0.10585,-0.08322,-0.07241
total_time,-0.02353,0.16895,-0.03795,-0.03875,-0.03864,0.00379,-0.00839,0.17328,1.0,-0.08733,...,0.00214,-0.31483,,0.24408,0.01841,-0.02568,-0.0487,-0.13913,-0.13332,-0.11403
continuous_play_y,0.00161,-0.06114,-0.0012,-0.00109,-0.00118,-0.00321,0.00521,-0.00182,-0.08733,1.0,...,-0.00352,0.05898,,-0.12599,-0.01779,0.01465,-0.01304,0.09071,0.03447,0.04068


In [9]:
%%time
for i in ['pr_interest_keyword_cd_1', 'pr_interest_keyword_cd_2', 'pr_interest_keyword_cd_3',
          'ch_interest_keyword_cd_1', 'ch_interest_keyword_cd_2', 'ch_interest_keyword_cd_3'] : 
    enc_1 = (df.groupby(i).size()) / len(df)
    enc_1
    
    df['{}_encode'.format(i)] = df[i].apply(lambda x : enc_1[x])
    
#인코딩 전 변수는 제거 
df = df.drop(['pr_interest_keyword_cd_1', 'pr_interest_keyword_cd_2', 'pr_interest_keyword_cd_3',
                'ch_interest_keyword_cd_1', 'ch_interest_keyword_cd_2', 'ch_interest_keyword_cd_3'],axis=1)

Wall time: 2min 26s


In [10]:
# Binary-encode the two flag columns: short_trailer 'Y' -> 1, sex 'M' -> 1,
# everything else -> 0.
# An already-encoded 1 is accepted as well, so running the cell a second time
# keeps the values instead of turning the whole column into 0 (the encoded
# integers never equal 'Y' / 'M').
df['short_trailer'] = ((df['short_trailer'] == 'Y') | (df['short_trailer'] == 1)).astype('int64')
df['sex'] = ((df['sex'] == 'M') | (df['sex'] == 1)).astype('int64')

In [11]:
# Parse the three timestamp columns as datetimes and store them in numeric
# form.
for time_col in ('ss_id', 'log_time_x', 'log_time_y'):
    parsed = pd.to_datetime(X[time_col])
    X[time_col] = pd.to_numeric(parsed)

In [12]:
# Remove these seven columns from the feature frame.
dropped_cols = [
    'log_time_x',
    'log_time_y',
    'diff',
    'diff_ox',
    'continuous_play_x',
    'act_target_dtl_x',
    'act_target_dtl',
]
X = X.drop(columns=dropped_cols)

In [13]:
# Print the index range, column names and dtypes of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1914994 entries, 0 to 1914993
Data columns (total 21 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   profile_id                int64  
 1   ss_id                     int64  
 2   log_time_x                int64  
 3   log_time_y                int64  
 4   diff_ss                   float64
 5   watch_time                float64
 6   total_time                float64
 7   continuous_play_y         float64
 8   short_trailer             object 
 9   payment                   float64
 10  time_slot                 float64
 11  search_ox                 float64
 12  prefer                    float64
 13  sex                       object 
 14  age                       int64  
 15  pr_interest_keyword_cd_1  object 
 16  pr_interest_keyword_cd_2  object 
 17  pr_interest_keyword_cd_3  object 
 18  ch_interest_keyword_cd_1  object 
 19  ch_interest_keyword_cd_2  object 
 20  ch_interest_keyword_cd_3