### Imports

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Read Data

In [2]:
df_train = pd.read_csv('X_train.csv', encoding='cp949')
df_test = pd.read_csv('X_test.csv', encoding='cp949')
y_tr = pd.read_csv('y_train.csv').gender
IDtest = df_test.custid.unique()

tr = pd.concat([df_train, df_test]); tr.head()

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,buyer_nm,import_flg,tot_amt,dis_amt,net_amt,inst_mon,inst_fee
0,0,2000-06-25 00:00:00,1212,무역점,2116050008000,에스티로더,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,90000,9000,81000,3,0
1,0,2000-06-25 00:00:00,1242,무역점,4125440008000,시슬리,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,1,39000,3900,35100,1,0
2,0,2000-08-26 00:00:00,1810,본점,2116052008000,크리니크,수입종합화장품,화장품,잡화파트,잡화가용팀,화장품,1,175000,17500,157500,3,0
3,0,2000-08-26 00:00:00,1830,본점,4106430119900,듀퐁,수입의류,명품토탈,잡화파트,잡화가용팀,수입명품,1,455000,45500,409500,3,0
4,0,2000-09-03 00:00:00,1802,무역점,2139141008000,랑콤,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,0,100000,10000,90000,3,0


In [3]:
df_test.head()

Unnamed: 0,custid,sales_date,sales_time,str_nm,goodcd,brd_nm,corner_nm,pc_nm,part_nm,team_nm,buyer_nm,import_flg,tot_amt,dis_amt,net_amt,inst_mon,inst_fee
0,30000,2000-05-17 00:00:00,1900,본점,4405230126000,루츠,진케주얼,진케주얼,"케주얼,구두,아동",의류패션팀,유니캐주얼,0,47000,2350,44650,1,0
1,30000,2000-08-03 00:00:00,1520,본점,4208200014400,제임스딘.P,내의,내의란제리,"케주얼,구두,아동",의류패션팀,니트단품,0,17500,880,16620,1,0
2,30000,2000-08-03 00:00:00,1540,본점,4405600026000,퀵실버,진케주얼,진케주얼,"케주얼,구두,아동",의류패션팀,유니캐주얼,0,158000,7900,150100,1,0
3,30000,2000-08-06 00:00:00,1620,본점,4405600026000,퀵실버,진케주얼,진케주얼,"케주얼,구두,아동",의류패션팀,유니캐주얼,0,-158000,-7900,-150100,1,0
4,30000,2000-08-06 00:00:00,1620,본점,4405600026000,퀵실버,진케주얼,진케주얼,"케주얼,구두,아동",의류패션팀,유니캐주얼,0,148000,7400,140600,1,0


In [4]:
tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1726430 entries, 0 to 689776
Data columns (total 17 columns):
custid        int64
sales_date    object
sales_time    int64
str_nm        object
goodcd        int64
brd_nm        object
corner_nm     object
pc_nm         object
part_nm       object
team_nm       object
buyer_nm      object
import_flg    int64
tot_amt       int64
dis_amt       int64
net_amt       int64
inst_mon      int64
inst_fee      int64
dtypes: int64(9), object(8)
memory usage: 237.1+ MB


In [5]:
features = []

**[파생변수 1]** 총 구매액

In [6]:
f = tr.groupby('custid')['tot_amt'].agg([('총구매액', 'sum')]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,총구매액
0,0,1742000
1,1,2772100
2,2,3750850
3,3,2300500
4,4,1045000


**[파생변수 2]** 구매건수

In [7]:
f = tr.groupby('custid')['tot_amt'].agg([('구매건수', 'size')]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,구매건수
0,0,11
1,1,26
2,2,11
3,3,30
4,4,4


**[파생변수 3]** 평균 구매가격

In [8]:
f = tr.groupby('custid')['tot_amt'].agg([('평균구매가격', 'mean')]).reset_index()
f.iloc[:,1] = f.iloc[:,1].apply(round)
features.append(f); f.head()

Unnamed: 0,custid,평균구매가격
0,0,158364
1,1,106619
2,2,340986
3,3,76683
4,4,261250


**[파생변수 4]** 평균 할부개월수

In [9]:
f = tr.groupby('custid')['inst_mon'].agg([('평균할부개월수', 'mean')]).reset_index()
f.iloc[:,1] = f.iloc[:,1].apply(round, args=(1,))
features.append(f); f.head()

Unnamed: 0,custid,평균할부개월수
0,0,2.8
1,1,2.5
2,2,3.5
3,3,2.7
4,4,4.5


**[파생변수 5]** 구매상품 다양성: `구매한 서로다른 브랜드 수`

In [10]:
f = tr.groupby('custid')['brd_nm'].agg([('구매브랜드종류', lambda x: x.nunique())]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,구매브랜드종류
0,0,7
1,1,19
2,2,7
3,3,21
4,4,4


**[파생변수 6]** 내점일수

In [11]:
f = tr.groupby(by = 'custid')['sales_date'].agg([('내점일수','nunique')]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,내점일수
0,0,7
1,1,16
2,2,7
3,3,13
4,4,2


**[파생변수 7]** 수입상품 구매비율: `수입상품 구매건수` / `총 구매건수`

In [12]:
x = tr[tr['import_flg'] == 1].groupby('custid').size() / tr.groupby('custid').size()
f = x.reset_index().rename(columns={0: '수입상품_구매비율'}).fillna(0)
f.iloc[:,1] = (f.iloc[:,1]*100).apply(round, args=(1,))
features.append(f); f.head()

Unnamed: 0,custid,수입상품_구매비율
0,0,63.6
1,1,42.3
2,2,9.1
3,3,0.0
4,4,25.0


**[파생변수 8]** 요일 구매패턴: `주말방문비율`

In [13]:
def fw(x):
    k = x.dayofweek
    if k <= 4 :
        return('주중_방문')
    else :
        return('주말_방문')    
    
df = tr.copy()
df = df.drop_duplicates(['custid','sales_date'])

df['week'] = pd.to_datetime(df.sales_date).apply(fw)
df = pd.pivot_table(df, index='custid', columns='week', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
df['주말방문비율'] = ((df.iloc[:,1] / (df.iloc[:,1]+df.iloc[:,2]))*100).apply(round, args=(1,))
f = df.copy().iloc[:,[0,-1]]
features.append(f); f.head()

week,custid,주말방문비율
0,0,42.9
1,1,50.0
2,2,28.6
3,3,38.5
4,4,50.0


**[파생변수 9]** 계절별 구매건수: `Spring`(3~5)/`Summer`(6~8)/`Fall`(9-11)/`Winter`(12~2)

In [14]:
def f1(x):
    k = x.month
    if 3 <= k <= 5 :
        return('봄_구매건수')
    elif 6 <= k <= 8 :
        return('여름_구매건수')
    elif 9 <= k <= 11 :    
        return('가을_구매건수')
    else :
        return('겨울_구매건수')    
    
tr['season'] = pd.to_datetime(tr.sales_date).apply(f1)
f = pd.pivot_table(tr, index='custid', columns='season', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

season,custid,가을_구매건수,겨울_구매건수,봄_구매건수,여름_구매건수
0,0,3,3,1,4
1,1,6,1,10,9
2,2,5,3,0,3
3,3,3,6,9,12
4,4,0,0,0,4


**[파생변수 10]** 시간대별 구매건수: `Morning`(09~12)/`Afternoon`/(13~17)/`Evening`(18-20)

In [15]:
def f2(x):
    if 9 <= x <= 12 :
        return('아침_구매건수')
    elif 13 <= x <= 17 :
        return('점심_구매건수')
    else :
        return('저녁_구매건수')  # datatime 필드가 시간 형식에 맞지 않은 값을 갖는 경우 저녁시간으로 처리

tr['timeslot'] = (tr.sales_time // 100).apply(f2)
f = pd.pivot_table(tr, index='custid', columns='timeslot', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

timeslot,custid,아침_구매건수,저녁_구매건수,점심_구매건수
0,0,2,9,0
1,1,7,5,14
2,2,3,4,4
3,3,6,13,11
4,4,0,0,4


**[파생변수 11]** 주구매 파트: 29개 파트 중 가장 많이 구매한 곳
<font color='red'>주의) 이 변수를 만드는데 시간이 많이 소요된다.

In [16]:
f = tr.groupby('custid')['part_nm'].agg([('주구매파트', lambda x: x.value_counts().reset_index().sort_values(by=['part_nm','index'], ascending=False).iloc[0,0])]).reset_index()

f = pd.get_dummies(f, columns=['주구매파트'])  # This method performs One-hot-encoding

features.append(f); f.head()

Unnamed: 0,custid,주구매파트_가정용품,주구매파트_가정용품파트,주구매파트_골프/유니캐쥬얼,주구매파트_공산품,주구매파트_공산품파트,주구매파트_남성의류,주구매파트_남성정장스포츠,주구매파트_로얄부띠끄,주구매파트_로얄부틱,...,주구매파트_여성캐주얼,주구매파트_여성캐쥬얼,주구매파트_영라이브,주구매파트_영어덜트캐쥬얼,주구매파트_영캐릭터,주구매파트_영플라자,주구매파트_잡화,주구매파트_잡화파트,"주구매파트_케주얼,구두,아동",주구매파트_패션잡화
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


**[파생변수 12]** 평균할인금액

In [17]:
f = tr.groupby('custid')['dis_amt'].agg([('평균할인금액', 'mean')]).reset_index()
features.append(f)
f.head()

Unnamed: 0,custid,평균할인금액
0,0,15836.363636
1,1,2178.076923
2,2,23190.0
3,3,3055.333333
4,4,5450.0


**[파생변수 13]** 지점별 구매횟수

In [18]:
f = pd.pivot_table(tr, values='tot_amt', index='custid', columns='str_nm', 
               aggfunc=np.size).fillna(0).reset_index()
features.append(f)
f.head()

str_nm,custid,무역점,본점,신촌점,천호점
0,0,5.0,6.0,0.0,0.0
1,1,4.0,22.0,0.0,0.0
2,2,8.0,0.0,0.0,3.0
3,3,4.0,0.0,0.0,26.0
4,4,4.0,0.0,0.0,0.0


**[파생변수 15]** 분기별 구매횟수

In [19]:
def month(x):
    if 1 <= x <= 3 :
        return('1분기')
    elif 4 <= x <= 6 :
        return('2분기')
    elif 7 <= x <= 9:
        return('3분기')
    else :
        return('4분기')

In [20]:
tr['sales_date'] = pd.to_datetime(tr['sales_date'])
tr['month'] = tr['sales_date'].dt.strftime('%m')
tr['month'] = tr['month'].astype(np.int64)
tr['분기'] = tr['month'].apply(month)

In [21]:
f = pd.pivot_table(tr, index='custid', columns='분기', values='tot_amt', 
                   aggfunc=np.sum, fill_value=0).reset_index()
features.append(f); f.head()

분기,custid,1분기,2분기,3분기,4분기
0,0,0,423000,961000,358000
1,1,10000,1661100,585000,516000
2,2,-216000,0,1961500,2005350
3,3,1438000,-218800,618800,462500
4,4,0,0,1045000,0


**[파생변수 22]** 실제지불액

In [22]:
f = tr.groupby('custid')['net_amt'].agg([('실제지불액', 'mean')]).reset_index()
features.append(f)

**[파생변수 24]** 선호 가격대 

In [23]:
tr['가격대'] = pd.cut(tr.tot_amt, bins=  [  -8000000,0 , 10000., 100000, 500000, 1000000., 8000000.],
                              right=False, 
                              labels=['환불','만원 이하', '1만~10만', '10만원이상','50만원이상','100만 이상'])
f = tr.groupby(['custid'])['가격대'].describe()['top'].reset_index()
f = f.rename(columns={'top': '선호가격대'})
f = pd.get_dummies(f, columns=['선호가격대']) 
features.append(f)
f.head(5); f.head()

Unnamed: 0,custid,선호가격대_100만 이상,선호가격대_10만원이상,선호가격대_1만~10만,선호가격대_50만원이상,선호가격대_만원 이하,선호가격대_환불
0,0,0,1,0,0,0,0
1,1,0,0,1,0,0,0
2,2,0,1,0,0,0,0
3,3,0,0,1,0,0,0
4,4,0,0,1,0,0,0


**[파생변수 25]** 구매간격 

In [24]:
tr['sales_date']=pd.to_datetime(tr['sales_date'])
f=tr.groupby('custid')['sales_date'].agg([('구매간격',lambda x: (x.max()-x.min()))]).reset_index()
f['구매간격']=f['구매간격'].apply(str)
f['구매간격']=f['구매간격'].apply(lambda x: x.split('days')[0])
f['구매간격']=f['구매간격'].astype(np.int64)
features.append(f); f.head()

Unnamed: 0,custid,구매간격
0,0,288
1,1,350
2,2,129
3,3,331
4,4,3


**[파생변수 26]** 란제리 구매회수

**[파생변수 30]** 구매시간 평균

In [25]:
f = tr.groupby('custid')['sales_time'].agg([('구매시간 평균', 'mean')]).reset_index()
features.append(f);f.head()

Unnamed: 0,custid,구매시간 평균
0,0,1757.090909
1,1,1531.153846
2,2,1537.090909
3,3,1604.066667
4,4,1434.0


[파생변수 30-1] 구매시간 총합

In [26]:
f = tr.groupby('custid')['sales_time'].agg([('구매시간 총합', 'sum')]).reset_index()
features.append(f);f.head()

Unnamed: 0,custid,구매시간 총합
0,0,19328
1,1,39810
2,2,16908
3,3,48122
4,4,5736


[파생변수 30-2] 구매시간 표준편차

In [27]:
f = tr.groupby('custid')['sales_time'].agg([('구매시간 표준편차', 'std')]).reset_index()
features.append(f);f.head()

Unnamed: 0,custid,구매시간 표준편차
0,0,266.443786
1,1,263.38302
2,2,352.567002
3,3,274.230199
4,4,21.087121


[파생변수 31] 파트별 구매량

In [28]:
f = pd.pivot_table(tr, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

part_nm,custid,가정용품,가정용품파트,골프/유니캐쥬얼,공산품,공산품파트,남성의류,남성정장스포츠,로얄부띠끄,로얄부틱,...,여성캐쥬얼,영라이브,영어덜트캐쥬얼,영캐릭터,영플라자,인터넷백화점,잡화,잡화파트,"케주얼,구두,아동",패션잡화
0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,6,0,0
1,1,1,1,0,0,0,0,1,6,0,...,0,0,0,0,0,0,0,5,8,0
2,2,4,0,3,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,3,2,0,0,3,0,3,0,0,0,...,0,7,3,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


[파생변수 31-1] 파트별 총구매액

In [29]:
f = pd.pivot_table(tr, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.sum, fill_value=0).reset_index()
features.append(f); f.head()

part_nm,custid,가정용품,가정용품파트,골프/유니캐쥬얼,공산품,공산품파트,남성의류,남성정장스포츠,로얄부띠끄,로얄부틱,...,여성캐쥬얼,영라이브,영어덜트캐쥬얼,영캐릭터,영플라자,인터넷백화점,잡화,잡화파트,"케주얼,구두,아동",패션잡화
0,0,0,0,118000,0,0,113000,0,0,0,...,0,0,0,0,0,0,0,1282000,0,0
1,1,350000,145000,0,0,0,0,188000,1248500,0,...,0,0,0,0,0,0,0,216000,325100,0
2,2,2312000,0,404500,0,0,445000,0,0,0,...,0,0,232000,0,0,0,146000,0,0,0
3,3,136500,0,0,25800,0,218000,0,0,0,...,0,393200,135000,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


[파생변수 31-2] 파트별 평균 구매액

In [30]:
f = pd.pivot_table(tr, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.mean, fill_value=0).reset_index()
features.append(f); f.head()

part_nm,custid,가정용품,가정용품파트,골프/유니캐쥬얼,공산품,공산품파트,남성의류,남성정장스포츠,로얄부띠끄,로얄부틱,...,여성캐쥬얼,영라이브,영어덜트캐쥬얼,영캐릭터,영플라자,인터넷백화점,잡화,잡화파트,"케주얼,구두,아동",패션잡화
0,0,0.0,0.0,118000.0,0.0,0.0,113000.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,213666.666667,0.0,0.0
1,1,350000.0,145000.0,0.0,0.0,0.0,0.0,188000.0,208083.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,43200.0,40637.5,0.0
2,2,578000.0,0.0,134833.333333,0.0,0.0,445000.0,0.0,0.0,0.0,...,0.0,0.0,232000.0,0.0,0.0,0,146000.0,0.0,0.0,0.0
3,3,68250.0,0.0,0.0,8600.0,0.0,72666.666667,0.0,0.0,0.0,...,0.0,56171.428571,45000.0,0.0,0.0,0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0


[파생변수 31-3] 파트별 구매액 표준오차

In [31]:
f = pd.pivot_table(tr, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.std, fill_value=0).reset_index()
features.append(f); f.head()

part_nm,custid,가정용품,가정용품파트,골프/유니캐쥬얼,공산품,공산품파트,남성의류,남성정장스포츠,로얄부띠끄,로얄부틱,...,여성캐쥬얼,영라이브,영어덜트캐쥬얼,영캐릭터,영플라자,인터넷백화점,잡화,잡화파트,"케주얼,구두,아동",패션잡화
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,138674.679256,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117680.353784,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,22443.261795,67522.968314,0.0
2,2,1335455.0,0.0,493285.245404,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
3,3,58336.31,0.0,0.0,5524.490927,0.0,21962.088547,0.0,0.0,0.0,...,0.0,62538.460547,625872.191426,0.0,0.0,0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0


**[파생변수 32]** buyer_nm 별 구매횟수

In [32]:
f = pd.pivot_table(tr, index='custid', columns='buyer_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

buyer_nm,custid,가구,가전,기타바이어,니트단품,도자기크리스탈,디자이너부띠끄,문화완구,생활용품,섬유,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,8
1,1,0,1,0,3,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,3
2,2,0,3,0,0,0,0,0,0,0,...,3,1,0,0,0,0,0,0,0,1
3,3,0,1,0,1,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,1
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


[파생변수 32-1] buyer_nm별 총구매액

In [33]:
f = pd.pivot_table(tr, index='custid', columns='buyer_nm', values='tot_amt', 
                   aggfunc=np.sum, fill_value=0).reset_index()
features.append(f); f.head()

buyer_nm,custid,가구,가전,기타바이어,니트단품,도자기크리스탈,디자이너부띠끄,문화완구,생활용품,섬유,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,0,0,0,0,0,0,0,0,0,0,...,118000,0,0,0,0,0,0,0,0,1056000
1,1,0,350000,0,77500,0,0,0,0,39000,...,0,0,0,0,0,0,0,0,0,167000
2,2,0,1200000,0,0,0,0,0,0,0,...,404500,232000,0,0,0,0,0,0,0,146000
3,3,0,27000,0,68000,0,0,0,0,59000,...,29000,135000,0,0,0,0,0,0,0,187000
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,60000


[파생변수 32-2] buyer_nm별 구매액 표준오차

In [34]:
f = pd.pivot_table(tr, index='custid', columns='buyer_nm', values='tot_amt', 
                   aggfunc=np.mean, fill_value=0).reset_index()
features.append(f); f.head()

buyer_nm,custid,가구,가전,기타바이어,니트단품,도자기크리스탈,디자이너부띠끄,문화완구,생활용품,섬유,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,118000.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,132000.0
1,1,0.0,350000.0,0.0,25833.333333,0.0,0.0,0.0,0.0,39000.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,55666.666667
2,2,0.0,400000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,134833.333333,232000.0,0.0,0.0,0,0.0,0.0,0.0,0.0,146000.0
3,3,0.0,27000.0,0.0,68000.0,0.0,0.0,0.0,0.0,59000.0,...,29000.0,135000.0,0.0,0.0,0,0.0,0.0,0.0,0.0,187000.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,60000.0


[파생변수 33] 평균 할인액

In [35]:
f = tr.groupby('custid')['dis_amt'].agg([('mean','mean')]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,mean
0,0,15836.363636
1,1,2178.076923
2,2,23190.0
3,3,3055.333333
4,4,5450.0


[파생변수 33-1] 할인액 표준편차

In [36]:
f = tr.groupby('custid')['dis_amt'].agg([('std','std')]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,std
0,0,11854.051858
1,1,4686.279564
2,2,38782.031922
3,3,9721.854064
4,4,9011.659115


[파생변수 33-2] 총할인액

In [37]:
f = tr.groupby('custid')['dis_amt'].agg([('sum','sum')]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,sum
0,0,174200
1,1,56630
2,2,255090
3,3,91660
4,4,21800


[파생변수 34] 구매력 지수

In [38]:
tr['sales_date'] = pd.to_datetime(tr['sales_date'])
a = tr.groupby(by = 'custid')['sales_date'].agg([('내점일수','nunique')]).reset_index()
b = tr.groupby(by = 'custid')['tot_amt'].agg([('총구매액','sum')]).reset_index()
f = pd.merge(a,b,how = 'inner')
f['구매력'] = round(f['총구매액']/f['내점일수'],1)
features.append(f); f.head()

Unnamed: 0,custid,내점일수,총구매액,구매력
0,0,7,1742000,248857.1
1,1,16,2772100,173256.2
2,2,7,3750850,535835.7
3,3,13,2300500,176961.5
4,4,2,1045000,522500.0


[파생변수 35] 충동구매

In [39]:
f = tr.groupby('custid')['tot_amt'].agg([('구매건수', 'size')]).reset_index()
f = pd.merge(f,a,how = 'inner')
f['충동구매'] = f['구매건수']/f['내점일수']
features.append(f); f.head()

Unnamed: 0,custid,구매건수,내점일수,충동구매
0,0,11,7,1.571429
1,1,26,16,1.625
2,2,11,7,1.571429
3,3,30,13,2.307692
4,4,4,2,2.0


[파생변수 36] 할인비율

In [40]:
a = tr.groupby('custid')['tot_amt'].agg([('총구입금액','sum')]).reset_index()
b = tr.groupby('custid')['dis_amt'].agg([('총할인금액','sum')]).reset_index()
c = pd.merge(a,b,how = 'inner')
c['할인금액/총금액'] = c['총할인금액']/c['총구입금액']
f = c.loc[:,['custid','할인금액/총금액']]
features.append(f); f.head()

Unnamed: 0,custid,할인금액/총금액
0,0,0.1
1,1,0.020429
2,2,0.068009
3,3,0.039844
4,4,0.020861


[파생변수 37] 고객 방문 빈도

In [41]:
tr['unixtime'] = tr['sales_date'].view('int64')
csu = tr.groupby('custid')['unixtime'].agg([('unique','unique')])
abc = {}
for i in range(0,49995):
    abc[i] = np.var(csu[['unique']].iloc[i,:][0])
abc.values()
csu['고객 방문빈도'] = abc.values() 
csu.head(5)
csu = csu.reset_index()
f = csu.loc[:,['custid','고객 방문빈도']]
features.append(f); f.head()

Unnamed: 0,custid,고객 방문빈도
0,0,6.0987500000000005e+31
1,1,1.108171e+32
2,2,1.492504e+31
3,3,1.078165e+32
4,4,1.679616e+28


[파생변수 38] 팀별 구매횟수

In [42]:
cde = tr.groupby(['team_nm','custid'])['tot_amt'].agg([('team별 구매횟수', 'size')]).reset_index()
f = pd.pivot_table(cde, values='team별 구매횟수', index='custid', columns='team_nm', 
                   aggfunc= sum, fill_value=0).reset_index()
features.append(f); f.head()

team_nm,custid,상품개발영업2과,식품팀,의류패션팀,인터넷백화점,잡화가용팀
0,0,0,0,2,0,9
1,1,0,0,18,0,8
2,2,0,0,5,0,6
3,3,0,3,13,0,14
4,4,0,0,3,0,1


[파생변수 38-1] 팀별 총구매액

In [43]:
cde = tr.groupby(['team_nm','custid'])['tot_amt'].agg([('team별 구매량', 'sum')]).reset_index()
f = pd.pivot_table(cde, values='team별 구매량', index='custid', columns='team_nm', 
                   aggfunc= sum, fill_value=0).reset_index()
features.append(f); f.head()

team_nm,custid,상품개발영업2과,식품팀,의류패션팀,인터넷백화점,잡화가용팀
0,0,0,0,231000,0,1511000
1,1,0,0,1873100,0,899000
2,2,0,0,1081500,0,2669350
3,3,0,25800,746200,0,1528500
4,4,0,0,985000,0,60000


[파생변수 39] pc_nm 별 구매횟수

In [44]:
cde = tr.groupby(['pc_nm','custid'])['tot_amt'].agg([('pc별 구매횟수', 'size')]).reset_index()
f = pd.pivot_table(cde, values='pc별 구매횟수', index='custid', columns='pc_nm', 
                   aggfunc= sum, fill_value=0).reset_index()
features.append(f); f.head()

pc_nm,custid,(주)현스포츠아쌤,가구,가전,가전/문화,골프,남성잡화,남여구두,내의란제리,농산물,...,트래디셔널,트래디셔널캐쥬얼,트랜디 케쥬얼,트레디셔널,패션슈즈,패션시즌,피혁A,피혁B,핸드백,화장품
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,8
1,1,0,0,1,0,1,0,0,3,0,...,0,0,1,0,0,0,0,0,0,3
2,2,0,0,3,0,0,0,0,0,0,...,3,0,0,0,1,0,0,0,0,1
3,3,0,0,0,1,4,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


[파생변수 39-1] pc_nm별 총 구매액

In [45]:
cde = tr.groupby(['pc_nm','custid'])['tot_amt'].agg([('pc별 구매량', 'sum')]).reset_index()
f = pd.pivot_table(cde, values='pc별 구매량', index='custid', columns='pc_nm', 
                   aggfunc= sum, fill_value=0).reset_index()
features.append(f); f.head()

pc_nm,custid,(주)현스포츠아쌤,가구,가전,가전/문화,골프,남성잡화,남여구두,내의란제리,농산물,...,트래디셔널,트래디셔널캐쥬얼,트랜디 케쥬얼,트레디셔널,패션슈즈,패션시즌,피혁A,피혁B,핸드백,화장품
0,0,0,0,0,0,0,0,0,0,0,...,118000,0,0,0,0,0,0,0,0,1056000
1,1,0,0,350000,0,188000,0,0,77500,0,...,0,0,39000,0,0,0,0,0,0,167000
2,2,0,0,1200000,0,0,0,0,0,0,...,404500,0,0,0,232000,0,0,0,0,146000
3,3,0,0,0,27000,816000,0,0,0,0,...,0,29000,0,0,135000,0,0,0,0,187000
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,60000


[파생변수 40] 코너별 구매횟수

In [46]:
cde = tr.groupby(['corner_nm','custid'])['tot_amt'].agg([('corner별 구매횟수', 'size')]).reset_index()
f = pd.pivot_table(cde, values='corner별 구매횟수', index='custid', columns='corner_nm', 
                   aggfunc= sum, fill_value=0).reset_index()
features.append(f); f.head()

corner_nm,custid,DC캐주얼,ERREVNO,GBR 지원,L/B침구,N/B침구,NB제화,NB핸드백,TOP디자이너,"TV,VTR",...,행사핸드백,향수,헤어ACC,헤어악세사리,헤어액세사리,홈데코,홈쇼핑,화장잡화,화장품,훼미닌부틱
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


[파생변수 40-1] 코너별 구매액

In [47]:
cde = tr.groupby(['corner_nm','custid'])['tot_amt'].agg([('corner별 구매량', 'sum')]).reset_index()
f = pd.pivot_table(cde, values='corner별 구매량', index='custid', columns='corner_nm', 
                   aggfunc= sum, fill_value=0).reset_index()
features.append(f); f.head()

corner_nm,custid,DC캐주얼,ERREVNO,GBR 지원,L/B침구,N/B침구,NB제화,NB핸드백,TOP디자이너,"TV,VTR",...,행사핸드백,향수,헤어ACC,헤어악세사리,헤어액세사리,홈데코,홈쇼핑,화장잡화,화장품,훼미닌부틱
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,10000,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


[파생변수 41] buyer 별 총구매액

In [48]:
cde = tr.groupby(['buyer_nm','custid'])['tot_amt'].agg([('buyer별 구매횟수', 'size')]).reset_index()
f = pd.pivot_table(cde, values='buyer별 구매횟수', index='custid', columns='buyer_nm', 
                   aggfunc= sum, fill_value=0).reset_index()
features.append(f); f.head()

buyer_nm,custid,가구,가전,기타바이어,니트단품,도자기크리스탈,디자이너부띠끄,문화완구,생활용품,섬유,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,8
1,1,0,1,0,3,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,3
2,2,0,3,0,0,0,0,0,0,0,...,3,1,0,0,0,0,0,0,0,1
3,3,0,1,0,1,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,1
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


[파생변수 41-1] buy별 총구매액

In [49]:
cde = tr.groupby(['buyer_nm','custid'])['tot_amt'].agg([('buyer별 구매량', 'sum')]).reset_index()
f = pd.pivot_table(cde, values='buyer별 구매량', index='custid', columns='buyer_nm', 
                   aggfunc= sum, fill_value=0).reset_index()
features.append(f); f.head()

buyer_nm,custid,가구,가전,기타바이어,니트단품,도자기크리스탈,디자이너부띠끄,문화완구,생활용품,섬유,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,0,0,0,0,0,0,0,0,0,0,...,118000,0,0,0,0,0,0,0,0,1056000
1,1,0,350000,0,77500,0,0,0,0,39000,...,0,0,0,0,0,0,0,0,0,167000
2,2,0,1200000,0,0,0,0,0,0,0,...,404500,232000,0,0,0,0,0,0,0,146000
3,3,0,27000,0,68000,0,0,0,0,59000,...,29000,135000,0,0,0,0,0,0,0,187000
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,60000


[파생변수 42] 구매액의 변동 계수

In [50]:
f = tr.groupby(['custid'])['tot_amt'].agg([('구매액의 변동계수', lambda x: (x.std()/x.mean()))]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,구매액의 변동계수
0,0,0.748534
1,1,1.024865
2,2,2.321574
3,3,2.503148
4,4,0.958138


[파생변수 43] 구매코너 갯수

In [51]:
f = tr.groupby('custid')['corner_nm'].agg([('구매코너종류', lambda x: x.nunique())]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,구매코너종류
0,0,4
1,1,16
2,2,7
3,3,18
4,4,4


[파생변수 44] 구매 PC 갯수

In [52]:
f = tr.groupby('custid')['pc_nm'].agg([('구매pc종류', lambda x: x.nunique())]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,구매pc종류
0,0,4
1,1,13
2,2,7
3,3,14
4,4,3


[파생변수 45] 구매 part 갯수

In [53]:
f = tr.groupby('custid')['part_nm'].agg([('구매part종류', lambda x: x.nunique())]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,구매part종류
0,0,4
1,1,8
2,2,6
3,3,8
4,4,3


[파생변수 46] 구매 buyer 갯수

In [54]:
f = tr.groupby('custid')['buyer_nm'].agg([('구매buyer종류', lambda x: x.nunique())]).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,구매buyer종류
0,0,4
1,1,12
2,2,7
3,3,14
4,4,3


[파생변수 47] 남성파트구매비율

In [55]:
df = tr.groupby(['custid','part_nm'])['tot_amt'].agg([('tot_amt_part', 'sum')]).reset_index()
df['part_nm'] = np.where(df.part_nm.str.contains('남성'), '남성', '여성')
df = df.pivot_table(values='tot_amt_part', index=df.custid, columns='part_nm', aggfunc='first',fill_value=0).reset_index()
df['남성파트구매비율'] = (df['남성'] / (df['남성'] + df['여성'])) * 100
df = df.fillna(0)
features.append(df.iloc[:,[0,-1]])

[파생변수 48] 화장품 구매 비율

In [56]:
df = tr.groupby(['custid','corner_nm'])['tot_amt'].agg([('tot_amt_corner', 'sum')]).reset_index()
df['corner_nm'] = np.where(df.corner_nm.str.contains('화장품'), '화장품', '비화장품')
df = df.pivot_table(values='tot_amt_corner', index=df.custid, columns='corner_nm', aggfunc='first',fill_value=0).reset_index()
df['화장품비율'] = (df['화장품'] / (df['화장품'] + df['비화장품'])) * 100
df = df.fillna(0)
features.append(df.iloc[:,[0,-1]])

[파생변수 49] 할부대비평균실구매

In [57]:
tr['real_amt']= tr.tot_amt / tr.inst_mon
f = tr.groupby('custid')['real_amt'].agg([('할부대비평균실구매', 'mean')]).reset_index()
features.append(f)

[파생변수 50] str_nm별 구매액

In [58]:
df = pd.pivot_table(tr, index='custid', columns='str_nm', values='tot_amt', aggfunc=np.sum, fill_value=0).reset_index()
features.append(df)

[파생변수 51] 수입상품구매 총액

In [59]:
df = tr.groupby(['custid'])['import_flg'].agg([('수입상품구매총액', 'sum')]).reset_index()
features.append(df)

[파생변수 52] 마지막 거래후 경과일

In [60]:
from datetime import timedelta as dt
df = tr.groupby(['custid'])['sales_date'].agg([('sales_date', 'max')]).reset_index()
df['마지막거래후경과일'] = (pd.to_datetime(df.sales_date) - pd.to_datetime(tr.sales_date.min())).dt.days
features.append(df.iloc[:,[0,-1]])

[파생변수 53] 일평균 구매액

In [61]:
test2 = tr.groupby(['sales_date','custid'])['tot_amt'].agg([('day_amt', 'sum')]).reset_index()
test2 = test2.groupby(['custid'])['day_amt'].agg([('일평균구매액', 'mean')]).reset_index()
features.append(test2)

[파생변수 54] 일평균 구매건수

In [62]:
df = tr.groupby(['sales_date','custid'])['custid'].agg([('day_visit', 'count')]).reset_index()
f = df.groupby(['custid'])['day_visit'].agg([('일평균구매건수', 'mean')]).reset_index()
features.append(f)

[파생변수 55] 요일별 구매건수

In [63]:
def f2(x):
    k = x.dayofweek
    if k <= 2 :
        return('월화수_구매건수')
    elif 3 <= k < 5 :
        return('목금_구매건수')
    elif 5 <= k < 6 :
        return('토_구매건수')
    else :
        return('일_구매건수')    
    
tr['요일2'] = pd.to_datetime(tr.sales_date).apply(f2)
f = pd.pivot_table(tr, index='custid', columns='요일2', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

요일2,custid,목금_구매건수,월화수_구매건수,일_구매건수,토_구매건수
0,0,2,2,5,2
1,1,8,5,4,9
2,2,2,5,4,0
3,3,13,6,6,5
4,4,3,0,1,0


[파생변수 56] 매장 이용 다양성

In [64]:
n = tr.str_nm.nunique()
f = tr.groupby('custid')['str_nm'].agg([('매장이용다양성', lambda x: len(x.unique()) / n)]).reset_index()
features.append(f) ; f.head()

Unnamed: 0,custid,매장이용다양성
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.25


[파생변수 57] 임의 분기별 구매건수

In [65]:
def f1(x):
    k = x.month
    if 2 <= k <= 4 :
        return('234월_구매건수')
    elif 5 <= k <= 7 :
        return('567월_구매건수')
    elif 8 <= k <= 10 :
        return('8910월_구매건수')
    else :
        return('11121월_구매건수')    
    
tr['season2'] = pd.to_datetime(tr.sales_date).apply(f1)
f = pd.pivot_table(tr, index='custid', columns='season2', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

season2,custid,11121월_구매건수,234월_구매건수,567월_구매건수,8910월_구매건수
0,0,3,1,2,5
1,1,5,5,14,2
2,2,4,0,0,7
3,3,6,7,10,7
4,4,0,0,4,0


[파생변수 58] 시간대별 구매건수

In [66]:
def f2(x):
    if 901 <= x < 1200 :
        return('12시 이전_구매건수')
    elif 1200 <= x < 1400 :
        return('12~2시_구매건수')
    elif 1400 <= x < 1600 :
        return('2~4시_구매건수')
    elif 1600 <= x < 1800 :
        return('4~6시_구매건수')
    else :
        return('6시이후_구매건수')  

tr['timeslot2'] = tr.sales_time.apply(f2)
f = pd.pivot_table(tr, index='custid', columns='timeslot2', values='tot_amt',
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

timeslot2,custid,12~2시_구매건수,12시 이전_구매건수,2~4시_구매건수,4~6시_구매건수,6시이후_구매건수
0,0,2,0,0,0,9
1,1,3,4,5,9,5
2,2,1,3,1,2,4
3,3,5,3,3,6,13
4,4,0,0,4,0,0


[파생변수 59] pca_dummies (추가 공부 필요)

In [67]:
from sklearn.decomposition import PCA

def dummy_to_pca(tr, column_name:str) :
    max_seq = 300
    max_d = 15
    col_count = tr.groupby(column_name)[column_name].count()
    if len(col_count) > max_seq:
        tops = col_count.sort_values(ascending=False)[0:max_seq].index
        f =tr.loc[tr[column_name].isin(tops)][['custid', column_name]]
    else:
        tops = col_count.index
        f =tr[['custid', column_name]]
    f = pd.get_dummies(f, columns=[column_name])  # This method performs One-hot-encoding
    f = f.groupby('custid').mean()
    if len(tops) < max_d:
        max_d = len(tops)
    pca = PCA(n_components=max_d)
    pca.fit(f)
    cumsum = np.cumsum(pca.explained_variance_ratio_) #분산의 설명량을 누적합
    #print(cumsum)
    num_d = np.argmax(cumsum >= 0.99) + 1 # 분산의 설명량이 99%이상 되는 차원의 수
    if num_d == 1:
        num_d = max_d
    pca = PCA(n_components=num_d)    
    result = pca.fit_transform(f)
    result = pd.DataFrame(result)
    result.columns = [column_name + '_' + str(column) for column in result.columns]
    result.index = f.index
    return result.reset_index()

In [68]:
f = dummy_to_pca(tr, 'brd_nm'); features.append(f)
f = dummy_to_pca(tr, 'corner_nm'); features.append(f)
f = dummy_to_pca(tr, 'pc_nm'); features.append(f)
f = dummy_to_pca(tr, 'part_nm'); features.append(f)
f = dummy_to_pca(tr, 'buyer_nm'); features.append(f)
f = dummy_to_pca(tr, 'team_nm'); features.append(f)
f = dummy_to_pca(tr, 'goodcd'); features.append(f)
f = dummy_to_pca(tr, 'str_nm'); features.append(f)
tr['month'] = pd.to_datetime(tr['sales_date']).dt.month.astype(str)
f = dummy_to_pca(tr, 'month'); features.append(f)
tr['week'] = pd.to_datetime(tr['sales_date']).dt.dayofweek.astype(str)
f = dummy_to_pca(tr, 'week'); features.append(f)
tr['time'] = np.floor(tr['sales_time']/100).astype(int).astype(str)
f = dummy_to_pca(tr, 'time'); features.append(f)

[파생변수 60] str_nm별 size

In [69]:
f = pd.pivot_table(tr, index='custid', columns='str_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

str_nm,custid,무역점,본점,신촌점,천호점
0,0,5,6,0,0
1,1,4,22,0,0
2,2,8,0,0,3
3,3,4,0,0,26
4,4,4,0,0,0


[파생변수 61] part별 구매 규모

In [70]:
f = pd.pivot_table(tr, index='custid', columns='part_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

part_nm,custid,가정용품,가정용품파트,골프/유니캐쥬얼,공산품,공산품파트,남성의류,남성정장스포츠,로얄부띠끄,로얄부틱,...,여성캐쥬얼,영라이브,영어덜트캐쥬얼,영캐릭터,영플라자,인터넷백화점,잡화,잡화파트,"케주얼,구두,아동",패션잡화
0,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,6,0,0
1,1,1,1,0,0,0,0,1,6,0,...,0,0,0,0,0,0,0,5,8,0
2,2,4,0,3,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,3,2,0,0,3,0,3,0,0,0,...,0,7,3,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


[파생변수 62] buyer별 구매 규모

In [71]:
f = pd.pivot_table(tr, index='custid', columns='buyer_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

buyer_nm,custid,가구,가전,기타바이어,니트단품,도자기크리스탈,디자이너부띠끄,문화완구,생활용품,섬유,...,트래디셔널캐주얼,피혁A,피혁B,행사장(남성),행사장(아동스포츠),행사장(여성정장),행사장(여성캐주얼),행사장(여성캐쥬),행사장(잡화),화장품
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,8
1,1,0,1,0,3,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,3
2,2,0,3,0,0,0,0,0,0,0,...,3,1,0,0,0,0,0,0,0,1
3,3,0,1,0,1,0,0,0,0,1,...,1,1,0,0,0,0,0,0,0,1
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


[파생변수 63] 수입상품 구매비율

In [72]:
x = tr[tr['import_flg'] == 1].groupby('custid').size() / tr.groupby('custid').size()
f = x.reset_index().rename(columns={0: '수입상품_구매비율'}).fillna(0)
f.iloc[:,1] = (f.iloc[:,1]*100).apply(round, args=(1,))
features.append(f); f.head()

Unnamed: 0,custid,수입상품_구매비율
0,0,63.6
1,1,42.3
2,2,9.1
3,3,0.0
4,4,25.0


[파생변수 64] 할부개월수 총합

In [73]:
df = tr.groupby(['custid'])['import_flg'].agg([('inst_mon_sum', 'sum')]).reset_index()
features.append(df);df.head()

Unnamed: 0,custid,inst_mon_sum
0,0,7
1,1,11
2,2,1
3,3,0
4,4,1


[파생변수 65] 브랜드별 구매갯수

In [74]:
f = pd.pivot_table(tr, index='custid', columns='brd_nm', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

brd_nm,custid,012베네통,1492,1492마일즈,3N,96NY,@one,A-AND,AB.F.Z,ABFZ,...,휘나래,휠라슈즈,휠라의류,휠라인티모,휠라키즈,휠라행사,휴고보스,휴먼앤휴먼,흙침대,희원상사
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


[파생변수 66] 브랜드별 구매총액

In [75]:
f = pd.pivot_table(tr, index='custid', columns='brd_nm', values='tot_amt', 
                   aggfunc=np.sum, fill_value=0).reset_index()
features.append(f); f.head()

brd_nm,custid,012베네통,1492,1492마일즈,3N,96NY,@one,A-AND,AB.F.Z,ABFZ,...,휘나래,휠라슈즈,휠라의류,휠라인티모,휠라키즈,휠라행사,휴고보스,휴먼앤휴먼,흙침대,희원상사
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,85000,0,68000,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**[파생변수 67]** 브랜드별 총액 합

In [76]:
f = pd.pivot_table(tr, index='custid', columns='brd_nm', values='net_amt', 
                   aggfunc=np.sum, fill_value=0).reset_index()
features.append(f); f.head()

brd_nm,custid,012베네통,1492,1492마일즈,3N,96NY,@one,A-AND,AB.F.Z,ABFZ,...,휘나래,휠라슈즈,휠라의류,휠라인티모,휠라키즈,휠라행사,휴고보스,휴먼앤휴먼,흙침대,희원상사
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,80750,0,64600,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


[파생변수 68] good_cd 갯수

In [77]:
aa = tr[["custid","goodcd"]];aa.head()

bb = aa['goodcd']

cc = bb.value_counts().reset_index()
cc.head()

dd = cc.query("goodcd >= 40")['index']

dd

ee = tr.query("goodcd in @dd")

f = pd.pivot_table(ee, index='custid', columns= 'goodcd', values='tot_amt', 
                   aggfunc=np.size, fill_value=0).reset_index()
features.append(f); f.head()

Unnamed: 0,custid,goodcd
0,0,2116050008000
1,0,4125440008000
2,0,2116052008000
3,0,4106430119900
4,0,2139141008000


### Merge variables

In [84]:
dfX = DataFrame({'custid': df_train.custid.unique()})
for f in features :
    dfX = pd.merge(dfX, f, how='left')
print(dfX)

X_test2 = DataFrame({'custid': df_test.custid.unique()})
for f in features :
    X_test2 = pd.merge(X_test2, f, how='left')
print(X_test2)



       custid      총구매액  구매건수  평균구매가격  평균할부개월수  구매브랜드종류  내점일수  수입상품_구매비율  \
0           0   1742000    11  158364      2.8        7     7       63.6   
1           1   2772100    26  106619      2.5       19    16       42.3   
2           2   3750850    11  340986      3.5        7     7        9.1   
3           3   2300500    30   76683      2.7       21    13        0.0   
4           4   1045000     4  261250      4.5        4     2       25.0   
5           5   5053759    32  157930      1.9       21    21       18.8   
6           6   3785029    31  122098      1.8       23    11        9.7   
7           7   1223182    35   34948      1.4       20    23        8.6   
8           8   1267500    18   70417      2.1       13    10       11.1   
9           9   4956620    59   84011      1.0       35    34       10.2   
10         10   1347970    24   56165      1.9       18    15       16.7   
11         11   7173999    66  108697      1.7       19    42       12.1   
12         1

       custid      총구매액  구매건수  평균구매가격  평균할부개월수  구매브랜드종류  내점일수  수입상품_구매비율  \
0       30000   2078240    27   76972      1.2       16    15        7.4   
1       30001   4158320    27  154012      1.7       19    10        7.4   
2       30002   8007256   100   80073      2.1       42    29       25.0   
3       30003   1367820    43   31810      1.4       28    25       11.6   
4       30004   2890471    55   52554      1.3       23    32        7.3   
5       30005     57000     1   57000      3.0        1     1      100.0   
6       30006    589750     7   84250      1.3        5     5        0.0   
7       30007    295500     6   49250      1.3        6     4       16.7   
8       30008    400220     9   44469      1.4        5     7       11.1   
9       30009    239000     4   59750      3.0        2     2        0.0   
10      30010   1561600    11  141964      2.6       10     4        9.1   
11      30011   1716877    31   55383      1.0       22    16        6.5   
12      3001

In [85]:
dfX = dfX.drop("custid", axis = 1)
X_test2 = X_test2.drop("custid", axis = 1)

In [86]:
dfX = dfX.fillna(0)
X_test2 = X_test2.fillna(0)

### Build Models

In [87]:
# scikit-learn commonly used classes
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
import sys, warnings
from keras import models
from keras import layers
from keras import regularizers
from keras.callbacks import EarlyStopping
if not sys.warnoptions: warnings.simplefilter("ignore")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [88]:
X_train, X_test, y_train, y_test = train_test_split(dfX, y_tr, test_size=0.3, random_state=0)

In [89]:
%%time
# Learn XGB
from xgboost import XGBClassifier

xgb_pipe = Pipeline([('scalar', StandardScaler()),
                    ("xgb", XGBClassifier(n_jobs = -1, random_state = 0, tree_method = 'hist'))])
model_xgb = xgb_pipe.fit(X_train, y_train)

Wall time: 1min 3s


In [90]:
%%time

gbc_pipe = Pipeline([('scalar', StandardScaler()),
                    ("gbc", GradientBoostingClassifier(random_state = 0))])
model_gbc = gbc_pipe.fit(X_train, y_train)

Wall time: 4min 16s


In [91]:
lgm_pipe = Pipeline([('scalar', StandardScaler()),
                    ("lgm", LGBMClassifier(n_jobs = -1, random_state = 0, tree_method = 'hist', metric  = 'auc'))])
model_lgm = lgm_pipe.fit(X_train, y_train)

In [92]:
ada_pipe = Pipeline([('scalar', StandardScaler()),
                    ("ada", AdaBoostClassifier(random_state = 0))])
model_ada = ada_pipe.fit(X_train, y_train)

In [93]:
from scipy.stats.mstats import gmean

pred_xgb= model_xgb.fit(X_train, y_train).predict_proba(X_test)[:,1]
pred_gbc= model_gbc.fit(X_train, y_train).predict_proba(X_test)[:,1]
pred_lgm= model_lgm.fit(X_train, y_train).predict_proba(X_test)[:,1]
pred_ada= model_ada.fit(X_train, y_train).predict_proba(X_test)[:,1]

roc_auc_score(y_test, gmean([pred_xgb, pred_gbc, pred_lgm, pred_ada], axis=0))

0.7013940661194206

### Make Submissions

In [152]:
from scipy.stats.mstats import gmean

pred_xgb= model_xgb.fit(dfX, y_tr).predict_proba(X_test2)[:,1]
pred_gbc= model_gbc.fit(dfX, y_tr).predict_proba(X_test2)[:,1]
pred_lgm= model_lgm.fit(dfX, y_tr).predict_proba(X_test2)[:,1]
pred_ada= model_ada.fit(dfX, y_tr).predict_proba(X_test2)[:,1]

pred = gmean([pred_xgb, pred_lgm, pred_lgm, pred_ada])

fname = 'submissions_4en.csv'
submissions = pd.concat([pd.Series(IDtest, name="custid"), pd.Series(pred, name="gender")] ,axis=1)
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

'submissions_4en.csv' is ready to submit.


## End