# 준비

## 모듈 import

In [1]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
from tqdm.notebook import tqdm
from sklearn.model_selection import RandomizedSearchCV

## 데이터

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- 데이터 경로 변수

In [5]:
DATA_PATH = "/content/drive/MyDrive/data/"
DATA_PATH

'/content/drive/MyDrive/data/'

- 시드값

In [6]:
SEED = 42

- 데이터 불러오기

In [7]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}train_tr_common_1.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}test_tr_common_1.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 12), (14940, 2), (441196, 12), (12225, 2))

- 공통 피처 파일 불러오기

In [8]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common_3.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common_3.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 433), (12225, 433))

# 결측치 처리

In [9]:
mask = train_ft.isnull().sum() > 0
train_ft.isnull().sum()[mask]

Unnamed: 0,0
3회이상count,4214
구매금액표준편차,388
구매금액왜도,840
구매금액첨도,1265


In [10]:

mask = test_ft.isnull().sum() > 0
test_ft.isnull().sum()[mask]

Unnamed: 0,0
3회이상count,3284
구매금액표준편차,242
구매금액왜도,599
구매금액첨도,954


In [11]:
train_ft["구매금액표준편차"] = train_ft["구매금액표준편차"].fillna(0)
test_ft["구매금액표준편차"] = test_ft["구매금액표준편차"].fillna(0)

train_ft["3회이상count"] = train_ft["3회이상count"].fillna(0)
test_ft["3회이상count"] = test_ft["3회이상count"].fillna(0)

In [12]:
train_ft["구매금액왜도"] = train_ft["구매금액왜도"].fillna(0)
train_ft["구매금액첨도"] = train_ft["구매금액첨도"].fillna(0)

test_ft["구매금액왜도"] = test_ft["구매금액왜도"].fillna(0)
test_ft["구매금액첨도"] = test_ft["구매금액첨도"].fillna(0)

In [13]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## 구매주기+거래개월

In [14]:
train_ft['짧은_구매주기'] = train_ft['구매주기'] < train_ft['구매주기'].median()
train_ft['긴_구매주기'] = train_ft['구매주기'] > train_ft['구매주기'].median()
# train_ft['많은_거래개월수'] = train_ft['거래개월수'] > train_ft['거래개월수'].median()
# train_ft['적은_거래개월수'] = train_ft['거래개월수'] < train_ft['거래개월수'].median()

# train_ft['구매주기짧고_거래개월많음'] = train_ft['짧은_구매주기'] & train_ft['많은_거래개월수']
# train_ft['구매주기짧고_거래개월적음'] = train_ft['짧은_구매주기'] & train_ft['적은_거래개월수']
# train_ft['구매주기긴_거래개월많음'] = train_ft['긴_구매주기'] & train_ft['많은_거래개월수']
# train_ft['구매주기긴_거래개월적음'] = train_ft['긴_구매주기'] & train_ft['적은_거래개월수']

In [15]:
test_ft['짧은_구매주기'] = train_ft['구매주기'] < train_ft['구매주기'].median()
test_ft['긴_구매주기'] = train_ft['구매주기'] > train_ft['구매주기'].median()
# test_ft['많은_거래개월수'] = train_ft['거래개월수'] > train_ft['거래개월수'].median()
# test_ft['적은_거래개월수'] = train_ft['거래개월수'] < train_ft['거래개월수'].median()

# test_ft['구매주기짧고_거래개월많음'] = train_ft['짧은_구매주기'] & train_ft['많은_거래개월수']
# test_ft['구매주기짧고_거래개월적음'] = train_ft['짧은_구매주기'] & train_ft['적은_거래개월수']
# test_ft['구매주기긴_거래개월많음'] = train_ft['긴_구매주기'] & train_ft['많은_거래개월수']
# test_ft['구매주기긴_거래개월적음'] = train_ft['긴_구매주기'] & train_ft['적은_거래개월수']

## 아동 카테고리

In [16]:
train_tr.head()

Unnamed: 0,ID,구매일시,지점코드,대분류,중분류,브랜드코드,구매가격,구매일,구매월,구매요일,구매시간,수정_중분류
0,train_13219,2004-05-01 09:40:00,A144000,공산품파트,차류,5100,59700,2004-05-01,5,5,9,차류
1,train_5590,2004-05-01 09:40:00,A144000,잡화파트,화장잡화,5101,17000,2004-05-01,5,5,9,화장잡화
2,train_7200,2004-05-01 10:20:00,A112000,공산품,용기보증,5100,34937,2004-05-01,5,5,10,용기보증
3,train_3010,2004-05-01 10:30:00,A373000,아동_스포츠,아동복,5105,19000,2004-05-01,5,5,10,아동복
4,train_10851,2004-05-01 10:30:00,A112000,가정용품,전화기_카세트,5110,215000,2004-05-01,5,5,10,전화기_카세트


In [17]:
train_ft.head()

Unnamed: 0,ID,거래횟수,내점일수,구매주기,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,12시이전구매비율,...,시간별_거래횟수비율_17,시간별_거래횟수비율_18,시간별_거래횟수비율_19,시간별_거래횟수비율_20,시간별_거래횟수비율_21,시간별_거래횟수비율_22,최대구매액_대분류,최소구매액_대분류,짧은_구매주기,긴_구매주기
0,train_0,16,11,23,0.05,0.25,0.4,0.3,3,0.3,...,0.1875,0.25,0.0625,0.0,0.0,0.0,남성정장스포츠,가정용품,False,True
1,train_1,41,21,16,0.357143,0.166667,0.357143,0.119048,3,0.214286,...,0.121951,0.219512,0.097561,0.0,0.0,0.0,여성캐주얼,공산품,True,False
2,train_2,102,56,6,0.464912,0.140351,0.175439,0.219298,0,0.122807,...,0.578947,0.684211,0.210526,0.0,0.0,0.0,명품잡화,명품잡화,True,False
3,train_3,191,92,3,0.379147,0.180095,0.236967,0.203791,3,0.161137,...,1.85,1.2,1.1,0.05,0.0,0.0,여성의류파트,케주얼_구두_아동,True,False
4,train_4,55,27,11,0.112903,0.612903,0.209677,0.064516,4,0.096774,...,0.238095,0.190476,0.285714,0.0,0.0,0.0,명품잡화,공산품파트,True,False


In [18]:
tmp = train_tr[train_tr['중분류'] == '유아복'].groupby('ID')['구매가격'].count().reset_index()
train_ft = pd.merge(train_ft, tmp, on='ID', how='left').rename(columns={'구매가격': '유아복_구매횟수'})
train_ft['유아복_구매횟수'] = train_ft['유아복_구매횟수'].fillna(0)

train_ft['유아복_구매횟수_비율'] = train_ft['유아복_구매횟수'] / train_ft['거래횟수']
train_ft.head()

Unnamed: 0,ID,거래횟수,내점일수,구매주기,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,12시이전구매비율,...,시간별_거래횟수비율_19,시간별_거래횟수비율_20,시간별_거래횟수비율_21,시간별_거래횟수비율_22,최대구매액_대분류,최소구매액_대분류,짧은_구매주기,긴_구매주기,유아복_구매횟수,유아복_구매횟수_비율
0,train_0,16,11,23,0.05,0.25,0.4,0.3,3,0.3,...,0.0625,0.0,0.0,0.0,남성정장스포츠,가정용품,False,True,0.0,0.0
1,train_1,41,21,16,0.357143,0.166667,0.357143,0.119048,3,0.214286,...,0.097561,0.0,0.0,0.0,여성캐주얼,공산품,True,False,0.0,0.0
2,train_2,102,56,6,0.464912,0.140351,0.175439,0.219298,0,0.122807,...,0.210526,0.0,0.0,0.0,명품잡화,명품잡화,True,False,0.0,0.0
3,train_3,191,92,3,0.379147,0.180095,0.236967,0.203791,3,0.161137,...,1.1,0.05,0.0,0.0,여성의류파트,케주얼_구두_아동,True,False,2.0,0.010471
4,train_4,55,27,11,0.112903,0.612903,0.209677,0.064516,4,0.096774,...,0.285714,0.0,0.0,0.0,명품잡화,공산품파트,True,False,0.0,0.0


In [19]:
tmp = test_tr[test_tr['중분류'] == '유아복'].groupby('ID')['구매가격'].count().reset_index()
test_ft = pd.merge(test_ft, tmp, on='ID', how='left').rename(columns={'구매가격': '유아복_구매횟수'})
test_ft['유아복_구매횟수'] = test_ft['유아복_구매횟수'].fillna(0)

test_ft['유아복_구매횟수_비율'] = test_ft['유아복_구매횟수'] / test_ft['거래횟수']
test_ft.head()

Unnamed: 0,ID,거래횟수,내점일수,구매주기,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,12시이전구매비율,...,시간별_거래횟수비율_19,시간별_거래횟수비율_20,시간별_거래횟수비율_21,시간별_거래횟수비율_22,최대구매액_대분류,최소구매액_대분류,짧은_구매주기,긴_구매주기,유아복_구매횟수,유아복_구매횟수_비율
0,test_0,7,5,32,0.285714,0.285714,0.428571,0.0,1,0.142857,...,0.0,0.0,0.0,0.0,잡화,생식품,False,True,0.0,0.0
1,test_1,4,3,54,0.0,0.0,0.75,0.25,0,0.25,...,0.0,0.0,0.0,0.0,남성의류,명품잡화,True,False,0.0,0.0
2,test_2,4,2,8,0.0,0.6,0.4,0.0,0,0.4,...,0.0,0.0,0.0,0.0,여성의류파트,잡화파트,True,False,0.0,0.0
3,test_3,28,16,16,0.0625,0.46875,0.34375,0.125,3,0.0,...,0.233333,0.0,0.0,0.0,패션잡화,가정용품,True,False,0.0,0.0
4,test_4,19,9,29,0.380952,0.285714,0.238095,0.095238,6,0.0,...,0.035533,0.0,0.0,0.0,여성캐주얼,공산품,True,False,0.0,0.0


In [20]:
agg_list =[
    ('신생아_포함', lambda x: x[(x.str.contains("신생아"))].count()),
    # ('캐릭터_포함', lambda x: x[(x.str.contains("캐릭터"))].count()),
]

tmp = train_tr.groupby("ID")["중분류"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left',on="ID")

tmp = test_tr.groupby("ID")["중분류"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left',on="ID")

## 내점일수+구매주기

In [21]:
train_ft['많은_내점일수'] = train_ft['내점일수'] > train_ft['내점일수'].median()
train_ft['적은_내점일수'] = train_ft['내점일수'] < train_ft['내점일수'].median()
test_ft['많은_내점일수'] = train_ft['내점일수'] > train_ft['내점일수'].median()
test_ft['적은_내점일수'] = train_ft['내점일수'] < train_ft['내점일수'].median()

train_ft['구매주기짧고_내점일수많음'] = train_ft['짧은_구매주기'] & train_ft['많은_내점일수']
train_ft['구매주기짧고_내점일수적음'] = train_ft['짧은_구매주기'] & train_ft['적은_내점일수']
train_ft['구매주기긴_내점일수많음'] = train_ft['긴_구매주기'] & train_ft['많은_내점일수']
train_ft['구매주기긴_내점일수적음'] = train_ft['긴_구매주기'] & train_ft['적은_내점일수']

test_ft['구매주기짧고_내점일수많음'] = train_ft['짧은_구매주기'] & train_ft['많은_내점일수']
test_ft['구매주기짧고_내점일수적음'] = train_ft['짧은_구매주기'] & train_ft['적은_내점일수']
test_ft['구매주기긴_내점일수많음'] = train_ft['긴_구매주기'] & train_ft['많은_내점일수']

# 0이고 차류 구매 횟수가 높을수록 미혼으로 예측할 확률이 높아진다?
test_ft['구매주기긴_내점일수적음'] = train_ft['긴_구매주기'] & train_ft['적은_내점일수']

## 주구매시간 4시 이후

In [22]:
train_tr["구매일시"] = pd.to_datetime(train_tr["구매일시"])
test_tr["구매일시"] = pd.to_datetime(test_tr["구매일시"])

In [23]:
train_ft['주구매시간_16시이후'] = train_tr['구매시간'] >= 16
test_ft['주구매시간_16시이후'] = test_tr['구매시간'] >= 16

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [24]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 444), (12225, 444))

- 추가 피처 만들어 보기

In [25]:
cols = [ col for col in train_ft.columns if col.startswith("수정_중_pivot_횟수비율_") ]

In [26]:
train_ft["중분류별_구매횟수비율_std"] = train_ft[cols].std(axis=1)
train_ft["중분류별_구매횟수비율_skew"] = train_ft[cols].skew(axis=1)
train_ft["중분류별_구매횟수비율_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["중분류별_구매횟수비율_std"] = test_ft[cols].std(axis=1)
test_ft["중분류별_구매횟수비율_skew"] = test_ft[cols].skew(axis=1)
test_ft["중분류별_구매횟수비율_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 447), (12225, 447))

## Feature Encoding

In [27]:
train_ft.shape, test_ft.shape

((14940, 447), (12225, 447))

In [28]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
최대구매액_대분류,28
최소구매액_대분류,28


In [29]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [30]:
import category_encoders as ce

In [31]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["주구매지점","최소구매액_대분류", "최대구매액_대분류"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["주구매지점","최소구매액_대분류", "최대구매액_대분류"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 507), (12225, 507))

- 문자열 피처 삭제

In [32]:
cols

['주구매지점', '최대구매액_대분류', '최소구매액_대분류']

In [33]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 504), (12225, 504))

In [34]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

## Feature Scaling

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [36]:
train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)
train_ft.head()

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,거래횟수,내점일수,구매주기,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,12시이전구매비율,12시이후_18시이전구매비율,...,최대구매액_대분류_19,최대구매액_대분류_20,최대구매액_대분류_21,최대구매액_대분류_22,최대구매액_대분류_23,최대구매액_대분류_24,최대구매액_대분류_25,최대구매액_대분류_26,최대구매액_대분류_27,최대구매액_대분류_28
0,-0.437126,-0.369867,0.002987,-1.029777,0.001191,0.838272,0.338186,0.109631,0.718557,-0.65415,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497
1,0.239394,0.14411,-0.356452,0.323951,-0.390607,0.620171,-0.552996,0.109631,0.269465,-0.542415,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497
2,1.890101,1.943028,-0.869935,0.798943,-0.514333,-0.304527,-0.059266,-1.64337,-0.20983,0.532172,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497
3,4.29851,3.793345,-1.02398,0.420933,-0.327474,0.008592,-0.135636,0.109631,-0.009001,0.054536,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497
4,0.618244,0.452496,-0.613193,-0.752532,1.70741,-0.130285,-0.821561,0.693965,-0.346226,0.591326,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497


# 정답 데이터

In [37]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


In [38]:
train_ft.shape, test_ft.shape

((14940, 504), (12225, 504))

# 중간 저장

In [39]:
train_ft.to_csv(f"{DATA_PATH}train_tmp.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_tmp.csv",index=False)

In [40]:
train_tmp = pd.read_csv(f"{DATA_PATH}train_tmp.csv")
test_tmp = pd.read_csv(f"{DATA_PATH}test_tmp.csv")

In [41]:
train_ft = train_tmp.copy()
test_ft = test_tmp.copy()

In [42]:
train_ft.shape, test_ft.shape

((14940, 504), (12225, 504))

# cv 점수 확인해보기

## KFold

In [43]:
cv = KFold(n_splits=5,shuffle=True, random_state=SEED)

In [44]:
model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(model,train_ft,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
np.mean(scores)

0.717874994753301

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장후에 제출한다.

In [None]:
pred = model.predict(test_ft)
pred

In [None]:
submit

Unnamed: 0,ID,target
0,test_0,0
1,test_1,0
2,test_2,0
3,test_3,1
4,test_4,1
...,...,...
12220,test_12220,1
12221,test_12221,0
12222,test_12222,0
12223,test_12223,0


In [None]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0
1,test_1,0
2,test_2,0
3,test_3,1
4,test_4,1
...,...,...
12220,test_12220,1
12221,test_12221,0
12222,test_12222,0
12223,test_12223,0


In [None]:
submit["target"] = pred_proba
submit

Unnamed: 0,ID,target
0,test_0,0.103547
1,test_1,0.395797
2,test_2,0.294680
3,test_3,0.853496
4,test_4,0.617971
...,...,...
12220,test_12220,0.746509
12221,test_12221,0.323029
12222,test_12222,0.068244
12223,test_12223,0.223477


- 예측 결과를 csv 파일로 저장하여 제출

In [None]:
submit.to_csv(f"{DATA_PATH}submit.csv",index=False)