In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_row", 50)
pd.set_option("display.max_column", 50)

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

import gc
import re
import os
import time
import random
import platform
import datetime as dt
from tqdm import tqdm
from itertools import combinations
from scipy.stats.mstats import gmean
import warnings; warnings.filterwarnings("ignore")

### 데이터 로드

In [2]:
train = pd.read_csv('./dataset/trainset.csv')
train = train.drop(columns=['매니저최초가입일' , '매니저최초서비스일','매니저주소'])
train['매니저생년월일'] = train['매니저생년월일'].apply(lambda x: int(str(x)[:-6]))

test = pd.read_csv('./dataset/testset_final.csv')
submission = pd.read_csv('./dataset/sample_submission_final.csv')

In [3]:
tr = pd.concat([train, test])

### 데이터 전처리

#### 1. 결측치 처리

In [4]:
#To Fill
col1 = {'평수':'없음',
        '결재형태' : '신용카드',
        '고객가입일' : '2020-06-23',
        '반려동물':'없음',
        '부재중여부': '모름',
        '우선청소' :'없음',
        '매니저사용휴대폰': '안드로이드',
        '매니저이동방법': '대중교통',
        'CS교육이수여부': 0,
        '청소교육이수여부': 0}

tr = tr.fillna(col1)

#### 2. 결측치 많은 행 삭제

In [5]:
#To drop
col2 = {'접수시각', '매칭성공여부'}
tr = tr.drop(col2, axis='columns')

#### 3. 월, 일 컬럼 생성 및 날짜 데이터 처리

In [6]:
tr['접수월'] = tr['접수일'].apply(lambda x: x[5:7]).astype(int)
tr['접수일일'] = tr['접수일'].apply(lambda x: x[-2:]).astype(int)

In [7]:
tr.최초서비스일 = pd.to_datetime(tr.최초서비스일)
tr.서비스일자 = pd.to_datetime(pd.to_datetime(tr.서비스일자))

#### 4. 추가 Feature 생성

#### 4-1. 서비스이용기간 feature 생성

In [8]:
tr['서비스이용기간'] = tr.서비스일자-tr.최초서비스일
tr['서비스이용기간'] = tr['서비스이용기간'].apply(lambda x: int(str(x)[:-14]))

#### 4-2. 매니저연령대 feature 생성

In [9]:
def age(x):
    if x < 1960:
        return 1
    elif x < 1970:
        return 2
    elif x < 1980:
        return 3
    elif x < 1990:
        return 4
    else:
        return 5
    
tr['매니저연령대'] = tr['매니저생년월일'].apply(lambda x: age(x))

#### 4-3. 반려동물여부 feature 생성

In [10]:
def pet(x):
    if x == '없음':
        return 1
    else:
        return 0
    
tr['반려동물여부'] = tr['반려동물'].apply(lambda x: pet(x))

#### 4-4. 종합지수 feature 생성

In [11]:
tr['종합지수'] = tr['부재중서비스가능여부'] - tr['쿠폰사용여부'] + tr['CS교육이수여부'] + tr['청소교육이수여부'] + tr['추천인여부'] + tr['반려동물여부']

#### 4-5. 서비스시간대 feature 생성

In [12]:
tr['서비스시작시간_2'] = tr['서비스시작시간'].apply(lambda x: int(x[:-6]))

In [13]:
def hour(x):
    if x <=12 and x >= 6:
        return 0
    else:
        return 1
    
tr['서비스시간대'] = tr['서비스시작시간_2'].apply(lambda x: hour(x))
tr = tr.drop(columns = '서비스시작시간_2', axis=1)

#### 4-6. 주거가중치 feature 생성

In [14]:
def house_size(x):
    if x == '10평대':
        return 1
    elif x == '20평대':
        return 2
    elif x == '30평대':
        return 3
    elif x == '모름':
        return 0
    else:
        return 4

tr['평수_1'] = tr['평수'].apply(lambda x: house_size(x))

In [15]:
def house_type(x):
    if x == '일반주택':
        return 1
    else:
        return 2
    
tr['주거형태_1'] = tr['주거형태'].apply(lambda x: house_type(x))
tr['주거가중치'] = tr['주거형태_1'] + tr['평수_1']
tr = tr.drop(columns =['평수_1','주거형태_1'], axis=1)

In [16]:
tr

Unnamed: 0,SEQ,접수일,장기서비스여부,최초서비스일,전체회차,현재회차,서비스일자,서비스시작시간,서비스종료시간,기존고객여부,결재형태,서비스주소,주거형태,평수,고객가입일,반려동물,부재중여부,우선청소,쿠폰사용여부,매니저생년월일,매니저성별,매니저사용휴대폰,매니저이동방법,근무가능지역,CS교육이수여부,청소교육이수여부,부재중서비스가능여부,추천인여부,접수월,접수일일,서비스이용기간,매니저연령대,반려동물여부,종합지수,서비스시간대,주거가중치
0,T06420,2019-07-09,1,2019-07-15,3,3,2019-07-29,9:00:00,13:00:00,1,무통장입금,충남 천안시,일반주택,없음,2019-04-22,없음,모름,없음,0,1956,0,안드로이드,대중교통,서울,0.0,0.0,1,0,7,9,14,1,1,2.0,0,5
1,T15430,2019-07-12,1,2019-07-15,4,3,2019-08-20,10:00:00,16:00:00,1,무통장입금,충남 천안시,일반주택,없음,2019-05-21,없음,모름,없음,0,1976,0,안드로이드,대중교통,천안/아산,1.0,1.0,1,0,7,12,36,3,1,4.0,0,5
2,T23790,2020-11-18,1,2020-11-23,10,6,2020-12-11,9:00:00,13:00:00,1,신용카드,충남 천안시,일반주택,40평대이상,2020-10-14,없음,0.0,없음,0,1970,0,안드로이드,대중교통,천안/아산,0.0,0.0,0,0,11,18,18,3,1,1.0,0,5
3,T18740,2020-11-09,1,2020-11-09,4,4,2020-11-30,9:00:00,14:00:00,1,무통장입금,서울 강서구,일반주택,30평대,2020-02-06,없음,0.0,방,0,1956,0,안드로이드,대중교통,서울,0.0,0.0,1,0,11,9,21,1,1,2.0,0,4
4,T08451,2020-02-25,1,2020-03-31,2,2,2020-04-07,9:00:00,13:00:00,1,무통장입금,충남 천안시,일반주택,없음,2019-09-20,없음,모름,없음,0,1966,0,안드로이드,대중교통,천안/아산,0.0,0.0,1,0,2,25,7,2,1,2.0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4239,T31589,2020-09-28,1,2020-10-05,8,1,2020-10-05,9:00:00,13:00:00,1,신용카드,충남 천안시,일반주택,40평대이상,2020-02-13,없음,0.0,방,1,1961,0,안드로이드,대중교통,천안/아산,0.0,0.0,1,0,9,28,0,2,1,1.0,0,5
4240,T36236,2020-01-01,1,2020-01-07,4,1,2020-01-07,14:00:00,18:00:00,0,무통장입금,충남 천안시,일반주택,없음,2020-01-07,없음,모름,없음,0,1976,0,안드로이드,자차,천안/아산,0.0,0.0,1,0,1,1,0,3,1,2.0,1,5
4241,T34729,2020-12-10,1,2020-12-17,4,1,2020-12-17,9:00:00,13:00:00,1,신용카드,충남 천안시,일반주택,40평대이상,2019-04-11,없음,0.0,방,0,1961,0,안드로이드,대중교통,천안/아산,0.0,0.0,1,0,12,10,0,2,1,2.0,0,5
4242,T53830,2020-01-02,1,2020-01-09,8,6,2020-02-20,9:00:00,13:00:00,1,무통장입금,충남 아산시,일반주택,없음,2019-06-20,없음,모름,없음,1,1964,0,기타,자차,천안/아산,0.0,0.0,0,0,1,2,42,2,1,0.0,0,5


#### 4-7. 서비스주소_광역 feature 생성

In [17]:
tr = tr.replace('수도권', '서울/경기/인천')
tr = tr.replace('천안/아산', '충남/천안/아산')

In [18]:
region = []
for i in tr.서비스주소 :
    region.append(i[:2])

tr['서비스주소_광역'] = region

In [19]:
tr.서비스주소_광역.value_counts()

충남    18164
서울     3558
강원     3325
광주      825
부산      805
경기      533
경남       43
Name: 서비스주소_광역, dtype: int64

#### 4-8. 서비스시간 feature 생성

In [20]:
tr['서비스시작시간'] = pd.to_datetime(pd.to_datetime(tr.서비스시작시간))
tr['서비스종료시간'] = pd.to_datetime(pd.to_datetime(tr.서비스종료시간))

In [21]:
tr['서비스시간'] = tr['서비스종료시간'].dt.hour - tr['서비스시작시간'].dt.hour 

In [22]:
tr['접수일'] = pd.to_datetime(pd.to_datetime(tr['접수일']))

#### 4-9. 서비스대기기간 feature 생성

In [23]:
tr['서비스대기기간'] = (tr['최초서비스일'] - tr['접수일']).apply(lambda x: int(str(x)[:-14]))

#### 4-10. 접수요일 feature 생성

In [24]:
tr['접수요일'] = tr['접수일'].dt.weekday

#### 4-11. 매니저나이 feature 생성

In [25]:
tr['매니저나이'] = 2021 - tr['매니저생년월일']

#### 4-12. 서비스이용기준(장.단기 고객여부) feature 생성

In [26]:
train['전체회차'].describe()

count    23009.000000
mean         4.642357
std          2.323255
min          1.000000
25%          4.000000
50%          4.000000
75%          4.000000
max         30.000000
Name: 전체회차, dtype: float64

In [27]:
def service(x):
    if x >= 5:
        return '장기'
    else:
        return '단기'

train['서비스이용기준'] = train['전체회차'].apply(lambda x: service(x))

In [28]:
tr['고객가입일'] = pd.to_datetime(pd.to_datetime(tr.고객가입일))

#### 4-13. 고객탐색기간 feature 생성

In [29]:
tr['고객탐색기간'] = tr['최초서비스일'] - tr['고객가입일']

In [30]:
tr['고객탐색기간'] = tr['고객탐색기간'].apply(lambda x : int(str(x)[:-14]))

In [31]:
tr = tr.astype({'매니저나이': 'object'})
tr = tr.astype({'매니저연령대': 'object'})

In [32]:
col3 = {'서비스시작시간','서비스종료시간', '고객가입일'}      
tr = tr.drop(col3, axis='columns')

In [33]:
tl = ['접수일', '최초서비스일', '서비스일자', '고객탐색기간', '매니저성별', '매니저생년월일', '접수월', '접수일일', '접수요일']
for i in tl:
    tr[i] = tr[i].astype('object')

In [34]:
##aggregate

In [35]:
tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27253 entries, 0 to 4243
Data columns (total 39 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SEQ         27253 non-null  object 
 1   접수일         27253 non-null  object 
 2   장기서비스여부     27253 non-null  int64  
 3   최초서비스일      27253 non-null  object 
 4   전체회차        27253 non-null  int64  
 5   현재회차        27253 non-null  int64  
 6   서비스일자       27253 non-null  object 
 7   기존고객여부      27253 non-null  int64  
 8   결재형태        27253 non-null  object 
 9   서비스주소       27253 non-null  object 
 10  주거형태        27253 non-null  object 
 11  평수          27253 non-null  object 
 12  반려동물        27253 non-null  object 
 13  부재중여부       27253 non-null  object 
 14  우선청소        27253 non-null  object 
 15  쿠폰사용여부      27253 non-null  int64  
 16  매니저생년월일     27253 non-null  object 
 17  매니저성별       27253 non-null  object 
 18  매니저사용휴대폰    27253 non-null  object 
 19  매니저이동방법     27253 non-null

## 학습 데이터 준비

In [36]:
train_data = tr[0:len(train)]
train_label = train.매칭성공여부
train_data['매칭성공여부'] = train_label

In [37]:
tr1 = tr.drop('SEQ', axis=1)
tr1 = pd.get_dummies(tr1)
tr = pd.concat([tr.SEQ,tr1],axis=1)

In [38]:
train_data = tr.iloc[:len(train),:]
test_data = tr.iloc[len(train):,:]

train_data = train_data.drop('SEQ', axis='columns')
test_data = test_data.drop('SEQ', axis='columns')

train_label = train.매칭성공여부

In [39]:
seed = 42

#### 데이터 불균형 해소 (SMOTE)

In [41]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: joblib, imbalanced-learn, imblearn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0 joblib-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
from imblearn.over_sampling import *
from imblearn.under_sampling import *
from imblearn.combine import *

smote = SMOTETomek(sampling_strategy=0.5, random_state=seed)
train_data_over,train_label_over = smote.fit_resample(train_data,train_label)

In [None]:
train_data_over.reset_index(drop = True, inplace=True)
train_label_over.reset_index(drop = True, inplace=True)

## ExtraTree분류 모델 학습

In [None]:
n_it = 60
random_state=42
Kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# 모델 훈련 함수 정의
def return_fitted_model(model, train, target):
    scores = []
    models = []
    for iter_count, (train_idx, valid_idx) in enumerate(Kfold.split(train, target)):

        X_train, X_valid = train[train_idx], train[valid_idx]
        y_train, y_valid = target[train_idx], target[valid_idx]

        model.fit(X_train, y_train)

        pred = model.predict(X_valid)
        ra_score = roc_auc_score(y_valid, pred)
        scores.append(ra_score)
        models.append(model)
    return model, np.mean(scores)


In [None]:
model = ExtraTreesClassifier(random_state=seed, n_jobs=-1)

In [None]:
model, avg_score = return_fitted_model(model, train_data_over.values, train_label_over.values)
print('score: ',avg_score)

In [None]:
model.predict_proba(test_data)[:,1]

## 제출용 데이터 생성

In [None]:
pd.DataFrame({'SEQ': test.SEQ , 
              'pred': model.predict_proba(test_data)[:,1]}).to_csv("check.csv", index=False)