In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder, OrdinalEncoder
from xgboost import XGBClassifier

In [9]:
# NOTE(review): absolute, machine-specific location; change DATA_DIR to point
# at your own copy of the data.
DATA_DIR = "/home/ubuntu/try/file"
train_df = pd.read_csv(f"{DATA_DIR}/preprocessed_train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/preprocessed_test.csv")

# Column groups (nominal categorical, ordinal, numeric)
categorical_cols = ['Occupation','TypeofContact', 'Gender', 'Marital_Status']
ordinal_cols = ['ProductPitched', 'Designation']
numerical_cols = ['Age', 'CityTier', 'DurationOfPitch', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'MonthlyIncome', 'Car_Ownership', 'Has_Children']

# Category order for the ordinal columns, aligned position-by-position with
# ordinal_cols ('ProductPitched' first, then 'Designation').
orders = [['basic','standard','deluxe','super deluxe','king'],
          ['vp','avp','executive','manager','senior manager']]

# Fitted encoders are kept per column so the same mapping can be reused later.
# NOTE(review): test_df is loaded here but not encoded in this cell.
ordinal_encoders = {}
label_encoders = {}

# Ordinal encoding for the ordered columns.
# Only observed values are encoded: casting NaN to str would produce the
# literal 'nan', which is not in `orders` (OrdinalEncoder raises on it).
# Missing entries stay NaN so the imputation step can see them.
for col, order in zip(ordinal_cols, orders):
    oe = OrdinalEncoder(categories=[order])
    observed = train_df[col].notna()
    codes = pd.Series(np.nan, index=train_df.index, dtype=float)
    codes.loc[observed] = oe.fit_transform(train_df.loc[observed, [col]].astype(str)).ravel()
    train_df[col] = codes
    ordinal_encoders[col] = oe

# Label encoding for the nominal columns.
# Same reasoning: encoding 'nan' as a regular class would hide missing values
# from the most-frequent imputer. Codes are stored as float so NaN can remain.
for col in categorical_cols:
    le = LabelEncoder()
    observed = train_df[col].notna()
    codes = pd.Series(np.nan, index=train_df.index, dtype=float)
    codes.loc[observed] = le.fit_transform(train_df.loc[observed, col].astype(str))
    train_df[col] = codes
    label_encoders[col] = le


# X is another name for train_df (not a copy), so edits to X also change
# train_df. No feature/label split happens here; X keeps every column.
X = train_df


결측치 처리

In [10]:
# Missing-value handling (X is modified in place).

# Numeric columns: KNNImputer with its default settings.
numerical_imputer = KNNImputer()
numerical_imputer.fit(X[numerical_cols])
X[numerical_cols] = numerical_imputer.transform(X[numerical_cols])

# Nominal and ordinal columns: fill with the most frequent value per column.
discrete_cols = categorical_cols + ordinal_cols
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_imputer.fit(X[discrete_cols])
X[discrete_cols] = categorical_imputer.transform(X[discrete_cols])


In [12]:
# Number of remaining missing values per column (expected to be 0 after imputation).
X.isnull().sum()

Age                          0
TypeofContact                0
CityTier                     0
DurationOfPitch              0
Occupation                   0
Gender                       0
NumberOfPersonVisiting       0
NumberOfFollowups            0
ProductPitched               0
PreferredPropertyStar        0
NumberOfTrips                0
Passport                     0
PitchSatisfactionScore       0
Designation                  0
MonthlyIncome                0
Marital_Status               0
Car_Ownership                0
Has_Children                 0
ProdTaken                    0
NumberOfFollowups_outlier    0
Income_Category              0
dtype: int64

예측모델 함수 생성

In [None]:
# Label column: modelfit fits and scores on 'ProdTaken', and the grid search
# passes X[target] as y, so `target` must name the label, not an identifier.
target = 'ProdTaken'
# Row-identifier column name.
# NOTE(review): no 'id' column appears in the X.isnull() listing — confirm it exists.
IDcol = 'id'

In [None]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             label_col='ProdTaken'):
    """Optionally tune n_estimators with xgb.cv, fit `alg`, and print training accuracy.

    Parameters
    ----------
    alg : XGBClassifier-like estimator exposing get_xgb_params / get_params /
        set_params / fit / predict. It is modified in place (n_estimators and
        eval_metric are set on it, and it is fitted).
    dtrain : DataFrame containing the `predictors` columns and `label_col`.
    predictors : list of feature column names.
    useTrainCV : when True, run xgb.cv with early stopping and set
        n_estimators to the number of rounds it returns.
    cv_folds : number of folds passed to xgb.cv.
    early_stopping_rounds : early-stopping patience passed to xgb.cv.
    label_col : name of the label column. The same column is used for CV,
        fitting and scoring (previously CV read the global `target` while
        fit/scoring hard-coded 'ProdTaken').

    Returns
    -------
    None. The accuracy reported is measured on the training data itself.
    """
    labels = dtrain[label_col]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=labels.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='error', early_stopping_rounds=early_stopping_rounds)
        # One row per boosting round kept by early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # eval_metric is set on the estimator rather than passed to fit():
    # the fit() keyword is deprecated and rejected by newer xgboost releases.
    alg.set_params(eval_metric='error')
    alg.fit(dtrain[predictors], labels)

    # Predict training set
    dtrain_predictions = alg.predict(dtrain[predictors])

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))

하이퍼파라미터 튜닝: max_depth, min_child_weight

In [None]:
# Grid over tree depth and minimum child weight (3 x 3 = 9 candidates).
param_test1 = {
    'max_depth': range(3, 10, 3),
    'min_child_weight': range(1, 6, 2)
}

# Label column: the same column modelfit fits on.
label_col = 'ProdTaken'

# Feature columns: every numeric/bool column of X except the label and the
# row identifier. XGBoost cannot consume raw object columns, so those are
# left out.
# NOTE(review): `predictors` was not defined anywhere in the visible notebook;
# confirm this is the intended feature set.
predictors = [col for col in X.select_dtypes(include=['number', 'bool']).columns
              if col not in (label_col, 'id')]

# Stratified 5-fold CV with a fixed shuffle seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search around a fixed base configuration; max_depth and
# min_child_weight below are overridden by the grid.
base_model = XGBClassifier(learning_rate=0.1,
                           n_estimators=1000,
                           max_depth=5,
                           min_child_weight=1,
                           gamma=0,
                           subsample=0.8,
                           colsample_bytree=0.8,
                           objective='binary:logistic',
                           scale_pos_weight=1,
                           random_state=2019)
gsearch1 = GridSearchCV(estimator=base_model,
                        param_grid=param_test1,
                        scoring='roc_auc',  # candidates are ranked by ROC AUC
                        n_jobs=-1,
                        cv=skf)

# Fit: 9 candidates x 5 folds, each with 1000 trees.
gsearch1.fit(X[predictors], X[label_col])

# Results: ranked summary instead of the raw cv_results_ dict.
cv_summary = (
    pd.DataFrame(gsearch1.cv_results_)
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
)
print(cv_summary.to_string(index=False))
print(gsearch1.best_params_)
print(gsearch1.best_score_)