<a href="https://colab.research.google.com/github/HwangHanJae/Dacon_airline_classification/blob/main/voting_algo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

plt.style.use("ggplot")
warnings.filterwarnings("ignore")

In [2]:
train_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/airline_dataset/train.csv"
test_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/airline_dataset/test.csv"
sub_path = "/content/drive/MyDrive/Colab Notebooks/DataScience/Data/airline_dataset/sample_submission.csv"

df = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
#로그화
def log(df):
  log_feature = ["Age", "Flight Distance"]
  for feature in log_feature:
    df[feature] = np.log1p(df[feature])
  return df

In [4]:
#라벨인코딩 함수
from sklearn.preprocessing import LabelEncoder

def labeling(df):
    encoder = LabelEncoder()
    columns = ["Gender","Customer Type","Type of Travel", "Class"]
    for column in columns:
      df[column] = encoder.fit_transform(df[column])

    return df

In [5]:
def get_mean(df):
  df["Mean Delay in Minutes"] = (df["Departure Delay in Minutes"] + df['Arrival Delay in Minutes']) / 2
  return df

In [6]:
#불필요한 데이터 제거
def drop_columns(df):
  drop_col = ["id", "Departure Delay in Minutes","Arrival Delay in Minutes","Food and drink"]
  df = df.drop(drop_col, axis=1)
  return df

In [7]:
def preprocessing(df):
  df = log(df)
  df = labeling(df)
  df = get_mean(df)
  df = drop_columns(df)
  return df

In [8]:
#데이터 다시 불러오기
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [9]:
train = preprocessing(train)
test = preprocessing(test)

In [10]:
#타겟데이터 분리
X = train.drop("target", axis=1)
y = train['target']

In [13]:
#훈련/평가 데이터 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


print("훈련 데이터 세트 크기 : {}".format(X_train.shape))
print("훈련 레이블 세트 크기 : {}".format(y_train.shape))
print("테스트 데이터 세트 크기 : {}".format(X_test.shape))
print("테스트 레이블 세트 크기 : {}".format(y_test.shape))

훈련 데이터 세트 크기 : (2400, 20)
훈련 레이블 세트 크기 : (2400,)
테스트 데이터 세트 크기 : (600, 20)
테스트 레이블 세트 크기 : (600,)


In [21]:
#훈련/평가 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

gbm = GradientBoostingClassifier(n_estimators =1000,random_state=42)
lr = LogisticRegression(C= 1.5, penalty= 'l2')
rf = RandomForestClassifier(n_estimators = 1000, n_jobs= -1, random_state=42)
xgb = XGBClassifier(n_estimators = 1000, n_jobs=-1)
light = LGBMClassifier(n_estimators = 1000, n_jobs=-1, random_state=42)

models = [gbm,lr, rf, xgb, light]
basic_estimators = []
for model in models:
    basic_estimators.append((model.__class__.__name__, model))
    print(model.__class__.__name__)
    scores = cross_val_score(model, X, y, cv=10, scoring="accuracy")
    print("교차 검증별 정확도 : ", np.round(scores, 4))
    print("평균 검증 정확도 : ", np.round(np.mean(scores),4))
    print("\n")

GradientBoostingClassifier
교차 검증별 정확도 :  [0.93   0.92   0.91   0.9133 0.94   0.94   0.95   0.91   0.9433 0.9267]
평균 검증 정확도 :  0.9283


LogisticRegression
교차 검증별 정확도 :  [0.8567 0.8433 0.8267 0.8067 0.85   0.8567 0.8433 0.82   0.8267 0.8467]
평균 검증 정확도 :  0.8377


RandomForestClassifier
교차 검증별 정확도 :  [0.92   0.9067 0.9233 0.9067 0.9267 0.92   0.9333 0.9067 0.9233 0.94  ]
평균 검증 정확도 :  0.9207


XGBClassifier
교차 검증별 정확도 :  [0.9233 0.91   0.9067 0.9233 0.9367 0.94   0.95   0.9133 0.93   0.92  ]
평균 검증 정확도 :  0.9253


LGBMClassifier
교차 검증별 정확도 :  [0.9367 0.93   0.9267 0.9233 0.9367 0.9467 0.94   0.9133 0.93   0.9233]
평균 검증 정확도 :  0.9307




#기본 모델 보팅

In [22]:
from sklearn.ensemble import VotingClassifier
basic_voting = VotingClassifier(estimators =basic_estimators, voting='soft',n_jobs=-1)
basic_voting.fit(X, y)
basic_pred = basic_voting.predict(test)

In [23]:
submission = pd.read_csv(sub_path)
submission["target"] = basic_pred
submission.to_csv("basic_voting_model.csv", index=False)

#하이퍼 파라미터 튜닝

In [24]:
voting_models = []

In [27]:
from sklearn.model_selection import GridSearchCV
lr = LogisticRegression()
params = {"penalty":['l2',"l1"],
          "C" : [0.01,.01,1,1.5,2,3,5,10,20]}

lr_grid = GridSearchCV(lr, param_grid = params, n_jobs=-1, scoring='accuracy', cv= 10)
lr_grid.fit(X, y)
print("best param : {}".format(lr_grid.best_params_))
print("best score : {}".format(lr_grid.best_score_))
lr_best = lr_grid.best_estimator_
voting_models.append(lr_best)

best param : {'C': 1.5, 'penalty': 'l2'}
best score : 0.8386666666666667


In [28]:
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
params = {
    "n_estimators" : [1000],
    "max_depth" : [6, 8, 10, 12],
    "min_samples_leaf" : [8, 12, 15],
    "min_samples_split" : [8,12, 16]
     
}
rf_grid = GridSearchCV(rf, param_grid = params, n_jobs = -1, scoring='accuracy', cv=10)
rf_grid.fit(X,y)
print("best param : {}".format(rf_grid.best_params_))
print("best score : {:.3f}".format(rf_grid.best_score_))
rf_best = rf_grid.best_estimator_
voting_models.append(rf_best)

best param : {'max_depth': 10, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 1000}
best score : 0.896


In [29]:
xgb = XGBClassifier(n_jobs=-1)
params = {
    "n_estimators" : [1000],
    "max_depth" : [3,5,7,9],
    "min_child_weight" : [1,3,5,7]
}
xgb_grid = GridSearchCV(xgb, param_grid = params, n_jobs = -1, scoring='accuracy', cv=10)
xgb_grid.fit(X, y)
print("best param : {}".format(xgb_grid.best_params_))
print("best score : {:.3f}".format(xgb_grid.best_score_))
xgb_best = xgb_grid.best_estimator_
voting_models.append(xgb_best)

best param : {'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 1000}
best score : 0.932


In [30]:
light = LGBMClassifier(n_jobs=-1, random_state=42)
params = {
    "n_estimators" : [1000],
    "learning_rate" : [0.05,0.1],
    "num_leaves" : [16,32,64],
    "max_depth" : [64,100, 128],
    "min_child_samples" : [30,60, 100]
}
light_grid = GridSearchCV(light, param_grid = params, n_jobs = -1, scoring='accuracy', cv=10)
light_grid.fit(X, y)
print("best param : {}".format(light_grid.best_params_))
print("best score : {:.3f}".format(light_grid.best_score_))
light_best = light_grid.best_estimator_
voting_models.append(light_best)

best param : {'learning_rate': 0.1, 'max_depth': 64, 'min_child_samples': 60, 'n_estimators': 1000, 'num_leaves': 32}
best score : 0.938


In [31]:
gbm = GradientBoostingClassifier(random_state=42)
params = {
    "n_estimators" : [1000],
    "learning_rate" : [0.05, 0.1]
}
gbm_grid = GridSearchCV(gbm, param_grid = params, n_jobs = -1, scoring='accuracy', cv=10)
gbm_grid.fit(X, y)
print("best param : {}".format(gbm_grid.best_params_))
print("best score : {:.3f}".format(gbm_grid.best_score_))
gbm_best = gbm_grid.best_estimator_
voting_models.append(gbm_best)

best param : {'learning_rate': 0.05, 'n_estimators': 1000}
best score : 0.929


#보팅(Voting)

In [44]:
from sklearn.ensemble import VotingClassifier
estimators = []
for model in voting_models:
  estimators.append((str(model.__class__.__name__), model))

voting = VotingClassifier(estimators = estimators,voting='soft',n_jobs=-1)
voting

VotingClassifier(estimators=[('LogisticRegression', LogisticRegression(C=1.5)),
                             ('RandomForestClassifier',
                              RandomForestClassifier(max_depth=10,
                                                     min_samples_leaf=8,
                                                     min_samples_split=8,
                                                     n_estimators=1000,
                                                     n_jobs=-1,
                                                     random_state=42)),
                             ('XGBClassifier',
                              XGBClassifier(max_depth=9, n_estimators=1000,
                                            n_jobs=-1)),
                             ('LGBMClassifier',
                              LGBMClassifier(max_depth=64, min_child_samples=60,
                                             n_estimators=1000, num_leaves=32,
                                             random_st

In [45]:
from sklearn.model_selection import cross_val_score

for model in voting_models:
  print("#### {} ####".format(model.__class__.__name__))
  scores = cross_val_score(model, X, y, cv=10, scoring="accuracy")
  print("교차 검증별 정확도 : ", np.round(scores, 4))
  print("평균 검증 정확도 : ", np.round(np.mean(scores),4))

#### LogisticRegression ####
교차 검증별 정확도 :  [0.8567 0.8433 0.8267 0.8067 0.85   0.8567 0.8433 0.82   0.8267 0.8467]
평균 검증 정확도 :  0.8377
#### RandomForestClassifier ####
교차 검증별 정확도 :  [0.89   0.88   0.8967 0.8833 0.9    0.89   0.9033 0.8967 0.9067 0.9167]
평균 검증 정확도 :  0.8963
#### XGBClassifier ####
교차 검증별 정확도 :  [0.9333 0.9233 0.92   0.93   0.94   0.9433 0.9467 0.9167 0.9367 0.93  ]
평균 검증 정확도 :  0.932
#### LGBMClassifier ####
교차 검증별 정확도 :  [0.9433 0.94   0.93   0.94   0.94   0.95   0.95   0.91   0.94   0.9367]
평균 검증 정확도 :  0.938
#### GradientBoostingClassifier ####
교차 검증별 정확도 :  [0.9333 0.9167 0.9233 0.9067 0.94   0.94   0.94   0.9167 0.94   0.93  ]
평균 검증 정확도 :  0.9287


In [46]:
voting.fit(X,y)
pred = voting.predict(test)
submission = pd.read_csv(sub_path)
submission["target"] = pred
submission.to_csv("voting_tunning_model.csv", index=False)
#LB = 0.927