# 항공사 고객 예측 경진대회 (주최 : Dacon)

### 데이터 상세 설명

#### 이 데이터는 항공사 조직에서 제공한 것입니다. 회사의 실제 이름은 다양한 목적으로 인해 제공되지 않아 Invisistico Airlines라는 이름이 사용되었습니다.
#### 데이터 세트는 이미 함께 비행한 고객의 세부 정보로 구성됩니다. 다양한 상황과 비행 데이터에 대한 고객의 피드백이 통합되었습니다.
#### 이 데이터 세트의 주요 목적은 다른 매개변수 값의 세부사항을 고려할 때 미래의 고객이 서비스에 만족할지 여부를 예측하는 것입니다.


### 라이브러리

In [2]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets, model_selection, linear_model, ensemble,neighbors
from sklearn.metrics import mean_squared_error,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.ensemble import RandomForestClassifier 
from xgboost import plot_importance
from xgboost import XGBClassifier
from vecstack import stacking
from vecstack import StackingTransformer

#### train 데이터 정리하기

In [3]:
df_train = pd.read_csv("train.csv", encoding = 'utf-8')
df_train

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,target
0,1,Female,disloyal Customer,22,Business travel,Eco,1599,3,0,3,...,4,5,4,4,4,5,4,0,0.0,0
1,2,Female,Loyal Customer,37,Business travel,Business,2810,2,4,4,...,5,5,4,2,1,5,2,18,18.0,0
2,3,Male,Loyal Customer,46,Business travel,Business,2622,1,1,1,...,4,4,4,4,5,4,3,0,0.0,1
3,4,Female,disloyal Customer,24,Business travel,Eco,2348,3,3,3,...,3,2,4,5,3,4,3,10,2.0,0
4,5,Female,Loyal Customer,58,Business travel,Business,105,3,3,3,...,4,4,4,4,4,4,5,0,0.0,1
5,6,Female,Loyal Customer,42,Business travel,Business,1780,4,4,4,...,5,5,4,5,3,5,4,0,0.0,1
6,7,Female,disloyal Customer,20,Business travel,Eco,2980,3,3,3,...,5,1,5,3,1,4,5,12,34.0,0
7,8,Male,Loyal Customer,38,Business travel,Eco,1386,4,3,3,...,4,1,2,2,1,5,4,0,0.0,1
8,9,Female,Loyal Customer,51,Business travel,Business,2346,2,2,2,...,3,3,3,3,3,3,3,0,0.0,1
9,10,Male,Loyal Customer,8,Personal Travel,Eco,1336,2,5,2,...,5,5,5,5,4,5,5,0,0.0,0


In [4]:
# train data 정보확인

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 24 columns):
id                                   3000 non-null int64
Gender                               3000 non-null object
Customer Type                        3000 non-null object
Age                                  3000 non-null int64
Type of Travel                       3000 non-null object
Class                                3000 non-null object
Flight Distance                      3000 non-null int64
Seat comfort                         3000 non-null int64
Departure/Arrival time convenient    3000 non-null int64
Food and drink                       3000 non-null int64
Gate location                        3000 non-null int64
Inflight wifi service                3000 non-null int64
Inflight entertainment               3000 non-null int64
Online support                       3000 non-null int64
Ease of Online booking               3000 non-null int64
On-board service                  

In [5]:
# 문자로 구성된 column 값들 모두 숫자로 바꿔주기

df_train[df_train.columns[df_train.dtypes=='O']] = df_train[df_train.columns[df_train.dtypes=='O']].astype(str).apply(LabelEncoder().fit_transform)

In [6]:
df_train.head()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,target
0,1,0,1,22,0,1,1599,3,0,3,...,4,5,4,4,4,5,4,0,0.0,0
1,2,0,0,37,0,0,2810,2,4,4,...,5,5,4,2,1,5,2,18,18.0,0
2,3,1,0,46,0,0,2622,1,1,1,...,4,4,4,4,5,4,3,0,0.0,1
3,4,0,1,24,0,1,2348,3,3,3,...,3,2,4,5,3,4,3,10,2.0,0
4,5,0,0,58,0,0,105,3,3,3,...,4,4,4,4,4,4,5,0,0.0,1


In [7]:
# train 데이터셋 -> 학습시킬값 나누기

X_train = df_train.iloc[:,1:-1]
Y_train = df_train.target

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 24 columns):
id                                   3000 non-null int64
Gender                               3000 non-null int32
Customer Type                        3000 non-null int32
Age                                  3000 non-null int64
Type of Travel                       3000 non-null int32
Class                                3000 non-null int32
Flight Distance                      3000 non-null int64
Seat comfort                         3000 non-null int64
Departure/Arrival time convenient    3000 non-null int64
Food and drink                       3000 non-null int64
Gate location                        3000 non-null int64
Inflight wifi service                3000 non-null int64
Inflight entertainment               3000 non-null int64
Online support                       3000 non-null int64
Ease of Online booking               3000 non-null int64
On-board service                     3

#### test 데이터 정리하기

In [9]:
df_test = pd.read_csv("test.csv", encoding='utf-8')
df_test

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,1,Female,Loyal Customer,61,Personal Travel,Eco,2037,1,1,1,...,5,5,5,5,5,3,5,3,51,58.0
1,2,Female,disloyal Customer,27,Business travel,Business,1846,1,1,1,...,1,1,3,4,5,4,4,1,0,0.0
2,3,Female,Loyal Customer,52,Business travel,Business,1622,4,4,4,...,4,5,5,5,5,4,5,3,0,0.0
3,4,Male,Loyal Customer,54,Business travel,Business,3534,4,4,4,...,5,2,2,2,2,5,2,1,0,0.0
4,5,Female,Loyal Customer,41,Business travel,Eco,1471,4,3,3,...,4,4,3,1,2,5,4,4,0,0.0
5,6,Female,Loyal Customer,67,Personal Travel,Eco Plus,1079,4,2,2,...,4,4,3,2,1,2,1,4,0,0.0
6,7,Female,Loyal Customer,48,Business travel,Eco,365,5,2,2,...,3,5,5,5,5,1,5,4,0,0.0
7,8,Male,Loyal Customer,59,Personal Travel,Business,1109,5,4,4,...,4,4,4,4,4,4,4,3,1,2.0
8,9,Female,Loyal Customer,47,Personal Travel,Eco,611,4,5,4,...,5,4,4,4,2,3,4,3,2,2.0
9,10,Male,Loyal Customer,20,Business travel,Business,3013,4,4,4,...,2,2,4,5,3,1,4,2,3,0.0


In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 23 columns):
id                                   2000 non-null int64
Gender                               2000 non-null object
Customer Type                        2000 non-null object
Age                                  2000 non-null int64
Type of Travel                       2000 non-null object
Class                                2000 non-null object
Flight Distance                      2000 non-null int64
Seat comfort                         2000 non-null int64
Departure/Arrival time convenient    2000 non-null int64
Food and drink                       2000 non-null int64
Gate location                        2000 non-null int64
Inflight wifi service                2000 non-null int64
Inflight entertainment               2000 non-null int64
Online support                       2000 non-null int64
Ease of Online booking               2000 non-null int64
On-board service                  

In [11]:
# 문자로 구성된 column 값들 모두 숫자로 바꿔주기

df_test[df_test.columns[df_test.dtypes=='O']] = df_test[df_test.columns[df_test.dtypes=='O']].astype(str).apply(LabelEncoder().fit_transform)

In [12]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 23 columns):
id                                   2000 non-null int64
Gender                               2000 non-null int32
Customer Type                        2000 non-null int32
Age                                  2000 non-null int64
Type of Travel                       2000 non-null int32
Class                                2000 non-null int32
Flight Distance                      2000 non-null int64
Seat comfort                         2000 non-null int64
Departure/Arrival time convenient    2000 non-null int64
Food and drink                       2000 non-null int64
Gate location                        2000 non-null int64
Inflight wifi service                2000 non-null int64
Inflight entertainment               2000 non-null int64
Online support                       2000 non-null int64
Ease of Online booking               2000 non-null int64
On-board service                     2

In [13]:
# 학습시킬값 나누기

X_test = df_test.iloc[:,1:]

### 머신러닝으로 예측하기

#### 랜덤포레스트

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()     
classifier.fit(X_train, Y_train)         

RandomForestClassifier()

In [15]:
prediction = classifier.predict(X_test)   # 예측
print(prediction)

[1 0 1 ... 0 1 1]
