# 문제 1. 
- 다음은 Travel Insurance 데이터 세트이다. 주어진 훈련 데이터 세트를 이용하여 고객별 여행보험 가입 여부 예측 모형을 만들고, 가장 높은 Accuracy 값을 가지는 최종 모델을 도출하시오. 해당 모델을 활용하여 보험 가입 여부 예측값을 계산하고 결괏값은 CSV 파일로 제출하시오.
- 결과 제출 양식 : 제출한 예측값의 ROC_AUC 점수 결과를 통해 영역별 배점에 따라 최종 점수가 반영될 예정

In [1]:
import pandas as pd 
pd.DataFrame({
    "ID"  : [0, 1, 2], 
    "y_pred" : [0.2543, 0.1324, 0.5892]
})

Unnamed: 0,ID,y_pred
0,0,0.2543
1,1,0.1324
2,2,0.5892


## 데이터 불러오기

In [2]:
import pandas as pd 
X_test = pd.read_csv("data/3회/304_x_test.csv")
X_train = pd.read_csv("data/3회/304_x_train.csv")
y_train = pd.read_csv("data/3회/304_y_train.csv")

## 데이터 정보

In [3]:
import pandas as pd 
pd.DataFrame({
    "변수" : list(X_train.columns) + ["TravelInsurance"], 
    "설명" : [
        "고객 ID", 
        "고객 나이", 
        "고객 직업 유형", 
        "대졸 여부", 
        "연 소득", 
        "가족 수",
        "만성질환 여부", 
        "FrequentFlyer 자격 여부", 
        "해외여행 경험",   
        "여행보험 가입 여부(0: 미가입, 1: 가입)"
    ]
})

Unnamed: 0,변수,설명
0,ID,고객 ID
1,Age,고객 나이
2,Employment Type,고객 직업 유형
3,GraduateOrNot,대졸 여부
4,AnnualIncome,연 소득
5,FamilyMembers,가족 수
6,ChronicDiseases,만성질환 여부
7,FrequentFlyer,FrequentFlyer 자격 여부
8,EverTravelledAbroad,해외여행 경험
9,TravelInsurance,"여행보험 가입 여부(0: 미가입, 1: 가입)"


## 데이터탐색

In [4]:
X_test.head()

Unnamed: 0,ID,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad
0,1569,27,Government Sector,Yes,500000,5,0,No,No
1,1344,25,Private Sector/Self Employed,Yes,1700000,3,0,Yes,No
2,1429,32,Government Sector,Yes,650000,3,0,No,No
3,896,33,Government Sector,Yes,600000,4,0,No,No
4,101,33,Private Sector/Self Employed,Yes,1500000,3,1,Yes,Yes


In [5]:
y_train.head()

Unnamed: 0,ID,TravelInsurance
0,1704,0
1,491,1
2,414,1
3,120,0
4,1268,1


In [6]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   497 non-null    int64 
 1   Age                  497 non-null    int64 
 2   Employment Type      497 non-null    object
 3   GraduateOrNot        497 non-null    object
 4   AnnualIncome         497 non-null    int64 
 5   FamilyMembers        497 non-null    int64 
 6   ChronicDiseases      497 non-null    int64 
 7   FrequentFlyer        497 non-null    object
 8   EverTravelledAbroad  497 non-null    object
dtypes: int64(5), object(4)
memory usage: 35.1+ KB


In [7]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   ID               1490 non-null   int64
 1   TravelInsurance  1490 non-null   int64
dtypes: int64(2)
memory usage: 23.4 KB


In [8]:
len(X_test['ID'].unique())

497

In [9]:
len(y_train['ID'].unique())

1490

In [10]:
y_train['TravelInsurance'].value_counts()

0    950
1    540
Name: TravelInsurance, dtype: int64

## 데이터셋 분리

In [11]:
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((1192, 9), (298, 9), (1192, 2), (298, 2))

## 컬럼 분리

In [12]:
X_tr_id = X_tr.pop('ID')
X_te_id = X_te.pop('ID')
y_tr_id = y_tr.pop('ID')
y_te_id = y_te.pop('ID')

X_tr_id.shape, X_te_id.shape, y_tr_id.shape, y_te_id.shape

((1192,), (298,), (1192,), (298,))

In [13]:
X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((1192, 8), (298, 8), (1192, 1), (298, 1))

## 데이터타입별로 분리

In [14]:
object_df = X_tr.select_dtypes(include=object)
object_df.head()

Unnamed: 0,Employment Type,GraduateOrNot,FrequentFlyer,EverTravelledAbroad
1166,Government Sector,Yes,No,No
1106,Private Sector/Self Employed,Yes,Yes,No
538,Private Sector/Self Employed,Yes,No,Yes
439,Private Sector/Self Employed,Yes,Yes,Yes
1478,Private Sector/Self Employed,No,No,No


In [15]:
import numpy as np

number_df = X_tr.select_dtypes(include=np.number)
number_df.head()

Unnamed: 0,Age,AnnualIncome,FamilyMembers,ChronicDiseases
1166,34,1300000,4,0
1106,26,600000,3,0
538,25,1400000,5,1
439,25,1400000,3,1
1478,25,1150000,6,0


In [16]:
for column in object_df.columns:
    print(object_df[column].value_counts())

Private Sector/Self Employed    856
Government Sector               336
Name: Employment Type, dtype: int64
Yes    1013
No      179
Name: GraduateOrNot, dtype: int64
No     949
Yes    243
Name: FrequentFlyer, dtype: int64
No     954
Yes    238
Name: EverTravelledAbroad, dtype: int64


## 모델 생성

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer

# 컬럼변환
transformer = ColumnTransformer([
    ('minmax_scaler', MinMaxScaler(), number_df.columns),
    ('ohc_encoder', OneHotEncoder(), object_df.columns)
], remainder='passthrough')

# 파이프라인
pipeline = Pipeline([
    ('preprocessor', transformer),
    ('model', LGBMClassifier(random_state=42, max_depth=3, learning_rate=0.001))
])

# 모델학습
pipeline.fit(X_tr, y_tr)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## 모델평가

In [18]:
from sklearn.metrics import roc_auc_score

def get_scores(model, X_train, X_test, y_train, y_test):
    train_prediction = model.predict_proba(X_train)
    test_prediction = model.predict_proba(X_test)
    train_score = roc_auc_score(y_train, train_prediction[:, 1])
    test_score = roc_auc_score(y_test, test_prediction[:, 1])
    return f'train : {train_score}, test : {test_score}'
get_scores(pipeline, X_tr, X_te, y_tr, y_te)

'train : 0.8151930639699965, test : 0.8149025413274118'

## 결과제출

In [19]:
predictions = pipeline.predict_proba(X_te)
result = pd.DataFrame({"ID":X_te_id, 'y_pred':predictions[:, 1]})
result.head()

Unnamed: 0,ID,y_pred
941,472,0.332026
297,799,0.350273
271,827,0.350273
774,814,0.350273
420,781,0.350273


In [20]:
result.to_csv('2.csv', index=False)