# 라이브러리 호출

In [1]:
import numpy as np # Numpy
import pandas as pd # Pandas
import matplotlib as mpl #Matplotlib 세팅용
import matplotlib.pyplot as plt # 시각화 도구
import seaborn as sns # 시각화 도구
from sklearn.preprocessing import StandardScaler, MinMaxScaler # 스케일링
from sklearn.model_selection import train_test_split # 데이터셋 분리
from sklearn.model_selection import KFold # KFold 교차검증
from sklearn.cluster import KMeans # 클러스터링
from sklearn.metrics import silhouette_score # 실루엣 점수
import xgboost as xgb # XGBoost
from sklearn.model_selection import GridSearchCV # 그리드 서치
from sklearn.metrics import accuracy_score, precision_score # 평가 지표
from sklearn.metrics import recall_score, confusion_matrix, roc_auc_score, f1_score # 평가 지표
from imblearn.combine import SMOTEENN, SMOTETomek # 복합샘플링
from hyperopt import hp, fmin, tpe, Trials # HyperOPT
from nltk.corpus import names # nltk
import nltk
nltk.download("names")
from nltk import NaiveBayesClassifier
from scipy import stats
from collections import Counter
import random

import warnings # 경고문 제거용


%matplotlib inline
%config Inlinebackend.figure_format = 'retina'

# 한글 폰트 설정
mpl.rc('font', family='D2Coding')
# 유니코드에서 음수 부호 설정
mpl.rc('axes', unicode_minus = False)

warnings.filterwarnings('ignore')
sns.set(font="D2Coding", rc={"axes.unicode_minus":False}, style='darkgrid')
plt.rc('figure', figsize=(10,8))

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


# 데이터로딩

In [2]:
data = pd.read_excel('train_test_na_filled.xlsx', sheet_name='Train')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Cabin1        8590 non-null   object 
 4   Cabin2        8590 non-null   float64
 5   Combi         8590 non-null   object 
 6   Cabin3        8590 non-null   object 
 7   Cabin         8590 non-null   object 
 8   Destination   8693 non-null   object 
 9   Age           8693 non-null   int64  
 10  VIP           8693 non-null   bool   
 11  RoomService   8693 non-null   int64  
 12  FoodCourt     8693 non-null   int64  
 13  ShoppingMall  8693 non-null   int64  
 14  Spa           8693 non-null   int64  
 15  VRDeck        8693 non-null   int64  
 16  Name          8493 non-null   object 
 17  Transported   8693 non-null   bool   
dtypes: bool(3), float64(1), int6

In [3]:
test = pd.read_excel('train_test_na_filled.xlsx', sheet_name='Test')
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin1,Cabin2,Combi,Cabin3,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0013_01,Earth,True,G,3.0,G3,S,G/3/S,TRAPPIST-1e,27,False,0,0,0,0,0,Nelly Carsoning,
1,0018_01,Earth,False,F,4.0,F4,S,F/4/S,TRAPPIST-1e,19,False,0,9,0,2823,0,Lerome Peckers,
2,0019_01,Europa,True,C,0.0,C0,S,C/0/S,55 Cancri e,31,False,0,0,0,0,0,Sabih Unhearfus,
3,0021_01,Europa,False,C,1.0,C1,S,C/1/S,TRAPPIST-1e,38,False,0,6652,0,181,585,Meratz Caltilter,
4,0023_01,Earth,False,F,5.0,F5,S,F/5/S,TRAPPIST-1e,20,False,10,0,635,0,0,Brence Harperez,


# 탐색

## ANOVA 분석

In [4]:
numeric_data = [column for column in data.select_dtypes(["int", "float"])]

for column in numeric_data:
  df_anova = data[[column,'Transported']]
  grouped_anova = df_anova.groupby(['Transported'])
  f_value, p_value = stats.f_oneway(grouped_anova.get_group(True)[column],
                                    grouped_anova.get_group(False)[column])
  result = ""
  if p_value < 0.05:
    result = "{}은/는 예측에 중요한 feature입니다.".format(column)
  else:
    result = "{}은/는 예측에 중요하지않은 feature입니다.".format(column)
  print(result)

Cabin2은/는 예측에 중요하지않은 feature입니다.
Age은/는 예측에 중요한 feature입니다.
RoomService은/는 예측에 중요한 feature입니다.
FoodCourt은/는 예측에 중요한 feature입니다.
ShoppingMall은/는 예측에 중요하지않은 feature입니다.
Spa은/는 예측에 중요한 feature입니다.
VRDeck은/는 예측에 중요한 feature입니다.


In [5]:
def outlier_detection_train(df, n, columns):
    rows = []
    will_drop_train = []
    for col in columns:
        Q1 = np.nanpercentile(df[col], 25)
        Q3 = np.nanpercentile(df[col], 75)
        IQR = Q3 - Q1
        outlier_point = 1.5 * IQR
        rows.extend(df[(df[col] < Q1 - outlier_point)|(df[col] > Q3 + outlier_point)].index)
    for r, c in Counter(rows).items():
        if c >= n: will_drop_train.append(r)
    return will_drop_train

In [6]:
data.drop('Cabin2', inplace=True, axis=1)

In [7]:
test.drop('Cabin2', inplace=True, axis=1)

In [8]:
data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin1', 'Combi', 'Cabin3',
       'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported'],
      dtype='object')

# 또처리~

## 이상치 확인 및 제거

In [9]:
def outlier_detection_train(df, n, columns):
    rows = []
    will_drop_train = []
    for col in columns:
        Q1 = np.nanpercentile(data[col], 25)
        Q3 = np.nanpercentile(data[col], 75)
        IQR = Q3 - Q1
        outlier_point = 1.5 * IQR
        rows.extend(df[(df[col] < Q1 - outlier_point)|(df[col] > Q3 + outlier_point)].index)
    for r, c in Counter(rows).items():
        if c >= n: will_drop_train.append(r)
    return will_drop_train

In [10]:
will_drop_train = outlier_detection_train(data, 5 ,data.select_dtypes(["float", "int"]).columns)
will_drop_train

[338,
 1390,
 6469,
 7038,
 1936,
 3317,
 3980,
 4762,
 6509,
 7007,
 7065,
 7294,
 7689,
 7957,
 8064]

In [11]:
data.drop(will_drop_train, inplace = True, axis = 0)

## 새로운 feature 생성

### 총 사용금액, 그리고 사용한 금액에 따라 poor, middle, rich로 분류

In [12]:
data["Total"] = data["RoomService"] + data["FoodCourt"] + data["ShoppingMall"] + data["Spa"] +\
data["VRDeck"] 
data["RichPoor"] = data["Total"].apply(lambda x: "poor" if x < 5000\
                                       else ("middle" if x>=5000 and x<20000 else "rich"))

test["Total"] = test["RoomService"] + test["FoodCourt"] + test["ShoppingMall"] + test["Spa"] +\
test["VRDeck"] 
test["RichPoor"] = test["Total"].apply(lambda x: "poor" if x < 5000\
                                       else ("middle" if x>=5000 and x<20000 else "rich"))

### 그룹여행객 여부

In [13]:
data["GroupId"] = data["PassengerId"].apply(lambda x: x.split("_")[0])
test["GroupId"] = test["PassengerId"].apply(lambda x: x.split("_")[0])
data["GroupNo"] = data["PassengerId"].apply(lambda x: x.split("_")[1])
test["GroupNo"] = test["PassengerId"].apply(lambda x: x.split("_")[1])

train_g = data[data["GroupId"].duplicated()]["GroupId"]
test_g = test[test["GroupId"].duplicated()]["GroupId"]
data["Group"] = data["GroupId"].apply(lambda x: x in train_g.values)
test["Group"] = test["GroupId"].apply(lambda x: x in test_g.values)

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8678 entries, 0 to 8692
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   PassengerId   8678 non-null   object
 1   HomePlanet    8678 non-null   object
 2   CryoSleep     8678 non-null   bool  
 3   Cabin1        8575 non-null   object
 4   Combi         8575 non-null   object
 5   Cabin3        8575 non-null   object
 6   Cabin         8575 non-null   object
 7   Destination   8678 non-null   object
 8   Age           8678 non-null   int64 
 9   VIP           8678 non-null   bool  
 10  RoomService   8678 non-null   int64 
 11  FoodCourt     8678 non-null   int64 
 12  ShoppingMall  8678 non-null   int64 
 13  Spa           8678 non-null   int64 
 14  VRDeck        8678 non-null   int64 
 15  Name          8478 non-null   object
 16  Transported   8678 non-null   bool  
 17  Total         8678 non-null   int64 
 18  RichPoor      8678 non-null   object
 19  GroupI

### 나이브 베어스를 활용한 이름을 통한 성별 분류

In [15]:
# Train_Data
names_train_data = []
for n in data["Name"]:
    n = str(n)
    a = n.split()
    names_train_data.append(a[0])

#### 훈련셋

In [16]:
# 이름과 성 분리
names_train_data = []
for i in data["Name"]:
    i = str(i)
    a = i.split()
    names_train_data.append(a[0])

In [17]:
# NLTK의 names 파일을 활용하여 이름을 여성과 남성으로 분리
labeled_names = [(name, "female") for name in names.words("female.txt")] + \
[(name, "male") for name in names.words("male.txt")]
random.shuffle(labeled_names)

In [18]:
# 이름의 마지막 단어 가져오는 함수
def gender_features(word):
    return {'last_letter': word[-1]}

In [19]:
names_train_data

['Maham',
 'Juanna',
 'Altark',
 'Solam',
 'Willy',
 'Sandie',
 'Billex',
 'Candra',
 'Andona',
 'Erraiam',
 'Altardr',
 'Wezena',
 'Berers',
 'Reney',
 'Elle',
 'Justie',
 'Flats',
 'Carry',
 'Alus',
 'Lyde',
 'Philda',
 'Almary',
 'Glendy',
 'Mollen',
 'Breney',
 'Mael',
 'Terta',
 'nan',
 'Penton',
 'Karard',
 'Anyoni',
 'Ceros',
 'Ginia',
 'Coobix',
 'Cinets',
 'Dontch',
 'Ziba',
 'Luse',
 'Marina',
 'Loise',
 'Jorgie',
 'Margia',
 'Ankalik',
 'Jodye',
 'Kayne',
 'Cassa',
 'Zelowl',
 'Mass',
 'Sony',
 'Vivia',
 'Elaney',
 'Elson',
 'Okulas',
 'Instab',
 'Zinoces',
 'Warry',
 'Shanya',
 'Sterry',
 'nan',
 'Colatz',
 'Diandy',
 'Ninaha',
 'Celine',
 'Velyne',
 'Cinst',
 'nan',
 'Meremy',
 'Nelly',
 'Thell',
 'Gorn',
 'Aldibah',
 'Conk',
 'Pon',
 'Spuri',
 'Dellie',
 'Totse',
 'Eaturs',
 'nan',
 'Coren',
 'Furudah',
 'Jodye',
 'Stmeal',
 'Heremy',
 'Deanne',
 'Tinez',
 'Gracy',
 'Stald',
 'Tiney',
 'Alchium',
 'Doria',
 'Leence',
 'Aliey',
 'Thewis',
 'Book',
 'Ritany',
 'Arlen',
 'Sh

In [20]:
# 나이브 베어스 모델 학습
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
classifier = NaiveBayesClassifier.train(featuresets)

In [21]:
# 성별 feature 생성
names_gender = []
for i in names_train_data:
    names_gender.append(classifier.classify(gender_features(i)))

# create new column called 'gender'    
data["Gender"] = names_gender

In [22]:
data.Gender[data.Name.isna()] = 'female'

In [23]:
data.Gender[data.Name.isna()].unique()

array(['female'], dtype=object)

#### 테스트셋

In [24]:
# 이름과 성 분리
names_test_data = []
for i in test["Name"]:
    i = str(i)
    a = i.split()
    names_test_data.append(a[0])

In [25]:
# NLTK의 names 파일을 활용하여 이름을 여성과 남성으로 분리
labeled_names = [(name, "female") for name in names.words("female.txt")] + \
[(name, "male") for name in names.words("male.txt")]
random.shuffle(labeled_names)

In [26]:
# 나이브 베어스 모델 학습
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
classifier = NaiveBayesClassifier.train(featuresets)

In [27]:
# 이름의 마지막 단어 가져오는 함수
def gender_features(word):
    return {'last_letter': word[-1]}

In [28]:
# 성별 feature 생성
names_gender = []
for i in names_test_data:
    names_gender.append(classifier.classify(gender_features(i)))

In [29]:
test["Gender"] = pd.Series(names_gender)

In [30]:
test.Gender[test.Name.isna()] = 'female'

In [31]:
data.Gender[data.Name.isna()].unique()

array(['female'], dtype=object)

## Cabin 결측값들 제거

In [32]:
data.dropna(axis=0, inplace=True)

## 필요없는 features 제거

In [33]:
target = data['Transported']
data = data.drop(["PassengerId", "Name", "Cabin", "Combi", "Total", "GroupId",\
                  "GroupNo", "Transported"], axis = 1)
test = test.drop(["PassengerId", "Name", "Cabin", "Combi", "Total", "GroupId",\
                  "GroupNo", "Transported"], axis = 1)


## 째려보기

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8375 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   HomePlanet    8375 non-null   object
 1   CryoSleep     8375 non-null   bool  
 2   Cabin1        8375 non-null   object
 3   Cabin3        8375 non-null   object
 4   Destination   8375 non-null   object
 5   Age           8375 non-null   int64 
 6   VIP           8375 non-null   bool  
 7   RoomService   8375 non-null   int64 
 8   FoodCourt     8375 non-null   int64 
 9   ShoppingMall  8375 non-null   int64 
 10  Spa           8375 non-null   int64 
 11  VRDeck        8375 non-null   int64 
 12  RichPoor      8375 non-null   object
 13  Group         8375 non-null   bool  
 14  Gender        8375 non-null   object
dtypes: bool(3), int64(6), object(6)
memory usage: 875.1+ KB


In [35]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   HomePlanet    4277 non-null   object
 1   CryoSleep     4277 non-null   bool  
 2   Cabin1        4214 non-null   object
 3   Cabin3        4214 non-null   object
 4   Destination   4277 non-null   object
 5   Age           4277 non-null   int64 
 6   VIP           4277 non-null   bool  
 7   RoomService   4277 non-null   int64 
 8   FoodCourt     4277 non-null   int64 
 9   ShoppingMall  4277 non-null   int64 
 10  Spa           4277 non-null   int64 
 11  VRDeck        4277 non-null   int64 
 12  RichPoor      4277 non-null   object
 13  Group         4277 non-null   bool  
 14  Gender        4277 non-null   object
dtypes: bool(3), int64(6), object(6)
memory usage: 413.6+ KB


## 원핫인코딩

### boolean 타입 피처들 object로 캐스팅

In [36]:
bool_data = [column for column in data.select_dtypes(["bool"])]

bool_data

['CryoSleep', 'VIP', 'Group']

In [37]:
data["VIP"] = data["VIP"].replace(to_replace = [False, True], 
                                  value = ["No", "Yes"]).astype("object")
data["CryoSleep"] = data["CryoSleep"].replace(to_replace = [False, True], 
                                  value = ["No", "Yes"]).astype("object")
data["Group"] = data["Group"].replace(to_replace = [False, True], 
                                  value = ["No", "Yes"]).astype("object")

test["VIP"] = test["VIP"].replace(to_replace = [False, True], 
                                  value = ["No", "Yes"]).astype("object")
test["CryoSleep"] = test["CryoSleep"].replace(to_replace = [False, True], 
                                  value = ["No", "Yes"]).astype("object")
test["Group"] = test["Group"].replace(to_replace = [False, True], 
                                  value = ["No", "Yes"]).astype("object")

In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8375 entries, 0 to 8692
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   HomePlanet    8375 non-null   object
 1   CryoSleep     8375 non-null   object
 2   Cabin1        8375 non-null   object
 3   Cabin3        8375 non-null   object
 4   Destination   8375 non-null   object
 5   Age           8375 non-null   int64 
 6   VIP           8375 non-null   object
 7   RoomService   8375 non-null   int64 
 8   FoodCourt     8375 non-null   int64 
 9   ShoppingMall  8375 non-null   int64 
 10  Spa           8375 non-null   int64 
 11  VRDeck        8375 non-null   int64 
 12  RichPoor      8375 non-null   object
 13  Group         8375 non-null   object
 14  Gender        8375 non-null   object
dtypes: int64(6), object(9)
memory usage: 1.0+ MB


In [39]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   HomePlanet    4277 non-null   object
 1   CryoSleep     4277 non-null   object
 2   Cabin1        4214 non-null   object
 3   Cabin3        4214 non-null   object
 4   Destination   4277 non-null   object
 5   Age           4277 non-null   int64 
 6   VIP           4277 non-null   object
 7   RoomService   4277 non-null   int64 
 8   FoodCourt     4277 non-null   int64 
 9   ShoppingMall  4277 non-null   int64 
 10  Spa           4277 non-null   int64 
 11  VRDeck        4277 non-null   int64 
 12  RichPoor      4277 non-null   object
 13  Group         4277 non-null   object
 14  Gender        4277 non-null   object
dtypes: int64(6), object(9)
memory usage: 501.3+ KB


### 더미화

In [40]:
# drop_first 첫번째 범주는 제거하고 더미화
# 다른 범주가 전부 0이면 자동적으로 첫번째 범주가 1인걸 알 수 있기에 사용
df = pd.get_dummies(data, drop_first = True)
t_df = pd.get_dummies(test, drop_first=True)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8375 entries, 0 to 8692
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Age                        8375 non-null   int64
 1   RoomService                8375 non-null   int64
 2   FoodCourt                  8375 non-null   int64
 3   ShoppingMall               8375 non-null   int64
 4   Spa                        8375 non-null   int64
 5   VRDeck                     8375 non-null   int64
 6   HomePlanet_Europa          8375 non-null   uint8
 7   HomePlanet_Mars            8375 non-null   uint8
 8   CryoSleep_Yes              8375 non-null   uint8
 9   Cabin1_B                   8375 non-null   uint8
 10  Cabin1_C                   8375 non-null   uint8
 11  Cabin1_D                   8375 non-null   uint8
 12  Cabin1_E                   8375 non-null   uint8
 13  Cabin1_F                   8375 non-null   uint8
 14  Cabin1_G                

In [42]:
t_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Age                        4277 non-null   int64
 1   RoomService                4277 non-null   int64
 2   FoodCourt                  4277 non-null   int64
 3   ShoppingMall               4277 non-null   int64
 4   Spa                        4277 non-null   int64
 5   VRDeck                     4277 non-null   int64
 6   HomePlanet_Europa          4277 non-null   uint8
 7   HomePlanet_Mars            4277 non-null   uint8
 8   CryoSleep_Yes              4277 non-null   uint8
 9   Cabin1_B                   4277 non-null   uint8
 10  Cabin1_C                   4277 non-null   uint8
 11  Cabin1_D                   4277 non-null   uint8
 12  Cabin1_E                   4277 non-null   uint8
 13  Cabin1_F                   4277 non-null   uint8
 14  Cabin1_G                

## 스케일링

In [43]:
scaler = StandardScaler()
scaler.fit(df)
df = scaler.transform(df)
t_df = scaler.transform(t_df)

# 데이터셋 분리

In [44]:
X_train, X_test, y_train, y_test = train_test_split(df, target, shuffle = True, random_state = 109)
X_train, X_val, y_train, y_val = train_test_split(df, target, shuffle = True,\
                                                  test_size=0.15, random_state = 109)

# 알고리즘명