In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# step 1. 라이브러리 및 데이터 불러오기
- 라이브러리 버전을 반드시 확인한다.
- print(np.__version__)
- print(pd.__version__)

In [2]:
import numpy as np
import pandas as pd

# Report the installed NumPy and pandas versions (NumPy first, then pandas),
# each on its own line, so the environment can be reproduced.
print(np.__version__, pd.__version__, sep='\n')

1.20.3
1.3.5


In [3]:
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
from lightgbm import LGBMClassifier


import time
import warnings
# Install an 'ignore' filter so that all Python warnings are suppressed from this point on.
warnings.filterwarnings('ignore')

## 데이터 불러오기

In [5]:
# Load the three competition files: training set, test set and the sample
# submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
        "../input/spaceship-titanic/sample_submission.csv",
    )
)

# Constants defined up front for the rest of the notebook.
random_state, folds, strategy = 42, 5, 'median'

# Print the train frame, the shape of the test frame, and the test frame,
# each starting on a new line.
print(train, test.shape, test, sep='\n')

     PassengerId HomePlanet CryoSleep     Cabin    Destination   Age    VIP  \
0        0001_01     Europa     False     B/0/P    TRAPPIST-1e  39.0  False   
1        0002_01      Earth     False     F/0/S    TRAPPIST-1e  24.0  False   
2        0003_01     Europa     False     A/0/S    TRAPPIST-1e  58.0   True   
3        0003_02     Europa     False     A/0/S    TRAPPIST-1e  33.0  False   
4        0004_01      Earth     False     F/1/S    TRAPPIST-1e  16.0  False   
...          ...        ...       ...       ...            ...   ...    ...   
8688     9276_01     Europa     False    A/98/P    55 Cancri e  41.0   True   
8689     9278_01      Earth      True  G/1499/S  PSO J318.5-22  18.0  False   
8690     9279_01      Earth     False  G/1500/S    TRAPPIST-1e  26.0  False   
8691     9280_01     Europa     False   E/608/S    55 Cancri e  32.0  False   
8692     9280_02     Europa     False   E/608/S    TRAPPIST-1e  44.0  False   

      RoomService  FoodCourt  ShoppingMall     Spa 

## Column 설명
- PassengerId는 "AAAA_BB"형태이다. AAAA는 그룹을 나타내고, BB는 그룹안에서의 개개인을 지칭한다.
- HomePlanet은 고향별을 나타낸다.
- CryoSleep은 여행중에 냉동수면 여부를 나타낸다.
- Cabin은 승객이 머무르는 객실을 나타낸다. deck/number/side 형태로 구성된다. side의 P는 Port를, S는 Starboard를 나타낸다.
- Age는 승객의 나이이다.
- VIP는 승객이 special VIP service 비용을 지불했는지를 보여준다.
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck는 각각 승객이 우주선의 해당 편의시설에서 지불한 금액을 나타낸다.
- Name은 승객의 이름으로, first name, last name 순이다.
- Transported 칼럼이 타깃 데이터이다. 승객이 다른 차원으로 운송되었는지 여부를 보여준다. 훈련 데이터에는 Transported 칼럼이 있고, 테스트 데이터에는 Transported 칼럼이 없다. 테스트 데이터의 Transported 값을 예측해야 한다.

# Step 2. 탐색적 자료 분석(EDA)
- 데이터 시각화
- 산점도, 막대 그래프 등
- 그래프 해석해서 설명을 달아야 한다.
- 약간의 데이터 전처리
- 결측치는 없는지

## train data

In [6]:
# Preview the first five rows of the training set; as the last expression
# of the cell, the frame is rendered as the cell output.
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
# Print the column names, non-null counts and dtypes of the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [8]:
# Print a colour-coded size summary of the training set.
# ANSI escape codes used below:
#   \033[94m blue, \033[95m magenta, \033[92m green, \033[93m yellow,
#   \033[0m reset (appended to every line so the colour does not bleed
#   into whatever is printed next).
print(f'\033[94mNumber of rows in train data: {train.shape[0]}\033[0m')
print(f'\033[95mNumber of columns in train data: {train.shape[1]}\033[0m')
# count() gives the non-null cells per column; the sum is the total number
# of non-missing values in the frame.
print(f'\033[92mNumber of non-missing values in train data: {train.count().sum()}\033[0m')
# isna().sum().sum() is the total number of missing cells (not rows).
print(f'\033[93mNumber of missing values in train data: {train.isna().sum().sum()}\033[0m')

[94mNumber of rows in train data: 8693
[95mNumber of columns in train data: 14
[92mNumber of vlaues in train data: 119378
[93mNumber of with missing values in train date: 2324


- 훈련 데이터에는 14개의 열이 있고, 8,693개의 행(샘플)이 있다.
- 훈련 데이터에는 결측치가 아닌 값이 119,378개 있다.
- 훈련 데이터 안에는 2,324개의 결측치가 있다.
- Transported 열은 훈련 데이터 안에서만 사용할 수 있다.

In [9]:
# Count the missing values in each training column and print the columns
# from most to fewest missing.
print(train.isna().sum().sort_values(ascending = False))

CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Name            200
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
PassengerId       0
Transported       0
dtype: int64


- 14개의 열 중에서 CryoSleep 열 안에 217개로 가장 많은 결측치가 있다. 

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# training set's numeric columns.
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


- 개수, 평균, 표준편차, 최솟값, 최댓값, 4분위수 등의 기초 통계량을 확인한다.

## test data

In [11]:
# Preview the first five rows of the test set; as the last expression of
# the cell, the frame is rendered as the cell output.
test.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [12]:
# Print the column names, non-null counts and dtypes of the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [13]:
# Print a colour-coded size summary of the test set.
# ANSI escape codes: \033[94m blue, \033[95m magenta, \033[92m green,
# \033[93m yellow; \033[0m resets the colour at the end of every line so it
# does not bleed into whatever is printed next.
print(f'\033[94mNumber of rows in test data: {test.shape[0]}\033[0m')
print(f'\033[95mNumber of columns in test data: {test.shape[1]}\033[0m')
# count() gives the non-null cells per column; the sum is the total number
# of non-missing values in the frame.
print(f'\033[92mNumber of non-missing values in test data: {test.count().sum()}\033[0m')
# This is the total number of missing cells, not the number of rows that
# contain a missing value, so the label says "missing values".
print(f'\033[93mNumber of missing values in test data: {test.isna().sum().sum()}\033[0m')

[94mNumber of rows in test data: 4277
[95mNumber of columns in test data: 13
[92mNumber of vlaues in test data: 54484
[93mNumber of rows with missing values in test date: 1117


- 테스트 데이터에는 13개의 열이 있고, 4,277개의 행(샘플)이 있다.
- 테스트 데이터에는 결측치가 아닌 값이 54,484개 있다.
- 테스트 데이터 안에는 1,117개의 결측치가 있다.

In [14]:
# Count the missing values in each test column and print the columns from
# most to fewest missing.
print(test.isna().sum().sort_values(ascending=False))

FoodCourt       106
Spa             101
Cabin           100
ShoppingMall     98
Name             94
CryoSleep        93
VIP              93
Destination      92
Age              91
HomePlanet       87
RoomService      82
VRDeck           80
PassengerId       0
dtype: int64


- 13개의 열 중에서 FoodCourt 열 안에 106개로 가장 많은 결측치가 있다.

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test
# set's numeric columns.
test.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,4186.0,4195.0,4171.0,4179.0,4176.0,4197.0
mean,28.658146,219.266269,439.484296,177.295525,303.052443,310.710031
std,14.179072,607.011289,1527.663045,560.821123,1117.186015,1246.994742
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,53.0,78.0,33.0,50.0,36.0
max,79.0,11567.0,25273.0,8292.0,19844.0,22272.0


- 개수, 평균, 표준편차, 최솟값, 최댓값, 4분위수 등의 기초 통계량을 확인한다.

### Submission File

In [16]:
# Preview the first five rows of the sample submission file; as the last
# expression of the cell, the frame is rendered as the cell output.
submission.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


## 데이터의 개요

In [17]:
# Drop the passenger identifier from both frames. Rebinding the names
# (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: running it a second time no longer raises KeyError because
# the column has already been removed.
train = train.drop(columns=["PassengerId"], errors="ignore")
test = test.drop(columns=["PassengerId"], errors="ignore")

# Name of the label column to predict.
target = 'Transported'
# Every remaining training column except the target is a candidate feature.
features = [col for col in train.columns if col != target]
# NOTE(review): the data-loading cell sets random_state = 42 and this line
# overrides it with 12 — confirm which seed is intended and define it once.
random_state = 12


In [18]:
# Summary statistics for every training column except the last one,
# transposed to one row per described column and ordered by standard
# deviation (largest first). The Styler adds a GnBu background gradient plus
# in-cell bars for the 'max' (red) and 'mean' (green) columns.
(
    train.iloc[:, :-1]
    .describe()
    .T
    .sort_values(by='std', ascending=False)
    .style
    .background_gradient(cmap='GnBu')
    .bar(subset=['max'], color='#BB0000')
    .bar(subset=['mean'], color='green')
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
FoodCourt,8510.0,458.077203,1611.48924,0.0,0.0,0.0,76.0,29813.0
VRDeck,8505.0,304.854791,1145.717189,0.0,0.0,0.0,46.0,24133.0
Spa,8510.0,311.138778,1136.705535,0.0,0.0,0.0,59.0,22408.0
RoomService,8512.0,224.687617,666.717663,0.0,0.0,0.0,47.0,14327.0
ShoppingMall,8485.0,173.729169,604.696458,0.0,0.0,0.0,27.0,23492.0
Age,8514.0,28.82793,14.489021,0.0,19.0,27.0,38.0,79.0


## 결측치 분포


In [19]:
def _null_count_bar(null_counts):
    """Return a horizontal bar trace of per-column missing-value counts.

    Parameters
    ----------
    null_counts : pandas.DataFrame
        One row per data column (row labels are the column names); column
        ``0`` holds the number of missing values.

    Returns
    -------
    plotly.graph_objects.Bar
        One bar per row. The colour values are the row positions, sized
        from the frame itself rather than a hard-coded 12.
    """
    return go.Bar(
        x=null_counts[0],
        y=null_counts.index,
        orientation="h",
        marker=dict(
            color=list(range(len(null_counts))),
            line_color='rgb(0,0,0)',
            line_width=2,
            coloraxis="coloraxis",
        ),
    )


# Missing values per column, most affected columns first.
test_null = pd.DataFrame(test.isna().sum())
test_null = test_null.sort_values(by=0, ascending=False)
# Exclude the target column by name. Slicing off the last sorted row only
# removes it when it happens to be the single column with the fewest nulls.
train_null = pd.DataFrame(
    train.drop(columns=["Transported"], errors="ignore").isna().sum()
)
train_null = train_null.sort_values(by=0, ascending=False)

fig = make_subplots(rows=1,
                    cols=2,
                    column_titles=["Train Data", "Test Data"],
                    x_title="Missing Values")

# Train counts go in the left panel, test counts in the right panel.
fig.add_trace(_null_count_bar(train_null), 1, 1)
fig.add_trace(_null_count_bar(test_null), 1, 2)

# update_layout returns the figure, so as the last expression of the cell it
# is also what gets rendered.
fig.update_layout(showlegend=False, title_text="Column wise Null Value Distribution", title_x=0.5)

In [20]:
def _row_null_share(frame):
    """Return the percentage of rows for each per-row missing-value count.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose rows are inspected for missing values.

    Returns
    -------
    pandas.DataFrame
        Column ``no`` is the number of missing values in a row and column
        ``count`` is the percentage of rows with that many missing values,
        ordered from the most to the least frequent.
    """
    share = frame.isna().sum(axis=1).value_counts(normalize=True) * 100
    return pd.DataFrame({"no": share.index, "count": share.to_numpy()})


def _row_null_bar(row_share):
    """Return a bar trace for a frame produced by ``_row_null_share``.

    The colour values are the row positions, sized from the frame itself
    rather than a hard-coded 4, because the number of distinct per-row
    missing counts depends on the data.
    """
    return go.Bar(
        x=row_share["no"],
        y=row_share["count"],
        marker=dict(
            color=list(range(len(row_share))),
            line_color='rgb(0,0,0)',
            line_width=3,
            coloraxis="coloraxis",
        ),
    )


missing_train_row = _row_null_share(train)
missing_test_row = _row_null_share(test)

fig = make_subplots(rows=1,
                    cols=2,
                    column_titles=["Train Data", "Test Data"],
                    x_title="Missing Values")

# Train shares go in the left panel, test shares in the right panel.
fig.add_trace(_row_null_bar(missing_train_row), 1, 1)
fig.add_trace(_row_null_bar(missing_test_row), 1, 2)

# update_layout returns the figure, so as the last expression of the cell it
# is also what gets rendered.
fig.update_layout(showlegend=False, title_text="Row wise Null Value Distribution", title_x=0.5)

# Step 3. 데이터 전처리
- Feature Engineering
- 머신러닝 모형을 돌리기 위해 표준화, 원핫-인코딩
- 파생변수 (도출 변수)
    + 왜 이 변수를 만들었는지에 대한 여러분들의 설명 필요

# Step 4. 머신러닝 모형 개발
- 모형에 대한 설명 필요
- 모형을 1~2개만 사용하기
- 교차 검증
- 하이퍼파라미터 튜닝

# Step 5. 모형 평가
- 훈련 데이터 + 검증데이터 (테스트 데이터는 건드리지 않는다)
- 정확도 비교
- 혼동행렬 (Confusion Matrix) 설명???
    - 잘 찾아서 넣기
  

# Step 6. 제출
- 제출 양식은 강사님께서 만들어주신다고 함.

# Reference
- 다른 사람의 code 설명을 쭉 따라쳤을 때,
- 노트북 표절 방지 위해, 참조한 코드는 반드시 링크 걸어둘 것.
- 저자 이름, 글 제목, 링크 주소

# 마감일
- 4월 12일 17시 40분
- 제출 형태
    + Leaderboard 랭킹 사진 캡쳐
    + 고용노동부 보고 양식(다음주에 확인해서 알려주신다고 함)
