# Kaggle Spaceship Titanic 예측하기

## 0. 라이브러리 불러오기

In [None]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

## 1. 데이터 준비 및 데이터 살펴보기

In [None]:
train=pd.read_csv('train.csv')

In [None]:
train

In [None]:
train.columns

### 1-1. 변수해석

#### 1-1-1. 타겟 변수 살펴보기

In [None]:
train['Transported'].value_counts()

In [None]:
train.groupby(['Transported','VIP']).size()

In [None]:
train.groupby(['HomePlanet','VIP']).size() ###Homeplanet이 earth이면 vip가 아니다!!

In [None]:
train.groupby(['Transported','HomePlanet']).size()

#### 1-1-2. 예측 변수들 살펴보기

##### 1-1-2-2. Groupby로 살펴보기

### 1-2. 변수의 결측치/ 중복값 살펴보기

In [None]:
train.isnull().sum().sum() #space 데이터의 총 결측치 수

In [None]:
train.isnull().sum() #space 데이터의 항목별 결측치 분포

In [None]:
train.info()

In [None]:
train.nunique()

In [None]:
train.duplicated().sum()  #train에 중복 데이터는 없음

### 1-3. 각 변수들의 처리에 대한 고민 및 살펴보기(하나씩 해보자!)

In [None]:
train

#### 0) PassengerId를 그룹별로 나눠볼 수도 있음. (데이터의 행이 줄어든다 => 유의미한 결과를 도출할 수 있을까?)

In [None]:
# Map every full PassengerId to its first four characters (the group prefix).
dic_id = {passenger: passenger[:4] for passenger in train['PassengerId']}

In [None]:
dic_id

In [None]:
train['pid'] = train['PassengerId'].map(dic_id)

In [None]:
train

In [None]:
train['pid'].nunique()

#### 1)HomePlanet 거주행성별 transported를 살펴본다?

In [None]:
train.groupby('HomePlanet')['Transported'].mean()

#### 2)CryoSleep 냉동수면상태에서의 transported를 살펴본다? 

In [None]:
train.groupby('CryoSleep')['Transported'].mean()

#### 3) deck, num, side 별 transported와 비교

##### 3-1) Cabin에서 deck과 num과 side를 train에 새항목들로 추가하기

In [None]:
cab=train['Cabin'].str.split('/', expand=True) #cabin항목을 3개 항목으로 나누기

In [None]:
cab.columns= (['deck','num','side'])

In [None]:
train = pd.concat([train,cab], axis=1) #나눈 3항목을 기존 train에 추가

In [None]:
train

##### 3-2) deck num side 항목들 살펴보기

In [None]:
# Print the mean of Transported for each value of the three cabin-derived columns.
elements = ['deck', 'num', 'side']
for ele in elements:
    transported_rate = train.groupby(ele)['Transported'].mean()
    print(transported_rate)

In [None]:
train['num'].value_counts()

#### 4) Destination 에 따른 transported의 상관도 살펴보기

In [None]:
train['Destination'].value_counts()

In [None]:
train.groupby('Destination')['Transported'].mean()

#### 5) 나이 분포를 살펴보고 그룹별 나이 분포에 따른 transported 살펴보기?

In [None]:
train['Age'].value_counts()

In [None]:
plt.figure(figsize=(10,4))

# Histogram
sns.histplot(data=train, x='Age', hue='Transported', binwidth=1, kde=True)

# Aesthetics
plt.title('Age distribution')
plt.xlabel('Age (years)')

#### 6) vip 여부에 따른 transported 살펴보기?

In [None]:
train.groupby('VIP')['Transported'].mean()

In [None]:
train['VIP'].value_counts()

In [None]:
train['VIP'].isnull().sum()

#### 7) 승객이 Spaceship Titanic 의 다양한 고급 편의 시설 각각에 대해 청구한 금액 // 앞의 vip 여부와 비교 분석

In [None]:
# Compare average spending per amenity between VIP and non-VIP passengers.
# Columns are selected with a list (tuple-style selection on a GroupBy is no
# longer accepted by pandas) and aggregated so the cell shows an actual table.
train.groupby('VIP')[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].mean()

In [None]:
train['fee'] =train['RoomService']+train['FoodCourt']+train['ShoppingMall']+train['Spa']+train['VRDeck']

In [None]:
train

In [None]:
train.groupby('VIP')['fee'].describe()

#### 8) Name 은 큰 연관없을듯? (상관계수 정도 보고 의미없으면 예측 변수에서 제외시키자!)

## 2. 데이터 전처리

### 2-1. 각 항목 결측치에 대한 전처리

In [None]:
train.isnull().sum()

#### 2-1-1) HomePlanet의 null값 채우기

In [None]:
train_HP_n=train[train['HomePlanet'].isnull()]

In [None]:
train_HP_n

In [None]:
train['pid']

##### 추측1) 같은 Passenger Id그룹은 같은 homeplanet을 갖는다!?

In [None]:
train['pid']

In [None]:
home_group =train.groupby(['pid','HomePlanet']).size().reset_index()

In [None]:
home_group

In [None]:
home_group[home_group['pid'].duplicated()] #그룹이 중복된 값이 없다

In [None]:
home_group[0].sum()

In [None]:
train[train['HomePlanet'].isnull()]

In [None]:
home_group

In [None]:
# pid -> HomePlanet lookup built from the grouped frame; if a pid appears more
# than once, the later row wins.
home_dict = dict(zip(home_group['pid'], home_group['HomePlanet']))

In [None]:
home_dict

In [None]:
# Fill missing HomePlanet from the passenger's group (pid). The result is
# assigned back: a chained `fillna(..., inplace=True)` on a column selection
# operates on a temporary under pandas Copy-on-Write and would not update `train`.
train['HomePlanet'] = train['HomePlanet'].fillna(train['pid'].map(home_dict))

In [None]:
train['HomePlanet'].isnull().sum()

##### 추측2) last name이 같으면 같은 homeplanet을 갖는다!?

In [None]:
train['lastname'] = train['Name'].str.split(' ').str[-1]

In [None]:
train

In [None]:
train['lastname'].value_counts()

In [None]:
name_group =train.groupby(['lastname','HomePlanet']).size().reset_index()

In [None]:
name_group[name_group['lastname'].duplicated()] #성이 중복되는 경우는 없다

In [None]:
name_group

In [None]:
# lastname -> HomePlanet lookup built from the grouped frame; if a last name
# appears more than once, the later row wins.
name_dict = dict(zip(name_group['lastname'], name_group['HomePlanet']))

In [None]:
name_dict

In [None]:
# Fill the remaining missing HomePlanet values from passengers sharing the same
# last name. Assigned back instead of a chained inplace fillna, which is not
# guaranteed to modify `train`.
train['HomePlanet'] = train['HomePlanet'].fillna(train['lastname'].map(name_dict))

In [None]:
train['HomePlanet'].isnull().sum()

In [None]:
train[train['HomePlanet'].isnull()]

##### 추측3) HomePlanet이 같으면 Destination이 같지 않을까?

In [None]:
train.groupby(['Destination','HomePlanet']).size()

In [None]:
train['Destination']

In [None]:
home_dest= {'TRAPPIST-1e' :'Earth', '55 Cancri e' :'Europa', 'PSO J318.5-22' :'Earth'}

In [None]:
# Last resort for HomePlanet: use the planet associated with the passenger's
# destination in `home_dest`. Assigned back rather than chained inplace fillna.
train['HomePlanet'] = train['HomePlanet'].fillna(train['Destination'].map(home_dest))

In [None]:
train['HomePlanet'].isnull().sum()

#### 2-1-2) CryoSleep  null값 채우기

In [None]:
train[train['CryoSleep']== True]['fee'].mean() #자고 있는 사람들의 평균이 0 (소모 비용이 없음)

In [None]:
train[train['CryoSleep'].isnull()]['fee']

In [None]:
# Fill only the MISSING CryoSleep values: a passenger with any spending
# (fee > 0) is treated as awake, everyone else as asleep. Observed values are
# kept as-is; the previous np.where call overwrote the whole column.
inferred_sleep = ~(train['fee'] > 0)
train['CryoSleep'] = (
    train['CryoSleep']
    .where(train['CryoSleep'].notna(), inferred_sleep)
    .astype(bool)
)

In [None]:
train['CryoSleep'].isnull().sum()

#### 2-1-3) Cabin  null값 채우기 (deck,side)

In [None]:
train['Cabin'].str.split('/').str[0]

In [None]:
train[train['Cabin'].isnull()]

In [None]:
train.groupby(['HomePlanet','deck']).size()

In [None]:
# Europa는 B,  Earth는 G , Mars는 F로 !!

In [None]:
# Fill missing deck from HomePlanet (Earth -> G, Europa -> B, Mars -> F).
# Assigned back rather than chained inplace fillna.
train['deck'] = train['deck'].fillna(
    train['HomePlanet'].map({'Earth': 'G', 'Europa': 'B', 'Mars': 'F'})
)

In [None]:
train['deck'].isnull().sum()

In [None]:
side_group = train.groupby(['pid','side']).size().reset_index()

In [None]:
side_group[side_group['pid'].duplicated()]

In [None]:
train['side'].value_counts()

In [None]:
train['pid'].value_counts()

In [None]:
train['side'].isnull().sum()

In [None]:
train[train['side'].isnull()]

In [None]:
side_group =train.groupby(['pid','side']).size().reset_index()

In [None]:
side_group

In [None]:
# pid -> side lookup built from the grouped frame; if a pid appears more than
# once, the later row wins.
side_dict = dict(zip(side_group['pid'], side_group['side']))

In [None]:
side_dict

In [None]:
# Fill missing side from the passenger's group (pid). Assigned back rather than
# chained inplace fillna, which is not guaranteed to modify `train`.
train['side'] = train['side'].fillna(train['pid'].map(side_dict))

In [None]:
train['side'].isnull().sum()

In [None]:
train[train['side'].isnull()]

In [None]:
train['side'].value_counts()

In [None]:
train.isnull().sum()

In [None]:
# Any side still missing falls back to 'S'. Assigned back rather than chained
# inplace fillna.
train['side'] = train['side'].fillna('S')

In [None]:
train.isnull().sum()

#### 2-1-4) Destination null값 채우기

In [None]:
dest_group =train.groupby(['pid','Destination']).size().reset_index()

In [None]:
dest_group = dest_group.sort_values(0, ascending = False) #정렬 한 후

In [None]:
# Keep only the first row per pid. When the frame is sorted by count in
# descending order beforehand, that row is the most frequent destination.
# Without `subset`, whole rows are compared and nothing is removed.
dest_group = dest_group.drop_duplicates(subset='pid')

In [None]:
# pid -> Destination lookup built from dest_group; if a pid appears more than
# once, the later row wins.
dest_dict = dict(zip(dest_group['pid'], dest_group['Destination']))

In [None]:
# Fill missing Destination from the passenger's group. dest_dict is keyed by
# pid, so the lookup must go through train['pid']; mapping the Destination
# column itself matched no key and filled nothing.
train['Destination'] = train['Destination'].fillna(train['pid'].map(dest_dict))

In [None]:
train['Destination'].isnull().sum()

In [None]:
train.groupby(['HomePlanet','Destination']).size()

In [None]:
# Remaining missing destinations fall back to 'TRAPPIST-1e'. Assigned back
# rather than chained inplace fillna.
train['Destination'] = train['Destination'].fillna('TRAPPIST-1e')

In [None]:
train['Destination'].isnull().sum()

#### 2-1-5) Age null값 채우기

In [28]:
train.groupby('Transported')['Age'].mean() #별 차이안남>> 중앙값 넣자!

Transported
False    29.922858
True     27.748834
Name: Age, dtype: float64

In [29]:
# Fill missing ages with the column median. Assigned back rather than chained
# inplace fillna, which is not guaranteed to modify `train`.
train['Age'] = train['Age'].fillna(train['Age'].median())

In [31]:
train['Age'].median()

27.0

In [30]:
train.isnull().sum()

PassengerId       0
HomePlanet        0
CryoSleep         0
Cabin           199
Destination       0
Age               0
VIP               0
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            200
Transported       0
pid               0
fee               0
lastname        200
deck              0
num             199
side              0
dtype: int64

#### 2-1-6) VIP null값 채우기

In [None]:
train['VIP'].value_counts(dropna=False)

In [None]:
train[train['VIP']==True]['HomePlanet'].value_counts() #Earth가 없다!! 하지만 큰 의미없음.. 그냥 False로 채우는것이 best

In [None]:
# Missing VIP flags are set to False. Assigned back rather than chained
# inplace fillna.
train['VIP'] = train['VIP'].fillna(False)

In [None]:
train['VIP'].isnull().sum()

#### 2-1-7) fee null값 채우기

In [None]:
fee_list =['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

In [None]:
train['RoomService'].median()

In [None]:
# Fill each spending column's missing values with that column's median.
# Assigned back per column: chained inplace fillna on `train[col]` is not
# guaranteed to modify `train`.
for col in fee_list:
    train[col] = train[col].fillna(train[col].median())

In [None]:
train['fee'].isnull().sum()

In [None]:
train['fee'] =train['RoomService']+train['FoodCourt']+train['ShoppingMall']+train['Spa']+train['VRDeck']

In [None]:
train['fee'].describe()

-----------------

In [None]:
train.isnull().sum()

In [None]:
del_col = ['PassengerId' , 'Cabin' , 'Name' , 'lastname', 'num']

In [None]:
train = train.drop(del_col, axis=1)

In [None]:
# 'num' is already listed in del_col and dropped by the previous cell, so a
# plain drop raises KeyError; errors='ignore' makes this cell safe to re-run.
train.drop(columns=['num'], errors='ignore', inplace=True)

In [None]:
train.isnull().sum()

In [None]:
train.head()

### 2-2. object 항목들을 수치화하기

In [None]:
train.info()

In [None]:
# Replace each categorical column with integer codes from a LabelEncoder
# fitted on that column's unique values.
col_list = ['HomePlanet', 'Destination', 'pid', 'deck', 'side']
for col in col_list:
    label = LabelEncoder().fit(train[col].unique())
    train[col] = label.transform(train[col])

In [None]:
train

### 2-3-1) 상관 계수 살펴보기 & 해석 (!!!!! 전부 수치화 이후에 다시 봐야함 !!!)

In [None]:
corr = train.corr().round(2)

In [None]:
plt.figure(figsize=(20,20))
sns.heatmap(corr, annot=True)

## 3. 데이터 모델링 및 학습, 모델평가

### 3-1) 모델 선정 및 저장

In [None]:
rfc = RandomForestClassifier()

In [None]:
X = train.drop(['Transported'],axis=1)

In [None]:
Y = train['Transported']

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2
                                                    , random_state=0, stratify=Y)

In [None]:
rfc.fit(x_train, y_train)

In [None]:
pre_train = rfc.predict(x_train)

In [None]:
# Error rate on the training split. Arguments follow the documented
# (y_true, y_pred) order, and the boolean labels are cast to int so the
# absolute difference is well defined (0 = correct, 1 = wrong).
mean_absolute_error(y_train.astype(int), pre_train.astype(int))

In [None]:
rfc.score(x_train, y_train)

In [None]:
pre_test = rfc.predict(x_test)

In [None]:
# Error rate on the held-out split. Arguments follow the documented
# (y_true, y_pred) order, and the boolean labels are cast to int so the
# absolute difference is well defined (0 = correct, 1 = wrong).
mean_absolute_error(y_test.astype(int), pre_test.astype(int))

In [None]:
rfc.score(x_test, y_test) #모델 정확도

In [None]:
with open('rfc_model.pkl', 'wb') as f :
    pickle.dump(rfc, f) #모델 저장!

### 3-2) 라벨 저장

In [35]:
# Fit one LabelEncoder per categorical column, encode the column in place and
# pickle the fitted encoder as 'label_<column>.pkl'.
# NOTE(review): the encoders are only useful on raw data if this cell runs
# while the columns still hold their original string values — confirm the
# execution order.
col_list = ['HomePlanet', 'Destination', 'pid', 'deck', 'side']
for col in col_list:
    label = LabelEncoder().fit(train[col].unique())
    train[col] = label.transform(train[col])
    filename = f'label_{col}.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(label, f)

### 3-3) 코드 정리

In [1]:
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
train=pd.read_csv('train.csv')

In [3]:
# Map every full PassengerId to its first four characters (the group prefix).
dic_id = {passenger: passenger[:4] for passenger in train['PassengerId']}

train['pid'] = train['PassengerId'].map(dic_id)  # create the pid column

In [4]:
dest_group =train.groupby(['pid','Destination']).size().reset_index()

In [5]:
dest_group = dest_group.sort_values(0, ascending = False) #정렬 한 후

In [6]:
# Keep only the first row per pid. When the frame is sorted by count in
# descending order beforehand, that row is the most frequent destination.
# Without `subset`, whole rows are compared and nothing is removed.
dest_group = dest_group.drop_duplicates(subset='pid')

In [7]:
# pid -> Destination lookup built from dest_group; if a pid appears more than
# once, the later row wins.
dest_dict = dict(zip(dest_group['pid'], dest_group['Destination']))

In [8]:
# First pass on missing Destination: assume members of the same group share a
# destination. dest_dict is keyed by pid, so the lookup must go through
# train['pid']; mapping the Destination column itself filled nothing.
train['Destination'] = train['Destination'].fillna(train['pid'].map(dest_dict))

In [9]:
# Remaining missing destinations fall back to the most frequent value,
# 'TRAPPIST-1e'. Assigned back rather than chained inplace fillna.
train['Destination'] = train['Destination'].fillna('TRAPPIST-1e')

In [10]:
train['fee'] =train['RoomService']+train['FoodCourt']+train['ShoppingMall']+train['Spa']+train['VRDeck'] #돈쓰는 항목들 5개의 총합을 내어 fee라는 변수 생성!

In [12]:
# Passengers in cryosleep should have no spending, so a MISSING CryoSleep is
# inferred as False when fee > 0 and True otherwise. Observed values are kept;
# the previous np.where call overwrote the whole column.
inferred_sleep = ~(train['fee'] > 0)
train['CryoSleep'] = (
    train['CryoSleep']
    .where(train['CryoSleep'].notna(), inferred_sleep)
    .astype(bool)
)

In [14]:
# Count (pid, HomePlanet) pairs and build a pid -> HomePlanet lookup; if a pid
# appears more than once, the later row wins.
home_group = train.groupby(['pid', 'HomePlanet']).size().reset_index()
home_dict = dict(zip(home_group['pid'], home_group['HomePlanet']))

# HomePlanet null handling, step 1: fill from the passenger's group. Assigned
# back because a chained inplace fillna is not guaranteed to modify `train`.
train['HomePlanet'] = train['HomePlanet'].fillna(train['pid'].map(home_dict))

# Create the lastname column: the last whitespace-separated token of Name.
train['lastname'] = train['Name'].str.split(' ').str[-1]

In [15]:
# HomePlanet null handling, step 2: passengers with the same last name are
# assumed to share a home planet.
name_group = train.groupby(['lastname', 'HomePlanet']).size().reset_index()
name_dict = dict(zip(name_group['lastname'], name_group['HomePlanet']))

# Apply the lookup; previously name_dict was built but never used, so this
# step had no effect.
train['HomePlanet'] = train['HomePlanet'].fillna(train['lastname'].map(name_dict))

In [16]:
# HomePlanet null handling, final step: fall back to the planet associated
# with the passenger's destination.
home_dest = {'TRAPPIST-1e': 'Earth', '55 Cancri e': 'Europa', 'PSO J318.5-22': 'Earth'}

# Assigned back rather than chained inplace fillna.
train['HomePlanet'] = train['HomePlanet'].fillna(train['Destination'].map(home_dest))

In [17]:
# Split Cabin on '/' into three parts, name them deck / num / side, and append
# them to train as new columns.
cab = train['Cabin'].str.split('/', expand=True)
cab.columns = ['deck', 'num', 'side']
train = pd.concat([train, cab], axis=1)

In [18]:
# Fill missing deck from HomePlanet (Earth -> G, Europa -> B, Mars -> F).
# Assigned back rather than chained inplace fillna.
train['deck'] = train['deck'].fillna(
    train['HomePlanet'].map({'Earth': 'G', 'Europa': 'B', 'Mars': 'F'})
)

In [19]:
side_group =train.groupby(['pid','side']).size().reset_index()

In [20]:
# pid -> side lookup built from the grouped frame; if a pid appears more than
# once, the later row wins.
side_dict = dict(zip(side_group['pid'], side_group['side']))

In [21]:
# First pass on missing side: assume members of the same group (pid) are on
# the same side. Assigned back rather than chained inplace fillna.
train['side'] = train['side'].fillna(train['pid'].map(side_dict))

In [22]:
# Final pass on side: whatever is still missing becomes 'S'. Assigned back
# rather than chained inplace fillna.
train['side'] = train['side'].fillna('S')

In [23]:
# Missing VIP flags are set to False, the most frequent value. Assigned back
# rather than chained inplace fillna.
train['VIP'] = train['VIP'].fillna(False)

In [24]:
fee_list =['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

In [25]:
# The median of all five spending columns is 0, so missing amounts are set to
# 0. Assigned back per column: chained inplace fillna on `train[col]` is not
# guaranteed to modify `train`.
for col in fee_list:
    train[col] = train[col].fillna(0)

In [26]:
train['fee'] =train['RoomService']+train['FoodCourt']+train['ShoppingMall']+train['Spa']+train['VRDeck'] #돈쓰는 항목들 5개의 총합을 내어 fee라는 변수 생성!

In [32]:
# Fill every missing Age with 27, the median of the Age column. Assigned back
# rather than chained inplace fillna.
train['Age'] = train['Age'].fillna(27)

In [36]:
del_col = ['PassengerId' , 'Cabin' , 'Name' , 'lastname', 'num', 'pid']

In [37]:
train = train.drop(del_col, axis=1)

In [None]:
# `model` was never defined in this notebook; load the classifier that was
# pickled as 'rfc_model.pkl' (a file this notebook writes itself — do not
# unpickle files from untrusted sources).
with open('rfc_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Predict on the features only: the target column must not be fed to the model.
# NOTE(review): the remaining columns must match, in name, order and encoding,
# the features the model was fitted on — confirm before relying on `pred`.
features = train.drop(columns=['Transported'], errors='ignore')
pred = model.predict(features)

## 4. 실제 데이터로 모델 검정 및 예측

In [None]:
  # NOTE(review): unfinished fragment — `x` is not defined anywhere in the
  # visible notebook and this line is indented without an enclosing block.
  # Presumably the start of encoding the categorical columns for prediction;
  # TODO confirm and complete.
  train[col]  = label.fit_transform(x)