## Setup

In [1]:
import sys

# Extend the import search path with the parent directory (presumably where
# the `src` package lives — verify) and the shared inputs directory.
# The membership check keeps the cell idempotent: re-running it no longer
# appends duplicate entries to sys.path.
for extra_path in ("..", "../../inputs"):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [2]:
import logging

# Configure the root logger: INFO and above, each record prefixed with a
# timestamp, the level name and the originating module.
# Note: basicConfig() is a no-op if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
)

# Logger used for this notebook's own messages.
log = logging.getLogger(__name__)

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

from src.utils import df_stats

## 目的

宇宙船タイタニック号が時空の異変に遭遇した際、乗客が異次元に転送されたかどうかを予測する競技です。予測には、船内のコンピュータシステムから復元された個人記録が使用されます。

## 評価

提出物は、分類精度（予測したラベルの正解率）で評価されます。


## Training Data

In [4]:
%%time

# Load the training split into a DataFrame.
train = pd.read_csv("../../inputs/train.csv")
# NOTE(review): disabled alternative loader, presumably for a feather copy of
# the same data — confirm ../../inputs/train.f exists before re-enabling.
# train = pd.read_feather("../../inputs/train.f")

CPU times: user 17.2 ms, sys: 5.08 ms, total: 22.3 ms
Wall time: 21.3 ms


In [5]:
# Number of rows in the training frame.
train.shape[0]

8693

In [6]:
# Column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB



- train.csv - 約3分の2 (~8700) の乗客の個人記録で、学習データとして使用される。
    - PassengerId - 各乗客のユニークなID。各IDは gggg_pp の形式をとり、gggg はその乗客が属するグループ、pp はグループ内での番号を表す。同じグループの人は家族であることが多いが、必ずしもそうではない。
    - HomePlanet - 乗客が出発した惑星、通常は定住している惑星。
    - CryoSleep - 乗客が航海中、コールドスリープ（仮死状態）に入ることを選択したかどうかを示す。コールドスリープ中の乗客は自分のキャビンから出られない。
    - Cabin - 乗客が滞在しているキャビンの番号。deck/num/side の形式で、side は P（Port、左舷）か S（Starboard、右舷）のどちらかである。
    - Destination - 乗客が下船する予定の惑星。
    - Age - 乗客の年齢。
    - VIP - 乗客が航海中に特別なVIPサービスを受けたかどうか。
    - RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - 宇宙船タイタニック号の各種豪華設備で、乗客に請求された金額。
    - Name - 乗客の名と姓。
    - Transported - 乗客が異次元に転送されたかどうか。これが目的変数、つまり予測対象の列である。



In [7]:
# Display the training frame (pandas abbreviates long frames to the first and last rows).
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [37]:
# Rows of the training frame whose Cabin value is missing.
train.loc[train["Cabin"].isna()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
15,0012_01,Earth,False,,TRAPPIST-1e,31.0,False,32.0,0.0,876.0,0.0,0.0,Justie Pooles,False
93,0101_01,Mars,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,0.0,0.0,Book Trad,True
103,0110_01,Europa,False,,TRAPPIST-1e,32.0,False,0.0,410.0,6.0,3929.0,764.0,Graviph Aloubtled,False
222,0239_01,Mars,False,,TRAPPIST-1e,37.0,False,637.0,0.0,0.0,92.0,319.0,Diedow Resty,False
227,0244_01,Mars,True,,TRAPPIST-1e,43.0,False,0.0,0.0,0.0,0.0,0.0,Froos Sad,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,8772_02,Europa,False,,55 Cancri e,53.0,False,0.0,1127.0,0.0,3939.0,,Naosura Motled,False
8475,9057_01,Europa,False,,55 Cancri e,36.0,True,132.0,3479.0,0.0,3786.0,0.0,Coxan Statch,False
8485,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,0.0,0.0,Bath Brakeng,True
8509,9081_03,Earth,True,,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Beula Clemondsey,False


In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [10]:
# Per-column overview from the project helper src.utils.df_stats. Per the saved
# output it reports: column name, unique count, mode, mode frequency, mode share,
# missing share, and dtype (shares look like percentages — TODO confirm in the helper).
# NOTE(review): in the saved output the Cabin row shows the same value for mode
# share and missing share (2.289198) although the mode occurs only 8 times —
# it looks like the helper's mode share counts NaN as a value; verify in df_stats.
df_stats(train)

Unnamed: 0,カラム名,ユニーク値数,最頻値,最頻値の出現回数,最頻値の割合,欠損値の割合,タイプ
0,PassengerId,8693,0001_01,1,0.011504,0.0,object
1,HomePlanet,3,Earth,4602,52.939146,2.312205,object
2,CryoSleep,2,False,5439,62.567583,2.496261,object
3,Cabin,6560,G/734/S,8,2.289198,2.289198,object
4,Destination,3,TRAPPIST-1e,5915,68.043253,2.093639,object
5,Age,80,24.0,324,3.727137,2.059128,float64
6,VIP,2,False,8291,95.37559,2.335212,object
7,RoomService,1273,0.0,5577,64.155067,2.082135,float64
8,FoodCourt,1507,0.0,5456,62.763143,2.105142,float64
9,ShoppingMall,1115,0.0,5587,64.270102,2.39273,float64


In [11]:
# Frequency of each home planet; value_counts() drops NaN by default, so
# rows with a missing HomePlanet are not counted here.
train["HomePlanet"].value_counts()

Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64

In [29]:
# Frequency of the CryoSleep flag; value_counts() drops NaN by default, so
# rows with a missing CryoSleep are not counted here.
train["CryoSleep"].value_counts()

False    5439
True     3037
Name: CryoSleep, dtype: int64

In [12]:
# Frequency of each destination; value_counts() drops NaN by default, so
# rows with a missing Destination are not counted here.
train["Destination"].value_counts()

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64

In [13]:
# Cabin deck is the first "/"-separated field of Cabin (deck/num/side).
# The .str accessor propagates NaN instead of turning it into the literal
# string "nan", and dropna=False keeps missing cabins as their own row.
train["Cabin"].str.split("/").str[0].value_counts(dropna=False)

F      2794
G      2559
E       876
B       779
C       747
D       478
A       256
nan     199
T         5
Name: Cabin, dtype: int64

In [14]:
# Cabin side is the last "/"-separated field of Cabin (deck/num/side).
# The .str accessor propagates NaN instead of turning it into the literal
# string "nan", and dropna=False keeps missing cabins as their own row.
train["Cabin"].str.split("/").str[-1].value_counts(dropna=False)

S      4288
P      4206
nan     199
Name: Cabin, dtype: int64

## Test Data

In [15]:
# Load the test split into a DataFrame.
test = pd.read_csv("../../inputs/test.csv")

In [16]:
# Column names, non-null counts and dtypes of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


- test.csv - 残りの3分の1（~4300人）の乗客の個人記録で、テストデータとして使用されます。あなたのタスクは、このセットの乗客のTransportedの値を予測することです。


In [17]:
# Display the test frame (pandas abbreviates long frames to the first and last rows).
test

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [18]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
test.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,4186.0,4195.0,4171.0,4179.0,4176.0,4197.0
mean,28.658146,219.266269,439.484296,177.295525,303.052443,310.710031
std,14.179072,607.011289,1527.663045,560.821123,1117.186015,1246.994742
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,53.0,78.0,33.0,50.0,36.0
max,79.0,11567.0,25273.0,8292.0,19844.0,22272.0


In [19]:
# Per-column overview from the project helper src.utils.df_stats. Per the saved
# output it reports: column name, unique count, mode, mode frequency, mode share,
# missing share, and dtype (shares look like percentages — TODO confirm in the helper).
# NOTE(review): in the saved output the Cabin row shows the same value for mode
# share and missing share (2.338087) although the mode occurs only 8 times —
# it looks like the helper's mode share counts NaN as a value; verify in df_stats.
df_stats(test)

Unnamed: 0,カラム名,ユニーク値数,最頻値,最頻値の出現回数,最頻値の割合,欠損値の割合,タイプ
0,PassengerId,4277,0013_01,1,0.023381,0.0,object
1,HomePlanet,3,Earth,2263,52.910919,2.034136,object
2,CryoSleep,2,False,2640,61.725509,2.174421,object
3,Cabin,3265,G/160/P,8,2.338087,2.338087,object
4,Destination,3,TRAPPIST-1e,2956,69.113865,2.15104,object
5,Age,79,18.0,176,4.115034,2.12766,float64
6,VIP,2,False,4110,96.095394,2.174421,object
7,RoomService,842,0.0,2726,63.736264,1.917232,float64
8,FoodCourt,902,0.0,2690,62.894552,2.478373,float64
9,ShoppingMall,715,0.0,2744,64.157119,2.291326,float64


In [20]:
# Frequency of each home planet; value_counts() drops NaN by default, so
# rows with a missing HomePlanet are not counted here.
test["HomePlanet"].value_counts()

Earth     2263
Europa    1002
Mars       925
Name: HomePlanet, dtype: int64

In [21]:
# Frequency of each destination; value_counts() drops NaN by default, so
# rows with a missing Destination are not counted here.
test["Destination"].value_counts()

TRAPPIST-1e      2956
55 Cancri e       841
PSO J318.5-22     388
Name: Destination, dtype: int64

In [33]:
# Cabin deck is the first "/"-separated field of Cabin (deck/num/side).
# The .str accessor propagates NaN instead of turning it into the literal
# string "nan", and dropna=False keeps missing cabins as their own row.
test["Cabin"].str.split("/").str[0].value_counts(dropna=False)

F      1445
G      1222
E       447
B       362
C       355
D       242
nan     100
A        98
T         6
Name: Cabin, dtype: int64

In [35]:
# Cabin side is the last "/"-separated field of Cabin (deck/num/side).
# The .str accessor propagates NaN, so no fillna("//") placeholder is needed,
# and dropna=False reports missing cabins as NaN rather than as an
# empty-string bucket.
test["Cabin"].str.split("/").str[-1].value_counts(dropna=False)

S    2093
P    2084
      100
Name: Cabin, dtype: int64

In [39]:
# Rows of the test frame whose Cabin value is missing.
test.loc[test["Cabin"].isna()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
18,0047_02,Europa,False,,TRAPPIST-1e,29.0,False,0.0,7708.0,243.0,569.0,343.0,Muons Prucerod
99,0227_01,Earth,,,TRAPPIST-1e,22.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Buckentry
135,0293_01,Europa,True,,TRAPPIST-1e,47.0,False,0.0,0.0,0.0,0.0,0.0,Tauxon Suptibler
147,0323_01,Earth,True,,55 Cancri e,18.0,False,0.0,0.0,0.0,0.0,0.0,Joyn Gaineyerson
180,0364_02,Mars,False,,TRAPPIST-1e,37.0,False,731.0,0.0,517.0,50.0,0.0,Anakes Chité
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4209,9138_01,Europa,,,TRAPPIST-1e,41.0,False,0.0,1998.0,0.0,1023.0,867.0,Misamak Trupistic
4248,9223_01,Mars,True,,TRAPPIST-1e,24.0,False,0.0,0.0,,0.0,0.0,Weessh Sun
4249,9223_02,Mars,True,,TRAPPIST-1e,17.0,False,0.0,0.0,0.0,0.0,0.0,Perit Sun
4258,9238_05,Earth,True,,TRAPPIST-1e,14.0,False,0.0,0.0,0.0,0.0,0.0,Caseye Emenez


## Sample Submission

In [24]:
# Load the sample submission file into a DataFrame.
sample_sub = pd.read_csv("../../inputs/sample_submission.csv")

In [25]:
# Column names, non-null counts and dtypes of the sample submission.
sample_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PassengerId  4277 non-null   object
 1   Transported  4277 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 37.7+ KB


- sample_submission.csv - 正しい形式の提出ファイルのサンプル。
    - PassengerId - テストセット内の各乗客の ID。
    - Transported - 目的変数。各乗客について、True または False のいずれかを予測する。


In [26]:
# Display the sample submission (pandas abbreviates long frames to the first and last rows).
sample_sub

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False
