# Code for estimating survival on Titanic (Kaggle Competition)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv("~/samurai/kaggle/titanic/data/train.csv")
test = pd.read_csv("~/samurai/kaggle/titanic/data/test.csv")

In [3]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
test = test.drop(columns=("Embarked"))

In [6]:
test = test.drop(columns=("Cabin"))

In [7]:
test = test.drop(columns=("Ticket"))

In [8]:
test = test.drop

## Pclass

In [9]:
pclass_table = pd.crosstab(
    train["Survived"]==1,
    train["Pclass"],
    margins=True,
    normalize="columns" 
)

In [10]:
pclass_table

Pclass,1,2,3,All
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.37037,0.527174,0.757637,0.616162
True,0.62963,0.472826,0.242363,0.383838


In [11]:
pclass_table[1][1]

0.6296296296296297

In [12]:
# Given the Pclass,
# Pclass(1) = 62.9% Chance of survival
# Pclass(2) = 47.3% Chance of survival
# Pclass(3) = 24.2% Chance of survival

## Sex

In [13]:
sex_table = pd.crosstab(
    train["Survived"]==1,
    train["Sex"],
    margins=True,
    normalize="columns"
)

In [14]:
sex_table

Sex,female,male,All
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,0.257962,0.811092,0.616162
True,0.742038,0.188908,0.383838


In [15]:
# Given Sex,
# Male = 18.9% Chance of survival
# Female = 74.2% Chance of survival

## Age

In [16]:
age_table =pd.crosstab(
    train["Survived"]==1,
    train["Age"],
    normalize="columns"
)

In [17]:
age_table

Age,0.42,0.67,0.75,0.83,0.92,1.00,2.00,3.00,4.00,5.00,...,62.00,63.00,64.00,65.00,66.00,70.00,70.50,71.00,74.00,80.00
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,0.0,0.0,0.0,0.0,0.0,0.285714,0.7,0.166667,0.3,0.0,...,0.5,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
True,1.0,1.0,1.0,1.0,1.0,0.714286,0.3,0.833333,0.7,1.0,...,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
age_table[1][1]

0.7142857142857143

In [19]:
age_table[min(age_table, key=lambda x:abs(x-1))][1] #pd.crosstabのmarginsを消さないとエラーが出る？

0.7142857142857143

In [20]:
# Use age table to calculate chance of survival per age
# use later

## SibSp

In [21]:
sibsp_table = pd.crosstab(
    train["Survived"]==1,
    train["SibSp"],
    margins=True,
    normalize="columns"
)

In [22]:
sibsp_table

SibSp,0,1,2,3,4,5,8,All
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,0.654605,0.464115,0.535714,0.75,0.833333,1.0,1.0,0.616162
True,0.345395,0.535885,0.464286,0.25,0.166667,0.0,0.0,0.383838


In [23]:
sibsp_table[0][1]

0.34539473684210525

## Parch

In [24]:
parch_table = pd.crosstab(
    train["Survived"]==0,
    train["Parch"],
    normalize="columns"
)

In [25]:
parch_table

Parch,0,1,2,3,4,5,6
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,0.343658,0.550847,0.5,0.6,0.0,0.2,0.0
True,0.656342,0.449153,0.5,0.4,1.0,0.8,1.0


## Fare

In [26]:
titanic_fare = train[["Survived","Fare"]]

In [27]:
fare_table = pd.crosstab(
    train["Survived"]==1,
    train["Fare"],
    normalize="columns"
)

In [28]:
fare_table

Fare,0.0000,4.0125,5.0000,6.2375,6.4375,6.4500,6.4958,6.7500,6.8583,6.9500,...,153.4625,164.8667,211.3375,211.5000,221.7792,227.5250,247.5208,262.3750,263.0000,512.3292
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,0.933333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.333333,0.0,0.0,1.0,1.0,0.25,0.5,0.0,0.5,0.0
True,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.666667,1.0,1.0,0.0,0.0,0.75,0.5,1.0,0.5,1.0


In [29]:
fare_table[min(fare_table, key=lambda x:abs(x-0))][1]

0.06666666666666667

# Time to test

In [30]:
test = test.drop(columns=("Name"))

AttributeError: 'function' object has no attribute 'drop'

In [None]:
test

In [None]:
test["Age"].describe()

In [None]:
test.isna().sum()

In [None]:
test["Age"].fillna(30.27,inplace=True)#NaNには平均年齢を代用

In [None]:
test.isna().sum()

In [None]:
test["Fare"].describe()

In [None]:
test["Fare"].fillna(35.6, inplace=True)

In [None]:
test.isna().sum()

## Calculating data based off test.csv

In [None]:
# 考えているコードの書き方のまとめ
# すごく雑で効率がすごく悪い気がしますが、取り敢えず試してみます
# Get range of test list = 417
# Use the crosstab tables made earlier to get the survival rate and add it to a 
# Add all of the values and divide by the number of factors

In [None]:
test_pclass_calculated = []
for x in range(418):
    test_pclass_calculated.append(pclass_table[(test["Pclass"][x])][1])

In [None]:
test_sex_calculated = []
for x in range(418):
    test_sex_calculated.append(sex_table[(test["Sex"][x])][1])

In [None]:
test_age_calculated = []
for x in range(418):
    b = test["Age"][x]
    test_age_calculated.append(
        age_table[
            min(age_table, key=lambda x:abs(x-b))][1]
    )

In [None]:
test_sibsp_calculated = []
for x in range(418):
    test_sibsp_calculated.append(sibsp_table[(test["SibSp"][x])][1])

In [None]:
test_parch_calculated = []
for x in range(418):
    c = test["Parch"][x]
    test_parch_calculated.append(
        parch_table[
            min(parch_table, key=lambda x:abs(x-c))][1]
    )

In [None]:
test_fare_calculated = []
for x in range(418):
    d = test["Fare"][x]
    test_fare_calculated.append(
        fare_table[
            min(fare_table, key=lambda x:abs(x-d))][1]
    )

In [None]:
#全部入ったか確かめる

In [None]:
test_fare_calculated

In [None]:
test_age_calculated

In [None]:
test_parch_calculated

In [None]:
test_pclass_calculated

In [None]:
test_age_calculated

In [None]:
test_sibsp_calculated

In [None]:
survival_sum1 = np.add(
    test_pclass_calculated, 
    test_sex_calculated
)
#一気に全てのリストを足す方法がわからないから一つづつ

In [None]:
survival_sum2 = np.add(
    survival_sum1, 
    test_age_calculated)

In [None]:
survival_sum3 = np.add(
    survival_sum2, 
    test_sibsp_calculated)

In [None]:
survival_sum4 = np.add(
    survival_sum3, 
    test_parch_calculated)

In [None]:
survival_total = np.add(
    survival_sum4, 
    test_fare_calculated
)

In [None]:
survival_total

In [None]:
survival_final_preprocessed = survival_total/5

In [None]:
len(survival_final_preprocessed)

In [None]:
survival_final_preprocessed

In [None]:
# ここで問題が生存する確率がどこを上まったら生き残るの仮定するか
# 取り敢えず x > 0.5なら生存するとする

In [None]:
survival_final = []
for x in survival_final_preprocessed:
    if x>0.5:
        survival_final.append(1)
    else:
        survival_final.append(0)

In [None]:
survival_final

In [None]:
test.head()

In [None]:
test["Survived"] = survival_final

In [None]:
test = test.drop(columns=("Pclass"))

In [None]:
test = test.drop(columns=("Fare"))
test = test.drop(columns=("Parch"))
test = test.drop(columns=("SibSp"))
test = test.drop(columns=("Age"))
test = test.drop(columns=("Sex"))
test

In [None]:
test

In [None]:
test.to_csv("submit.csv", index=False)