# Titanic - Machine Learning From Disaster

https://www.youtube.com/watch?v=3gK_2XdjOdY

### Importing

In [603]:
import os
for dirname, _, filenames in os.walk('/home/juuso/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer


# Data löytyy sivuilta: 
# https://www.kaggle.com/competitions/titanic/data?select=train.csv
# https://www.kaggle.com/competitions/titanic/data?select=test.csv

# Korjatkaa alempana olevat polut oikeisiin tiedostoihin.

/home/juuso/kaggle/input/train.csv
/home/juuso/kaggle/input/test.csv


In [604]:
train_data = pd.read_csv("/home/juuso/kaggle/input/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [605]:
test_data = pd.read_csv("/home/juuso/kaggle/input/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Documentation

In [606]:
# Document the aims, decisions, and solutions of your data analysis directly into 
# the Jupyter Notebook as the analysis proceeds. Make the document self-explanatory.
# (0,25 grade units)

The project was started by downloading the trainset and testset from Kaggle's website. One table was printed from each dataset to verify that the downloading was successful. 

### Visualisation

In [607]:
# Describe your data. Calculate statistics, like means,
# standard deviations, and correlations.
# Draw diagrams, like histograms, scatter diagrams. 
# Visualize correlations. (0,25 grade units)

In [608]:
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [609]:
train_data["Ticket"].value_counts()

347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64

In [610]:
train_data["Age"].value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: Age, Length: 88, dtype: int64

### Preparing data (pakollinen)

In [611]:
# Prepare your data. When and if necessary, calculate new attributes, scale attributes,
# convert categorical variables into numeric variables. (0,25 grade units)

### Preparing train set

In [612]:
train_data = train_data.drop("Cabin", axis=1)

In [613]:
sex_cat = train_data[["Sex"]]

In [614]:
ordinal_encoder = OrdinalEncoder()
sex_cat_encoded = ordinal_encoder.fit_transform(sex_cat)
sex_cat_encoded[:10]

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.]])

In [615]:
train_data["Sex"] = sex_cat_encoded

In [616]:
# impute the null values of Age to median values

In [617]:
median_age = train_data["Age"].median()
train_data["Age"].fillna(median_age,inplace=True)

imputer = SimpleImputer(strategy="median")




In [618]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1.0,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",0.0,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",1.0,35.0,0,0,373450,8.05,S


In [619]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    float64
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(3), int64(5), object(3)
memory usage: 76.7+ KB


In [620]:
# ordinal encoder to embarked

In [621]:
train_data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [622]:
embarked_cat = train_data[["Embarked"]]

In [623]:
ordinal_encoder = OrdinalEncoder()
embarked_cat_encoded = ordinal_encoder.fit_transform(embarked_cat)
embarked_cat_encoded[:10]

array([[2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [0.]])

In [624]:
train_data["Embarked"] = embarked_cat_encoded

In [625]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1.0,22.0,1,0,A/5 21171,7.25,2.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1,0,PC 17599,71.2833,0.0
2,3,1,3,"Heikkinen, Miss. Laina",0.0,26.0,0,0,STON/O2. 3101282,7.925,2.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1,0,113803,53.1,2.0
4,5,0,3,"Allen, Mr. William Henry",1.0,35.0,0,0,373450,8.05,2.0


In [626]:
median_embarked = train_data["Embarked"].median()
train_data["Embarked"].fillna(median_embarked,inplace=True)

imputer = SimpleImputer(strategy="median")




In [627]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    float64
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    float64
dtypes: float64(4), int64(5), object(2)
memory usage: 76.7+ KB


### Preparing test set

In [628]:
test_data = test_data.drop("Cabin", axis=1)

In [629]:
sex_cat = test_data[["Sex"]]

In [630]:
ordinal_encoder = OrdinalEncoder()
sex_cat_encoded = ordinal_encoder.fit_transform(sex_cat)
sex_cat_encoded[:10]

array([[1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.]])

In [631]:
test_data["Sex"] = sex_cat_encoded

In [632]:
median_age = test_data["Age"].median()
test_data["Age"].fillna(median_age,inplace=True)

imputer = SimpleImputer(strategy="median")




In [633]:
embarked_cat = test_data[["Embarked"]]

In [634]:
ordinal_encoder = OrdinalEncoder()
embarked_cat_encoded = ordinal_encoder.fit_transform(embarked_cat)
embarked_cat_encoded[:10]

array([[1.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [0.],
       [2.]])

In [635]:
test_data["Embarked"] = embarked_cat_encoded

In [636]:
median_embarked = test_data["Embarked"].median()
test_data["Embarked"].fillna(median_embarked,inplace=True)

imputer = SimpleImputer(strategy="median")


In [637]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    float64
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Embarked     418 non-null    float64
dtypes: float64(4), int64(4), object(2)
memory usage: 32.8+ KB


In [638]:
test_data[test_data['Fare'].isnull()]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
152,1044,3,"Storey, Mr. Thomas",1.0,60.5,0,0,3701,,2.0


In [639]:
test_data = test_data.drop(152)

### Train model (pakollinen)

In [657]:
# Train two different classification models. One of them should be Random Forest or SVM.
# You can select the other one by yourselves. (0,25)


### Random forest

In [641]:
df_train = train_data.drop(['Name', 'SibSp', 'Parch', 'Embarked', 'Ticket'], axis=1)
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare
0,1,0,3,1.0,22.0,7.2500
1,2,1,1,0.0,38.0,71.2833
2,3,1,3,0.0,26.0,7.9250
3,4,1,1,0.0,35.0,53.1000
4,5,0,3,1.0,35.0,8.0500
...,...,...,...,...,...,...
886,887,0,2,1.0,27.0,13.0000
887,888,1,1,0.0,19.0,30.0000
888,889,0,3,0.0,28.0,23.4500
889,890,1,1,1.0,26.0,30.0000


In [642]:
train_data2 = df_train.values
train_data2

array([[  1.    ,   0.    ,   3.    ,   1.    ,  22.    ,   7.25  ],
       [  2.    ,   1.    ,   1.    ,   0.    ,  38.    ,  71.2833],
       [  3.    ,   1.    ,   3.    ,   0.    ,  26.    ,   7.925 ],
       ...,
       [889.    ,   0.    ,   3.    ,   0.    ,  28.    ,  23.45  ],
       [890.    ,   1.    ,   1.    ,   1.    ,  26.    ,  30.    ],
       [891.    ,   0.    ,   3.    ,   1.    ,  32.    ,   7.75  ]])

In [643]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)

In [644]:
# Training data features, skip the first column 'Survived'
train_features = train_data2[:, 1:]

# 'Survived' column values
train_target = train_data2[:, 0]

# Fit the model to our training data
clf = clf.fit(train_features, train_target)
score = clf.score(train_features, train_target)
"Mean accuracy of Random Forest: {0}".format(score)

'Mean accuracy of Random Forest: 0.856341189674523'

### Testing 

In [645]:
df_test = test_data.drop(['Name', 'SibSp', 'Parch', 'Embarked', 'Ticket'], axis=1)
df_test

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare
0,892,3,1.0,34.5,7.8292
1,893,3,0.0,47.0,7.0000
2,894,2,1.0,62.0,9.6875
3,895,3,1.0,27.0,8.6625
4,896,3,0.0,22.0,12.2875
...,...,...,...,...,...
413,1305,3,1.0,27.0,8.0500
414,1306,1,0.0,39.0,108.9000
415,1307,3,1.0,38.5,7.2500
416,1308,3,1.0,27.0,8.0500


In [646]:
test_data2 = df_test.values

In [647]:
test_x = test_data2[:, 0:]

# Predict the Survival values for the test data
test_y = clf.predict(test_x)

In [652]:
df_test["Survived"] = test_y

In [653]:
survived_cat = df_test["Survived"]

In [None]:
# ordinal_encoder = OrdinalEncoder()
# survived_cat_encoded = ordinal_encoder.fit_transform(survived_cat)
# survived_cat_encoded[:10]

In [654]:
def muunto(lista):
    uusi_lista = []
    for i in lista:
        if i < 500:
            uusi_lista.append(0)
        else:
            uusi_lista.append(1)
    return uusi_lista
df_test["Survived"] = muunto(df_test["Survived"])

In [655]:
df_test["Survived"].describe()

count    417.000000
mean       0.443645
std        0.497411
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64

In [656]:
df_test[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
1,1.0,0.464151
0,0.0,0.407895


### Linear Regression

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)


In [None]:
housing_labels = strat_train_set["median_house_value"].copy()


In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)