# Titanic - Machine Learning From Disaster

https://www.youtube.com/watch?v=3gK_2XdjOdY

### Importing

In [191]:
import os

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# The data can be downloaded from:
# https://www.kaggle.com/competitions/titanic/data?select=train.csv
# https://www.kaggle.com/competitions/titanic/data?select=test.csv

# Adjust the path in this cell and the paths in the loading cells below so
# that they point to your local copies of the files.

# List every file found under the input directory so it is easy to confirm
# that the CSV files are where the loading cells expect them.
for dirname, _, filenames in os.walk('/home/juuso/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/home/juuso/kaggle/input/train.csv
/home/juuso/kaggle/input/test.csv


In [192]:
# Location of the training set; adjust to your local copy of train.csv.
train_csv_path = "/home/juuso/kaggle/input/train.csv"

# Load the training set and show its first rows as a quick sanity check.
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [193]:
# Location of the test set; adjust to your local copy of test.csv.
test_csv_path = "/home/juuso/kaggle/input/test.csv"

# Load the test set and show its first rows as a quick sanity check.
test_data = pd.read_csv(test_csv_path)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Documentation

In [194]:
# Document the aims, decisions, and solutions of your data analysis directly into 
# the Jupyter Notebook as the analysis proceeds. Make the document self-explanatory.
# (0,25 grade units)

The project was started by downloading the training set and the test set from Kaggle's website. The first rows of each dataset were displayed to verify that the download and loading were successful.

### Visualisation

In [195]:
# Describe your data. Calculate statistics, like means,
# standard deviations, and correlations.
# Draw diagrams, like histograms, scatter diagrams. 
# Visualize correlations. (0,25 grade units)

In [196]:
# Summarize column dtypes and non-null counts to see which columns contain missing values.
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [197]:
# Count how many rows share each distinct Ticket value.
train_data["Ticket"].value_counts()

347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64

In [198]:
# Frequency of each distinct Age value (value_counts() leaves out missing values by default).
train_data["Age"].value_counts()

24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: Age, Length: 88, dtype: int64

### Preparing data (mandatory)

In [199]:
# Prepare your data. When and if necessary, calculate new attributes, scale attributes,
# convert categorical variables into numeric variables. (0,25 grade units)

In [200]:
# Drop Cabin instead of imputing it: the train_data.info() output above reports
# only 204 non-null values out of 891 rows.
# errors="ignore" keeps this cell idempotent, so re-running it after Cabin has
# already been removed is a no-op instead of raising a KeyError.
train_data = train_data.drop(columns=["Cabin"], errors="ignore")

In [201]:
# Select Sex with double brackets so the result is a one-column DataFrame (2-D),
# which is what gets passed to the encoder in the next cell.
sex_cat = train_data[["Sex"]]

In [202]:
# Learn the Sex categories first, then map every row to its numeric code.
ordinal_encoder = OrdinalEncoder().fit(sex_cat)
sex_cat_encoded = ordinal_encoder.transform(sex_cat)

# Peek at the first ten encoded rows.
sex_cat_encoded[:10]

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.]])

In [203]:
# Replace the original string labels in Sex with the encoded numeric values.
train_data["Sex"] = sex_cat_encoded

In [204]:
# Impute the missing Age values with the median age

In [205]:
# Fill the missing ages with the median of the known ages.
median_age = train_data["Age"].median()

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the column selection: the inplace form is chained assignment, which pandas 2.x
# warns about and which does not modify train_data when Copy-on-Write is enabled.
train_data["Age"] = train_data["Age"].fillna(median_age)

# NOTE(review): this imputer is created but never fitted or applied in the
# cells visible here; the fill above is done with pandas. Confirm whether a
# later cell relies on it before removing it.
imputer = SimpleImputer(strategy="median")




In [206]:
# Inspect the first rows after dropping Cabin, encoding Sex and filling Age.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1.0,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",0.0,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",1.0,35.0,0,0,373450,8.05,S


In [207]:
# Re-check dtypes and non-null counts after the preparation steps so far.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    float64
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(3), int64(5), object(3)
memory usage: 76.7+ KB


In [208]:
# Apply an ordinal encoder to the Embarked column

In [209]:
# Frequency of each Embarked category (value_counts() leaves out missing values by default).
train_data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [210]:
# Select Embarked with double brackets so the result is a one-column DataFrame (2-D),
# which is what gets passed to the encoder in the next cell.
embarked_cat = train_data[["Embarked"]]

In [214]:
# Learn the Embarked categories first, then map every row to its numeric code.
# Note that this rebinds ordinal_encoder, replacing the encoder fitted on Sex.
ordinal_encoder = OrdinalEncoder().fit(embarked_cat)
embarked_cat_encoded = ordinal_encoder.transform(embarked_cat)

# Peek at the first ten encoded rows.
embarked_cat_encoded[:10]

array([[2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [0.]])

In [215]:
# Replace the original string labels in Embarked with the encoded numeric values.
# Missing entries are filled in a later cell.
train_data["Embarked"] = embarked_cat_encoded

In [217]:
# Check that Embarked now holds numeric codes instead of letters.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1.0,22.0,1,0,A/5 21171,7.25,2.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,38.0,1,0,PC 17599,71.2833,0.0
2,3,1,3,"Heikkinen, Miss. Laina",0.0,26.0,0,0,STON/O2. 3101282,7.925,2.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,35.0,1,0,113803,53.1,2.0
4,5,0,3,"Allen, Mr. William Henry",1.0,35.0,0,0,373450,8.05,2.0


In [221]:
# Embarked is a nominal category whose numeric codes carry no order, so a
# median of the codes has no meaning (and can even be fractional). Fill the
# missing entries with the most frequent code instead.
most_frequent_embarked = train_data["Embarked"].mode().iloc[0]

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# the column selection: the inplace form is chained assignment, which pandas 2.x
# warns about and which does not modify train_data when Copy-on-Write is enabled.
train_data["Embarked"] = train_data["Embarked"].fillna(most_frequent_embarked)

# Kept under the previous name in case other cells still refer to it.
median_embarked = most_frequent_embarked

# NOTE(review): this imputer is created but never fitted or applied in the
# cells visible here; confirm whether a later cell relies on it.
imputer = SimpleImputer(strategy="median")




In [222]:
# Verify the non-null counts of every column after filling Embarked.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    float64
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    float64
dtypes: float64(4), int64(5), object(2)
memory usage: 76.7+ KB


### Train model (mandatory)

In [212]:
# Train two different classification models. One of them should be Random Forest or SVM.
# You can select the other one by yourselves. (0,25)
