In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split

%matplotlib inline
sns.set()


In [14]:
test_df = pd.read_csv('./titanic-test.csv')
train_df = pd.read_csv('./titanic-train.csv')


In [4]:
print(train_df['Age'].mean())
print(train_df['Age'].median())
print(train_df['Age'].mode())


29.69911764705882
28.0
0    24.0
dtype: float64


In [6]:
print(test_df['Age'].mean())
print(test_df['Age'].median())
print(test_df['Age'].mode())


30.272590361445783
27.0
0    21.0
1    24.0
dtype: float64


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
train_df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [9]:
test_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [10]:
test_df['Embarked'].value_counts()


S    270
C    102
Q     46
Name: Embarked, dtype: int64

In [11]:
def prepare_df(df):
  df['Age'] = df['Age'].fillna(df['Age'].median())
  df['Embarked'] = df['Embarked'].fillna('S')

  train_predictors = df.drop(
    ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'],
    axis=1
  )
  categorical_cols = [
    cname for cname in train_predictors.columns if
      train_predictors[cname].nunique() < 10 and
      train_predictors[cname].dtype == 'object'
  ]
  numerical_cols = [
    cname for cname in train_predictors.columns if
      train_predictors[cname].dtype in ['int64', 'float64']
  ]
  my_cols = categorical_cols + numerical_cols

  train_predictors = train_predictors[my_cols]

  return train_predictors


In [28]:
def prepare_test_df(df):
  df['Age'] = df['Age'].fillna(df['Age'].median())
  df['Embarked'] = df['Embarked'].fillna('S')

  train_predictors = df.drop(
      ['PassengerId', 'Name', 'Ticket', 'Cabin'],
      axis=1
  )
  categorical_cols = [
      cname for cname in train_predictors.columns if
      train_predictors[cname].nunique() < 10 and
      train_predictors[cname].dtype == 'object'
  ]
  numerical_cols = [
      cname for cname in train_predictors.columns if
      train_predictors[cname].dtype in ['int64', 'float64']
  ]
  my_cols = categorical_cols + numerical_cols

  train_predictors = train_predictors[my_cols]

  return train_predictors


In [15]:
train_df_cleaned = prepare_df(train_df)
# test_df = prepare_df(test_df)


In [16]:
train_df_cleaned.head()

Unnamed: 0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
0,male,S,3,22.0,1,0,7.25
1,female,C,1,38.0,1,0,71.2833
2,female,S,3,26.0,0,0,7.925
3,female,S,1,35.0,1,0,53.1
4,male,S,3,35.0,0,0,8.05


In [17]:
dummy_encoded_train_predictors = pd.get_dummies(train_df_cleaned)


In [18]:
y_target = train_df['Survived'].values
x_features_one = dummy_encoded_train_predictors.values

x_train, x_validation, y_train, y_validation = train_test_split(
    x_features_one,
    y_target,
    test_size=.25,
    random_state=1
)


In [19]:
tree_one = tree.DecisionTreeClassifier()
tree_one = tree_one.fit(x_train, y_train)


In [20]:
tree_one_accuracy = round(tree_one.score(x_validation, y_validation), 4)
print('Accuracy: %0.4f' % (tree_one_accuracy))


Accuracy: 0.7489


In [42]:
clean_test_df.dropna()


Unnamed: 0,Sex,Embarked,Pclass,Age,SibSp,Parch,Fare
0,male,Q,3,34.5,0,0,7.8292
1,female,S,3,47.0,1,0,7.0000
2,male,Q,2,62.0,0,0,9.6875
3,male,S,3,27.0,0,0,8.6625
4,female,S,3,22.0,1,1,12.2875
...,...,...,...,...,...,...,...
413,male,S,3,27.0,0,0,8.0500
414,female,C,1,39.0,0,0,108.9000
415,male,S,3,38.5,0,0,7.2500
416,male,S,3,27.0,0,0,8.0500


In [43]:

clean_test_df = prepare_test_df(test_df)
dummy_encoded_test = pd.get_dummies(clean_test_df)


In [44]:
tree_one.predict(dummy_encoded_test.values)


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').