In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
train_data = pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test_data = pd.read_csv("test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [7]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [8]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


## Dealing with missing data

In [16]:
# read in data, this is the famous titanic dataset
data = pd.read_csv('test.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [17]:
# see the null values
data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

The data missing is age which is a float value and cabin which is categorical.

1. Drop the data

In [18]:
# easiest method is to drop the data
data = data.dropna()
data.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [19]:
# There are now no null entries! However, take a look at how many training examples are left
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87 entries, 12 to 414
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  87 non-null     int64  
 1   Pclass       87 non-null     int64  
 2   Name         87 non-null     object 
 3   Sex          87 non-null     object 
 4   Age          87 non-null     float64
 5   SibSp        87 non-null     int64  
 6   Parch        87 non-null     int64  
 7   Ticket       87 non-null     object 
 8   Fare         87 non-null     float64
 9   Cabin        87 non-null     object 
 10  Embarked     87 non-null     object 
dtypes: float64(2), int64(4), object(5)
memory usage: 8.2+ KB


However, we now only have about a 20% of the data left! This is not good.

 I would say a maximum reduction of 5% would be fine otherwise you may lose valuable data that will affect the training of your model.

2. Impute Data with mean, median or mode

In [23]:
data = pd.read_csv('test.csv')

In [24]:
data[100:110]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
100,992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Mo...",female,43.0,1,0,11778,55.4417,C116,C
101,993,2,"Weisz, Mr. Leopold",male,27.0,1,0,228414,26.0,,S
102,994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q
103,995,3,"Johansson Palmquist, Mr. Oskar Leander",male,26.0,0,0,347070,7.775,,S
104,996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16.0,1,1,2625,8.5167,,C
105,997,3,"Holthen, Mr. Johan Martin",male,28.0,0,0,C 4001,22.525,,S
106,998,3,"Buckley, Mr. Daniel",male,21.0,0,0,330920,7.8208,,Q
107,999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q
108,1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S
109,1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13.0,F,S


In [25]:
# fill columns with the mean
data.fillna(data.mean(), inplace=True)

In [26]:
data[100:110]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
100,992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Mo...",female,43.0,1,0,11778,55.4417,C116,C
101,993,2,"Weisz, Mr. Leopold",male,27.0,1,0,228414,26.0,,S
102,994,3,"Foley, Mr. William",male,30.27259,0,0,365235,7.75,,Q
103,995,3,"Johansson Palmquist, Mr. Oskar Leander",male,26.0,0,0,347070,7.775,,S
104,996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16.0,1,1,2625,8.5167,,C
105,997,3,"Holthen, Mr. Johan Martin",male,28.0,0,0,C 4001,22.525,,S
106,998,3,"Buckley, Mr. Daniel",male,21.0,0,0,330920,7.8208,,Q
107,999,3,"Ryan, Mr. Edward",male,30.27259,0,0,383162,7.75,,Q
108,1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,30.27259,0,0,3410,8.7125,,S
109,1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13.0,F,S


In [27]:
# For the ‘Cabin’ column the entries are still NaN as you can’t calculate the mean for an object datatype as it’s categorical. This can be fixed by computing its mode:
data = data.fillna(data['Cabin'].value_counts().index[0])

The mean is only useful for continous data. In regards to the ‘Cabin’ feature, it only has 91 entries, which is about 25% of the total examples. Therefore, the mode value that we previously calculated is not very reliable. A better way is to assign these NaN values their own category: We need a way of handling categorical data as well. To do this, we can simply create a new category for the NaN data.

In [28]:
# set the cabin to be the mode
data['Cabin'] = data['Cabin'].fillna('Unkown')

In [29]:
data[100:110]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
100,992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Mo...",female,43.0,1,0,11778,55.4417,C116,C
101,993,2,"Weisz, Mr. Leopold",male,27.0,1,0,228414,26.0,B57 B59 B63 B66,S
102,994,3,"Foley, Mr. William",male,30.27259,0,0,365235,7.75,B57 B59 B63 B66,Q
103,995,3,"Johansson Palmquist, Mr. Oskar Leander",male,26.0,0,0,347070,7.775,B57 B59 B63 B66,S
104,996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16.0,1,1,2625,8.5167,B57 B59 B63 B66,C
105,997,3,"Holthen, Mr. Johan Martin",male,28.0,0,0,C 4001,22.525,B57 B59 B63 B66,S
106,998,3,"Buckley, Mr. Daniel",male,21.0,0,0,330920,7.8208,B57 B59 B63 B66,Q
107,999,3,"Ryan, Mr. Edward",male,30.27259,0,0,383162,7.75,B57 B59 B63 B66,Q
108,1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,30.27259,0,0,3410,8.7125,B57 B59 B63 B66,S
109,1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13.0,F,S


Algorithms as K-Nearest Neighbor, Naive Bayes, and XGBoost all work with missing data.