<a href="https://colab.research.google.com/github/Neoneto/Coding_Dojo_Week8/blob/main/Kaggle_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Survivability from Titanic Dataset Using Boosting
Submitted by Kenneth Alaba

## Data Loading and Pre-Processing

In [180]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [181]:
from google.colab import drive
drive.mount('/content/drive')

df0 = pd.read_csv('/content/drive/MyDrive/Coding Dojo/08 Week 8: Boosting/train.csv')
df0.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [182]:
# Check dataframe info
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [183]:
# Check how many missing values
df0.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [184]:
# Check the distribution of the target 
y.value_counts(normalize = True)

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [185]:
# Define the target and features by selecting the necessary columns
y = df0['Survived']
X = df0[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]

### Dealing with missing values and inconsistencies

In [186]:
# fill missing values with placeholders
X.Age.fillna(0, inplace = True) # not necessarily 0 y.o.
X.Cabin.fillna('others', inplace = True)

X.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,others,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,others,S
3,1,female,35.0,1,0,53.1,C123,S
4,3,male,35.0,0,0,8.05,others,S


In [187]:
# Check for inconsistencies

for c_name in ['Sex', 'Embarked', 'Cabin']:
  print(df0[c_name].unique())

['male' 'female']
['S' 'C' 'Q' nan]
[nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
 'B42' 'C148']


There are entries in the Cabin that contains multiple cabin codes all of which contain a white space. In the mean time, they are replaced with the category 'others'

In [188]:
for entry in X.Cabin:
  if ' ' in entry:
    X.Cabin.replace({entry: 'others'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


In [189]:
X.Cabin.unique()

array(['others', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6', 'B78',
       'D33', 'B30', 'C52', 'B28', 'C83', 'F33', 'E31', 'A5', 'D26',
       'C110', 'E101', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7',
       'C49', 'F4', 'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78',
       'D35', 'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7',
       'A19', 'B49', 'D', 'C106', 'C65', 'E36', 'C54', 'C7', 'E34', 'C32',
       'B18', 'C124', 'C91', 'E40', 'T', 'C128', 'D37', 'B35', 'E50',
       'C82', 'E10', 'E44', 'A34', 'C104', 'C111', 'C92', 'E38', 'D21',
       'E12', 'E63', 'A14', 'B37', 'C30', 'D20', 'B79', 'E25', 'D46',
       'B73', 'C95', 'B38', 'B39', 'B22', 'C86', 'C70', 'A16', 'C101',
       'C68', 'A10', 'E68', 'B41', 'A20', 'D19', 'D50', 'D9', 'A23',
       'B50', 'A26', 'D48', 'E58', 'C126', 'B71', 'D49', 'B5', 'B20',
       'E24', 'C90', 'C45', 'E8', 'B101', 'D45', 'C46', 'D30', 'E121',
       'D11', 'E77', 'F38', 'B3', 'D6', 'D17', 'A36', 'B102', 'B69',
      

### Mapping categorical values

In [190]:
# map the categorical columns

# Import
from sklearn.compose import make_column_selector

# determine categorical columns
cat_selector = make_column_selector(dtype_include='object')
cat_data = X[cat_selector(X)]

# Store the dictionaries
dicts = {} 

# for each categorical columns
for c_name in cat_data.columns:
    
    # print the column name
    print(c_name)

    # get unique values from that column
    values_list = X[c_name].unique()

    # Create the integer classes for mappring
    indeces = np.arange(len(values_list))

    # create the dictionary
    col_dictionary = dict(zip(values_list,indeces))

    # Print the maping used for reference
    print(col_dictionary)
    dicts[c_name] = col_dictionary

    # replace values in that column
    X[c_name].replace(col_dictionary, inplace=True)

X.head()


Sex
{'male': 0, 'female': 1}
Cabin
{'others': 0, 'C85': 1, 'C123': 2, 'E46': 3, 'G6': 4, 'C103': 5, 'D56': 6, 'A6': 7, 'B78': 8, 'D33': 9, 'B30': 10, 'C52': 11, 'B28': 12, 'C83': 13, 'F33': 14, 'E31': 15, 'A5': 16, 'D26': 17, 'C110': 18, 'E101': 19, 'D47': 20, 'B86': 21, 'F2': 22, 'C2': 23, 'E33': 24, 'B19': 25, 'A7': 26, 'C49': 27, 'F4': 28, 'A32': 29, 'B4': 30, 'B80': 31, 'A31': 32, 'D36': 33, 'D15': 34, 'C93': 35, 'C78': 36, 'D35': 37, 'C87': 38, 'B77': 39, 'E67': 40, 'B94': 41, 'C125': 42, 'C99': 43, 'C118': 44, 'D7': 45, 'A19': 46, 'B49': 47, 'D': 48, 'C106': 49, 'C65': 50, 'E36': 51, 'C54': 52, 'C7': 53, 'E34': 54, 'C32': 55, 'B18': 56, 'C124': 57, 'C91': 58, 'E40': 59, 'T': 60, 'C128': 61, 'D37': 62, 'B35': 63, 'E50': 64, 'C82': 65, 'E10': 66, 'E44': 67, 'A34': 68, 'C104': 69, 'C111': 70, 'C92': 71, 'E38': 72, 'D21': 73, 'E12': 74, 'E63': 75, 'A14': 76, 'B37': 77, 'C30': 78, 'D20': 79, 'B79': 80, 'E25': 81, 'D46': 82, 'B73': 83, 'C95': 84, 'B38': 85, 'B39': 86, 'B22': 87, 'C86':

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,0,22.0,1,0,7.25,0,0
1,1,1,38.0,1,0,71.2833,1,1
2,3,1,26.0,0,0,7.925,0,0
3,1,1,35.0,1,0,53.1,2,0
4,3,0,35.0,0,0,8.05,0,0


## Train Test Split

In [191]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)

## LightGBM

In [192]:
'''
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit on training set only.
scaler.fit(X_train)
# Apply transform to both the training set and the test set.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
'''

'\nfrom sklearn.preprocessing import StandardScaler\n\nscaler = StandardScaler()\n# Fit on training set only.\nscaler.fit(X_train)\n# Apply transform to both the training set and the test set.\nX_train = scaler.transform(X_train)\nX_test = scaler.transform(X_test)\n'

In [193]:
#import
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Instantiate and fit LightGBM
lgbm = LGBMClassifier( max_depth = -1, boosting_type = 'dart', num_leaves = 31, )
lgbm.fit(X_train, y_train)

# calculate the model scores

lgbm_train = lgbm.score(X_train, y_train)
lgbm_test = lgbm.score(X_test, y_test)

print(f'Training accuracy:{lgbm_train:.3f}')
print(f'Testing accuracy:{lgbm_test:.3f}' )

Training accuracy:0.903
Testing accuracy:0.794


# Making predictions

In [194]:
# load the test csv
test = pd.read_csv('/content/drive/MyDrive/Coding Dojo/08 Week 8: Boosting/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [195]:
# isolate the necessary columns
test_feats = test[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]

# do the same transformation as the train set

# fill missing values with placeholders
test_feats.Age.fillna(0, inplace = True) # not necessarily 0 y.o.
test_feats.Cabin.fillna('others', inplace = True)
test_feats.Fare.fillna(0, inplace = True)

# dealing with inconsistencies
for entry in test_feats.Cabin:
  if ' ' in entry:
    test_feats.Cabin.replace({entry: 'others'}, inplace = True)

# map the categorical columns

# determine categorical columns
cat_data = test_feats[cat_selector(test_feats)]

# for each categorical columns
for c_name in cat_data.columns:

    # replace values in that column using the same dictionary used in the begining
    test_feats[c_name].replace(dicts[c_name], inplace=True)

# Replace cabin types that are not in the test set with 0/others
#test_feats.Cabin.str.replace( 0, inplace = True)
test_feats.Cabin.replace(to_replace=r'.*', value=0, regex=True, inplace = True)
'''
for entry in test_feats.Cabin:
  if entry.isstring == True:
    X.Cabin.replace({entry: 'others'}, inplace = True)
'''
test_feats.Cabin.unique()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


array([  0,  15,  36,  97,  34,  91, 106, 116,   4,   3,  52,  28, 117,
        54,  14,   1,  62,  88,  73,  68,  48, 127,  55,   8,  22,  49,
        64, 130,  95,  53])

In [196]:
# re train the model using the whole test csv

# Instantiate and fit LightGBM
lgbm = LGBMClassifier( max_depth = -1, boosting_type = 'dart', num_leaves = 31, )
lgbm.fit(X, y)

lgbm.score(X,y)

0.8956228956228957

In [197]:
# make predictions for the test entries
predictions = lgbm.predict(test_feats)

In [198]:
# Create a new csv containing the PassengerID and the survival predictions
test['Survived'] = predictions

to_submit = test[['PassengerId', 'Survived']]
to_submit.to_csv('/content/drive/MyDrive/Coding Dojo/08 Week 8: Boosting/submision.csv', index = False)