In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
#Importing the dependencies 
import numpy as np
import pandas as pd 
import matplotlib as plt

In [3]:
#Loading the datasets 
train_data=pd.read_csv('../input/titanic/train.csv')
test_data=pd.read_csv('../input/titanic/test.csv')
datasets=[train_data,test_data]
test_ids=test_data['PassengerId'] #Extracting the target ids


In [4]:
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
#Looking for null values 
print(train_data.isnull().sum())
print(test_data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


Seems like there are quite a few values missing in the cabin column ,some in the age column and a few missing from the Fare and embarked columns

In [6]:
for incomp_var in train_data.columns:
    missing_value=pd.concat(datasets)[incomp_var].isnull().sum()
    if missing_value>0 and incomp_var!='Survived':
        total_value=pd.concat(datasets).shape[0]
        print(f"Percentage of missing values in {incomp_var}:"
             f"{missing_value/total_value*100:.1f}%")

Percentage of missing values in Age:20.1%
Percentage of missing values in Fare:0.1%
Percentage of missing values in Cabin:77.5%
Percentage of missing values in Embarked:0.2%


**Since percentage of missing values in Cabin is very high, we are just not going to consider this feature and drop it.  Additionally, I am going to drop PassengerId and Ticket from the training dataset since they have no meaningfull relation to the survival rates.**

In [7]:
drop_column=['Cabin','Ticket','PassengerId']
for dataset in datasets:
    dataset.drop(drop_column, axis=1, inplace=True)

**As for the other 3 columns missing datasets, I'm going to replace the null values with the medians of the available ones for the columns with the numerical values and as for the 'Embarked' column I will replace the null values with value that occurs the most number of times. This would give us the most accurate predictions. Later I will include an Imputer which would automate these processes.**

In [8]:
for dataset in datasets:
    dataset.Age.fillna(dataset.Age.median(),inplace=True)
    dataset.Embarked.fillna(dataset.Embarked.mode()[0],inplace=True)
    dataset.Fare.fillna(dataset.Fare.median(),inplace=True)

In [9]:
#Lets make sure there are no more null values 
print(train_data.isnull().sum())
print(test_data.isnull().sum())

Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


### Feature Engineering 
Coming up with some new attributes for the model to intepret behaviours for better predictions.
1. FamilySize - Size of family. 1 represent that the passenger was travelling alone.
2. Title - The indivuals title indicating social status 
3. Farebin and Agebin - Binning the values to reduce the impact of minor observation errors. It also takes care of some outliers. <br>
**Binning basically refers to compressing/bucketing the data in ranges to simplify it.**

In [10]:
#defining a function that gives usproper bin names
def bin_labels(bin_name,number_of_bins):
    labels=[]
    for i in range(number_of_bins):
        labels.append(bin_name + f"_{i}")
    return labels

#Lets define the new attributes in the dataset 
for dataset in datasets:
    dataset["FamilySize"]= dataset.SibSp + dataset.Parch + 1 # 1 is the passenger themselves
    dataset["Title"]=dataset.Name.str.extract(r"([A-Za-z]+)\.",expand= False)
    dataset["FareBin"]=pd.qcut(dataset.Fare,4,labels=bin_labels("FareBin",4))
    dataset["AgeBin"]=pd.cut(dataset.Age.astype(int),5,labels=bin_labels("AgeBin",5))

In [11]:
train_data

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Title,FareBin,AgeBin
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,2,Mr,FareBin_0,AgeBin_1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2,Mrs,FareBin_3,AgeBin_2
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,1,Miss,FareBin_1,AgeBin_1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,2,Mrs,FareBin_3,AgeBin_2
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,1,Mr,FareBin_1,AgeBin_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,1,Rev,FareBin_1,AgeBin_1
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,1,Miss,FareBin_2,AgeBin_1
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,23.4500,S,4,Miss,FareBin_2,AgeBin_1
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,1,Mr,FareBin_2,AgeBin_1


**Now lets remove the unnecessary outdated columns**

In [12]:
drop_columns=['Age','SibSp','Parch','Fare','Name']
for dataset in datasets:
    dataset.drop(drop_columns,axis =1 ,inplace=True)

In [13]:
data=pd.concat(datasets)
titles=data.Title.value_counts()
print(titles)

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Major         2
Ms            2
Lady          1
Sir           1
Mme           1
Don           1
Capt          1
Countess      1
Jonkheer      1
Dona          1
Name: Title, dtype: int64


Since we got too many titles that are very few in numbers, they could affect the model and we dont want that. We categorize them under one name which will be represented as 'unique'.

In [14]:
unique_titles=titles[titles<10].index
for dataset in datasets:
    dataset.Title.replace(unique_titles,"Unique",inplace=True)

### Dummifying Categorical features 
It is advised to train the model with numerical values. Therefore, we change the categorical values like embarked and Title to numerical values using the pandas.get_dummies function without having to map or change the value of the data 

In [15]:
Categorical_features=train_data.select_dtypes(exclude=np.number).columns
x_train=pd.get_dummies(train_data,prefix=Categorical_features)
x_test=pd.get_dummies(test_data,prefix=Categorical_features)
x_test=x_test.to_numpy()

**Splitting the label(target) from the Features which is the survived column**

In [16]:
target="Survived"
x_train=x_train.drop(columns=target)
x_train=x_train.to_numpy()
y_train=train_data[target].to_numpy()

In [17]:
x_train,y_train 

(array([[3, 2, 0, ..., 0, 0, 0],
        [1, 2, 1, ..., 1, 0, 0],
        [3, 1, 1, ..., 0, 0, 0],
        ...,
        [3, 4, 1, ..., 0, 0, 0],
        [1, 1, 0, ..., 0, 0, 0],
        [3, 1, 0, ..., 0, 0, 0]]),
 array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 

## Building the model 


Here I am going to test the data on a few different classifiers and finally analyze the 

In [18]:
#Importing dependencies for building the model
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.linear_model import SGDClassifier

**I'm going to use the GridSearchCV method to perform cross validation and fine tune  based on the parameters defined**

In [19]:
cv= ShuffleSplit(n_splits=3,test_size=0.2,random_state=42)

#parameter settings for SGDClassifier
# parameters={
#     'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
#     'max_iter': [1000], # number of epochs
#     'penalty': ['l2'],
#     'n_jobs': [-1]
# }

#parameter settings for SVC

# parameters={
#     'C': [0.1,1, 10, 100],
#     'gamma': [1,0.1,0.01,0.001],
#     'kernel': ['rbf', 'poly', 'sigmoid'],
# }

#parameter settings for GaussianNB

# parameters={
#     'var_smoothing': np.logspace(0,-9, num=100),
#     'priors': [None],
# }

#parameter settings for Logistic Regression

# parameters={
#     'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     'penalty': ['l1', 'l2'],
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
# }

#parameter settings for DecisionTree and Randomforest

# parameters={
#     "max_depth":list(range(1,5)),
#     "min_samples_split": (1, 2, 3),
#     "min_samples_leaf": (1, 2, 3),
# }

#parameter settings for XGBClassifier

parameters={
    "max_depth":list(range(1,5)),
    "n_estimators":list(range(1,5)),
    "learning_rate":[0.01,0.1,1],
}
model = GridSearchCV(
  XGBClassifier(),parameters,cv=cv,scoring='f1'
)
model.fit(x_train,y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=3, random_state=42, test_size=0.2, train_size=None),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=Non...
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
 

### Evaluating the model on some of the training data

In [20]:
some_data=x_train[0:5]
some_labels=y_train[0:5]

**comparing the predicted labels with the actual labels**

In [21]:
model.predict(some_data),some_labels

(array([0, 1, 1, 1, 0]), array([0, 1, 1, 1, 0]))

In [22]:
print ('Best score: %0.3f' % model.best_score_)
print ('Best parameters set:')
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ('\t%s: %r' % (param_name, best_parameters[param_name]))

Best score: 0.778
Best parameters set:
	learning_rate: 1
	max_depth: 3
	n_estimators: 2


## Now evaluating the model on the test data and printing the results 

In [23]:
predictions = model.predict(x_test)
print  (predictions)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 1]


## Noting down F1 scores of different classifiers and choosing the best one.
1.XGBClassifier - 0.778 <br>
2.DecisionTree - 0.758 <br>
3.RandomForestClassifier - 0.763 <br>
4.Gaussian Naive Bayes - 0.762 <br>
5.Logistic Regression - 0.761 <br>
6.SVM classifier - 0.781 <br>
7.SGD classifier -0.769 <br>

**I am going to pick XGBClassifier. Even though it performed the second best after SVM, its evaluation on the train data sample was better than the SVM Classifier.**

In [24]:
#Creating the submission file in a csv format
submission = pd.DataFrame(

      {

          "PassengerId": test_ids,

          "Survived": predictions,

      }

  )
submission.to_csv("titanic_submission.csv", index=False)

In [25]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
