## Loading the Data

In [3]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Importing the dataset
train_data = pd.read_csv("E:/Documents/Kaggle/Titanic/train.csv")
test_data = pd.read_csv("E:/Documents/Kaggle/Titanic/test.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Preprocessing (Dealing with missing data)

In [4]:
# Replacing missing values with mean
test_data["Age"].fillna(test_data["Age"].mean(), inplace=True)

# Removing rows with missing values
train_data.dropna(axis=0, subset=['Survived'], inplace=True)
train_data.dropna(axis=0, subset=['Age'], inplace=True)
train_data.info()

# Selecting target
y = train_data["Survived"]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  714 non-null    int64  
 1   Survived     714 non-null    int64  
 2   Pclass       714 non-null    int64  
 3   Name         714 non-null    object 
 4   Sex          714 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        714 non-null    int64  
 7   Parch        714 non-null    int64  
 8   Ticket       714 non-null    object 
 9   Fare         714 non-null    float64
 10  Cabin        185 non-null    object 
 11  Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 72.5+ KB


## Feature Engineering (Dealing with categorical data)

LabelEncoder is used to convert categorical data, or text data, into numbers, which our predictive models can better understand.

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() 

train_data["Sex"] = le.fit_transform(train_data["Sex"])
test_data["Sex"] = le.fit_transform(test_data["Sex"])

In [6]:
# Selecting features
features = ["Pclass", "Sex", "SibSp", "Parch", "Age"]
X = train_data[features]
X_test = test_data[features]

X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  714 non-null    int64  
 1   Sex     714 non-null    int32  
 2   SibSp   714 non-null    int64  
 3   Parch   714 non-null    int64  
 4   Age     714 non-null    float64
dtypes: float64(1), int32(1), int64(3)
memory usage: 30.7 KB


In [7]:
# Splitting the dataset into the Training set and Validation set
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

In [8]:
# Checking for missing values
for col in X.columns:
    if X_train[col].isnull().any():
        print(X_train[col].value_counts())

## Training and Testing

In [9]:
# Training a base model
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

model.fit(X_train, y_train)
score = model.score(X_valid, y_valid)

print('Score:', score)

Score: 0.8603351955307262


In [10]:
# Importing all models
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Creating a list of model names
names = [
    "Linear Regression",
    "Logistic Regression",
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Gaussian Process",
    "Naive Bayes (Gaussian)",
    "Naive Bayes (Multinomial)",
    "QDA",
]

# Creating a list of models
models = [
    LinearRegression(),
    LogisticRegression(),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    GaussianNB(),
    MultinomialNB(),
    QuadraticDiscriminantAnalysis(),
]

## Evaluation of models

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Performing cross validation and normal evaluation
scores = []
cv_scores = []

for i,model in enumerate(models):
    model.fit(X_train, y_train)
    scores.append(model.score(X_valid, y_valid))
    cv = cross_val_score(model, X, y, cv=5)
    cv_scores.append(cv.mean())
    print("{} Test Accuracy {}".format(names[i], scores[i]))
    print("Cross Validation : %0.2f (+/- %0.2f) [%s]\n" % (cv.mean(), cv.std(), names[i]))

Linear Regression Test Accuracy 0.4131974546171139
Cross Validation : 0.37 (+/- 0.04) [Linear Regression]

Logistic Regression Test Accuracy 0.8100558659217877
Cross Validation : 0.79 (+/- 0.02) [Logistic Regression]

Nearest Neighbors Test Accuracy 0.7653631284916201
Cross Validation : 0.77 (+/- 0.02) [Nearest Neighbors]

Linear SVM Test Accuracy 0.776536312849162
Cross Validation : 0.78 (+/- 0.04) [Linear SVM]

RBF SVM Test Accuracy 0.7877094972067039
Cross Validation : 0.75 (+/- 0.02) [RBF SVM]

Decision Tree Test Accuracy 0.8379888268156425
Cross Validation : 0.79 (+/- 0.03) [Decision Tree]

Random Forest Test Accuracy 0.8268156424581006
Cross Validation : 0.81 (+/- 0.02) [Random Forest]

Neural Net Test Accuracy 0.8324022346368715
Cross Validation : 0.82 (+/- 0.03) [Neural Net]

AdaBoost Test Accuracy 0.7988826815642458
Cross Validation : 0.81 (+/- 0.02) [AdaBoost]

Gaussian Process Test Accuracy 0.8379888268156425
Cross Validation : 0.81 (+/- 0.03) [Gaussian Process]

Naive Bayes

In [12]:
# remove models with accuracy less than 80%
for i in range(1, len(scores)+1):
    if scores[-i] < 0.8:
        if cv_scores[-i] < 0.8:
            print("Removed %s" % names[-i + len(scores)])
            models.remove(models[-i + len(scores)])
            names.remove(names[-i + len(scores)])

Removed Naive Bayes (Multinomial)
Removed Naive Bayes (Gaussian)
Removed RBF SVM
Removed Linear SVM
Removed Nearest Neighbors
Removed Linear Regression


## Hyperparameter Tuning
One of the most important aspects of machine learning.
The two most common ways to tune hyperparameters are:
1. [Grid Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
2. [Random Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

In [13]:
# Ignores Unwanted outputs
import warnings
warnings.filterwarnings("ignore")

In [14]:
# Creating a method to perform hyperparameter tuning using GridSearchCV
def hyperparameter_tuning(model, params):
    grid = GridSearchCV(model, params, n_jobs=-1, verbose=1, cv=4, scoring='accuracy')
    grid.fit(X_train, y_train)
    print("Best Model: {}".format(grid.best_estimator_))
    print("Best Score: {}".format(grid.best_score_))
    return grid

In [20]:
# Hyperparameter definitions for each model
logreg_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1.0, 5.0, 10.0],
    'solver': ['lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [30, 100, 200, 300],
}

dtc_params = {
    'max_depth': [15, 20, 50, 70, 90, 100, 110, 120, 130, 150],
    'min_samples_leaf': [2, 3, 5, 10, 20],
    'criterion': ["gini", "entropy"],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [10, 15, 20, 30, 40, 50]
}

rfc_params = {
    'n_estimators': [30, 50, 100, 150],
    'max_depth': [2, 5, 10, 20, 30],
    'min_samples_leaf': [2, 5, 10, 20],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10, 15, 20]
}

mlp_params = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'sigmoid', 'tanh'],
    'solver': ['adam', 'lbfgs', 'sgd'],
    'alpha': [0.1, 1.0, 10.0],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [500, 750, 1000]
}

ada_params = {
    'n_estimators': [30, 50, 100, 150],
    'learning_rate': [0.01, 0.1, 1.0, 10.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

gp_params = {
    'n_restarts_optimizer': [0, 1, 2],
    'max_iter_predict': [100, 300, 500]
}

qda_params = {
    'reg_param': [0.0, 0.1, 0.5, 1.0],
    'tol': [1e-4, 1e-3, 1e-2]
}

# Perform hyperparameter tuning using GridSearchCV
params = [logreg_params, dtc_params, rfc_params, mlp_params, ada_params, gp_params, qda_params]
best_models = []
best_scores = []

for i, model in enumerate(models):
    best_ = hyperparameter_tuning(model, params[i])
    best_models.append(best_.best_estimator_)
    best_scores.append(best_.best_score_)

Fitting 4 folds for each of 160 candidates, totalling 640 fits
Best Model: LogisticRegression(C=0.1)
Best Score: 0.7981988553473236
Fitting 4 folds for each of 1200 candidates, totalling 4800 fits
Best Model: DecisionTreeClassifier(criterion='entropy', max_depth=15, max_features='sqrt',
                       min_samples_leaf=3, min_samples_split=20)
Best Score: 0.8057036247334756
Fitting 4 folds for each of 1600 candidates, totalling 6400 fits
Best Model: RandomForestClassifier(bootstrap=False, max_depth=5, min_samples_leaf=5,
                       min_samples_split=15, n_estimators=50)
Best Score: 0.8131663113006397
Fitting 4 folds for each of 729 candidates, totalling 2916 fits
Best Model: MLPClassifier(alpha=1.0, hidden_layer_sizes=(50, 50),
              learning_rate='invscaling', max_iter=750, solver='lbfgs')
Best Score: 0.8206149702614745
Fitting 4 folds for each of 32 candidates, totalling 128 fits
Best Model: AdaBoostClassifier(algorithm='SAMME', n_estimators=100)
Best Score

## Ensembling

Using multiple different models and ensembling their predictions.
You can learn more about ensembling [here](https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier).

In [23]:
from sklearn.ensemble import VotingClassifier

# Creating an ensemble of all base models
eclf = VotingClassifier(
     estimators=[(name, clf) for name, clf in zip(names, models)], voting='soft')
eclf.fit(X_train, y_train)
eclf_cv = cross_val_score(eclf, X, y, cv=5)

# Creating an ensemble of best models
best_eclf = VotingClassifier(
     estimators=[(name, clf) for name, clf in zip(names, best_models)], voting='soft')
best_eclf.fit(X_train, y_train)
best_eclf_cv = cross_val_score(best_eclf, X, y, cv=5)

# Checking the accuracy of the ensembles
print("{} Test Accuracy {}".format('Ensemble', eclf.score(X_valid, y_valid)))
print("Cross Validation : %0.2f (+/- %0.2f) [%s]\n" % (
    eclf_cv.mean(), eclf_cv.std(), 'Ensemble'))

print("{} Test Accuracy {}".format('Best Ensemble', best_eclf.score(X_valid, y_valid)))
print("Cross Validation : %0.2f (+/- %0.2f) [%s]\n" % (
    best_eclf_cv.mean(), best_eclf_cv.std(), 'Best Ensemble'))

Ensemble Test Accuracy 0.8324022346368715
Cross Validation : 0.81 (+/- 0.03) [Ensemble]

Best Ensemble Test Accuracy 0.8156424581005587
Cross Validation : 0.81 (+/- 0.04) [Best Ensemble]



## Finalizing the best model

In [26]:
# Performing cross-validation on all best models
for clf, label in zip(best_models, names):
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.80 (+/- 0.03) [Logistic Regression]
Accuracy: 0.79 (+/- 0.02) [Decision Tree]
Accuracy: 0.82 (+/- 0.04) [Random Forest]
Accuracy: 0.81 (+/- 0.03) [Neural Net]
Accuracy: 0.80 (+/- 0.04) [AdaBoost]
Accuracy: 0.81 (+/- 0.03) [Gaussian Process]
Accuracy: 0.80 (+/- 0.03) [QDA]


In [29]:
final_model = best_models[best_scores.index(max(best_scores))]
print("Best Model: {}".format(names[best_scores.index(max(best_scores))]))

Best Model: Neural Net


In [30]:
final_model.fit(X_train, y_train)
print("Best Model Accuracy {}".format(final_model.score(X_valid, y_valid)))

Best Model Accuracy 0.8715083798882681


## Creating Submission File

In [31]:
predictions = final_model.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


## Congratulations!