In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')

# Read the Titanic train and test splits from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)

# Show the first training rows.
train_data.head()

In [None]:
# Display summary statistics of the training data.
train_data.describe()

In [None]:
# Print column/dtype/non-null summaries for both splits, separated by a dashed rule.
train_data.info()
print('\n' + '-' * 40, end='\n\n')
test_data.info()

In [None]:
# Columns removed from both splits before any further processing.
drop_cols = ['Name', 'Ticket']
train_data, test_data = (
    frame.drop(columns=drop_cols) for frame in (train_data, test_data)
)

In [None]:
# Frequency of each embarkation port in the training split, largest first.
embarked_counts = train_data['Embarked'].value_counts()
print(embarked_counts.sort_values(ascending=False))

In [None]:
# Replace missing embarkation ports in the training split with 'S'.
train_data.loc[train_data['Embarked'].isna(), 'Embarked'] = 'S'

In [None]:
# Embarked: visualise survival by port of embarkation, then drop the column.
sns.set_palette("Set2")

# `catplot` is the current name of the old `factorplot`; kind='point' keeps the
# point plot that `factorplot` drew by default.
sns.catplot(x='Embarked', y='Survived', data=train_data, kind='point', height=3, aspect=4)

figure, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# Passenger count per port, and survival outcome split by port.
sns.countplot(ax=ax1, data=train_data, x='Embarked')
sns.countplot(ax=ax2, data=train_data, x='Survived', hue='Embarked')

# Mean survival rate per port.
embark_mean = train_data[['Survived', 'Embarked']].groupby('Embarked', as_index=False).mean()
sns.barplot(ax=ax3, y='Survived', x='Embarked', data=embark_mean, order=['S', 'C', 'Q'])

# Embarked is not kept as a model feature: remove it from both splits.
train_data = train_data.drop(columns=['Embarked'])
test_data = test_data.drop(columns=['Embarked'])

In [None]:

# Fare
# Fill any missing test fares with the test median. The result is assigned back
# to the column: calling fillna(inplace=True) on a column selection may act on
# a temporary copy and leave the frame unchanged.
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())

# Truncate fares to whole numbers in both splits.
train_data['Fare'] = train_data['Fare'].astype(int)
test_data['Fare'] = test_data['Fare'].astype(int)

# Distribution of training fares, zoomed to the 0-50 range.
train_data['Fare'].plot(kind='hist', bins=100, xlim=(0, 50), figsize=(18, 4))

In [None]:
# Share of the total training fare paid by survivors (1), then by non-survivors (0).
total_fare = train_data['Fare'].sum()
for survived_flag in (1, 0):
    fare_of_group = train_data['Fare'][train_data['Survived'] == survived_flag].sum()
    print(round(fare_of_group / total_fare, 3))

In [None]:
# Age

# Number of missing ages plus mean / median / standard deviation, per split.
count_null_value_train = train_data['Age'].isnull().sum()
train_mean_age, train_median_age, train_std_age = (
    train_data['Age'].mean(),
    train_data['Age'].median(),
    train_data['Age'].std(),
)

count_null_value_test = test_data['Age'].isnull().sum()
test_mean_age, test_median_age, test_std_age = (
    test_data['Age'].mean(),
    test_data['Age'].median(),
    test_data['Age'].std(),
)

In [None]:
# Draw one random integer age per missing value from [mean - std, mean + std)
# (upper bound exclusive). A seeded generator makes the imputation, and hence
# every downstream result, reproducible; the float bounds are truncated to
# integers explicitly rather than relying on implicit conversion.
age_rng = np.random.default_rng(42)
rand_for_train = age_rng.integers(
    int(train_mean_age - train_std_age),
    int(train_mean_age + train_std_age),
    size=count_null_value_train,
)
rand_for_test = age_rng.integers(
    int(test_mean_age - test_std_age),
    int(test_mean_age + test_std_age),
    size=count_null_value_test,
)

In [None]:
# Compare the Age distribution before and after imputing missing values.
figure, (axis1, axis2) = plt.subplots(1, 2, figsize=(24, 6))
axis1.set_title('Original Age data')
axis2.set_title('Updated Age data')

# Histogram of the known ages only.
train_data['Age'].dropna().astype(int).hist(ax=axis1, bins=70)

# Write the pre-drawn random ages into the missing slots. A single .loc call
# is used because chained indexing (df['Age'][mask] = ...) may assign into a
# temporary copy and leave the frame unchanged.
train_data.loc[train_data['Age'].isna(), 'Age'] = rand_for_train
test_data.loc[test_data['Age'].isna(), 'Age'] = rand_for_test

# With no missing values left, Age can be stored as integers.
train_data['Age'] = train_data['Age'].astype(int)
test_data['Age'] = test_data['Age'].astype(int)

# Histogram after imputation.
train_data['Age'].hist(bins=70, ax=axis2)

In [None]:
# Density of Age, one curve per Survived value, on a shared axis from 0 to the oldest age.
facet = sns.FacetGrid(train_data, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age', shade=True)
oldest_age = train_data['Age'].max()
facet.set(xlim=(0, oldest_age))
facet.add_legend()

# Mean survival rate for each distinct age.
fig, ax = plt.subplots(1, 1, figsize=(18, 4))
average_age = train_data.groupby('Age', as_index=False)['Survived'].mean()
sns.barplot(x='Age', y='Survived', data=average_age, ax=ax)

In [None]:
# Cabin

# Remove the Cabin column from both splits.
train_data = train_data.drop(columns='Cabin')
test_data = test_data.drop(columns='Cabin')

In [None]:
# Family

# Binary flag: 1 when SibSp + Parch is positive, otherwise 0. It is computed
# in one vectorized step because chained assignment
# (df['Family'].loc[mask] = ...) may modify a temporary copy instead of the frame.
train_data['Family'] = ((train_data['SibSp'] + train_data['Parch']) > 0).astype(int)
test_data['Family'] = ((test_data['SibSp'] + test_data['Parch']) > 0).astype(int)

# The two source columns are replaced by the flag.
train_data = train_data.drop(columns=['SibSp', 'Parch'])
test_data = test_data.drop(columns=['SibSp', 'Parch'])


In [None]:
# Plotting
sns.set_palette('Paired')
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(15, 6))

# Panel titles, left to right.
panel_titles = (
    'Survived mean with/without family',
    'Count of people with/without family',
    'Count of males and females with/without family',
)
for panel, panel_title in zip((axis1, axis2, axis3), panel_titles):
    panel.set_title(panel_title)

# Mean survival rate per Family flag.
average_family = train_data.groupby('Family', as_index=False)['Survived'].mean()

# Every panel lists "with family" (1) before "without family" (0).
sns.countplot(data=train_data, x='Family', ax=axis2, order=[1, 0])
sns.barplot(data=average_family, x='Family', y='Survived', order=[1, 0], ax=axis1)
sns.countplot(data=train_data, x='Family', ax=axis3, order=[1, 0], hue='Sex')

In [None]:
# Pclass

# Point plot of survival rate per passenger class. `catplot(kind='point')` is
# the current name of `factorplot`, and `height` replaces its old `size` argument.
sns.catplot(data=train_data, y='Survived', x='Pclass', kind='point', height=7, order=[1, 2, 3], color='lightgreen')

In [None]:
# Sex

def get_person(passenger):
    """Classify a passenger as 'child' or by their sex.

    Args:
        passenger: a two-item sequence ``(age, sex)``, e.g. one row of the
            ``['Age', 'Sex']`` columns.

    Returns:
        ``'child'`` when ``age < 16``; otherwise ``sex`` unchanged.
    """
    age, sex = passenger
    if age < 16:
        return 'child'
    return sex

# Replace Sex with a Person category produced by get_person.
train_data['Person'] = train_data[['Age', 'Sex']].apply(get_person, axis=1)
test_data['Person'] = test_data[['Age', 'Sex']].apply(get_person, axis=1)

train_data = train_data.drop(columns=['Sex'])
test_data = test_data.drop(columns=['Sex'])

# One-hot encode Person. Columns are renamed by label (not by position) and
# then reindexed to the kept columns, so both splits get identical columns even
# if a category is absent from one of them. 'Male' is left out as the
# reference level. dtype=int gives 0/1 integers regardless of pandas version.
person_labels = {'child': 'Child', 'female': 'Female', 'male': 'Male'}
person_kept_columns = ['Child', 'Female']

person_dummies_train = (
    pd.get_dummies(train_data['Person'], dtype=int)
    .rename(columns=person_labels)
    .reindex(columns=person_kept_columns, fill_value=0)
)
person_dummies_test = (
    pd.get_dummies(test_data['Person'], dtype=int)
    .rename(columns=person_labels)
    .reindex(columns=person_kept_columns, fill_value=0)
)

train_data = train_data.join(person_dummies_train)
test_data = test_data.join(person_dummies_test)

# Count of each Person category, and mean survival rate per category.
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(12, 6))
axis1.set_title('Count of Female/Male/Children')
sns.countplot(ax=axis1, x='Person', data=train_data)
perc_person = train_data[['Survived', 'Person']].groupby('Person', as_index=False).mean()
sns.barplot(x='Person', y='Survived', data=perc_person, ax=axis2, order=['male', 'female', 'child'])

# Person was only needed for the dummies and the plots.
train_data = train_data.drop(columns=['Person'])
test_data = test_data.drop(columns=['Person'])

In [None]:
# One-hot encode Pclass. Columns are renamed by class value (not by position)
# and reindexed to the kept columns, so both splits always get the same columns.
# 'Class_3' is left out as the reference level. dtype=int gives 0/1 integers
# regardless of pandas version.
pclass_labels = {1: 'Class_1', 2: 'Class_2', 3: 'Class_3'}
pclass_kept_columns = ['Class_1', 'Class_2']

pclass_dummies_train = (
    pd.get_dummies(train_data['Pclass'], dtype=int)
    .rename(columns=pclass_labels)
    .reindex(columns=pclass_kept_columns, fill_value=0)
)
pclass_dummies_test = (
    pd.get_dummies(test_data['Pclass'], dtype=int)
    .rename(columns=pclass_labels)
    .reindex(columns=pclass_kept_columns, fill_value=0)
)

# Swap the original column for its dummies.
train_data = train_data.drop(columns='Pclass').join(pclass_dummies_train)
test_data = test_data.drop(columns='Pclass').join(pclass_dummies_test)

train_data.head()

In [None]:
# Splitting data

# Target column, training features (everything except the target) and test
# features (everything except the passenger identifier).
y_train = train_data['Survived']
X_train = train_data.drop(columns='Survived')
X_test = test_data.drop(columns='PassengerId').copy()


In [None]:
# Candidate hyperparameter values for a randomized search over a random forest.
# The search itself is kept commented out below; only the candidate lists are built.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 1000, num = 10)]
# Number of features to consider at every split
# NOTE(review): recent scikit-learn releases appear to reject 'auto' here -
# confirm against the installed version before re-enabling the search.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree (None = no limit).
# This definition was commented out while the append below was not, which
# raised NameError on a fresh kernel.
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8, 12]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
# rf = RandomForestClassifier()

# # Random search of parameters, using 5 fold cross validation,
# # search across 200 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=200, cv=5, verbose=2, random_state=42, n_jobs=-1)
# X_train.head()
# # Fit the random search model
# rf_random.fit(X_train, y_train)
# rf_random.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV

# Small grid for the random forest: only max_depth has more than one candidate.
# The values are the same ones the original np.linspace(..., num=1) calls produced.
params = {
    'n_estimators': [670],
    'min_samples_split': [3],
    'min_samples_leaf': [2],
    'max_features': ['sqrt'],
    'max_depth': [50, 100],
    'bootstrap': [False],
}

# Fixed seed so repeated runs select the same parameters.
rf1 = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf1, param_grid=params,
                           cv=4, n_jobs=-1, verbose=2)

# Search on the same features the final models are trained on. PassengerId is
# excluded because X_test does not contain it; errors='ignore' keeps the cell
# working if the column has already been removed from X_train.
grid_search.fit(X_train.drop(columns=['PassengerId'], errors='ignore'), y_train)

# Parameters used for the tuned forest further down (recorded by hand).
best_params = {'bootstrap': False,
               'max_depth': 100,
               'max_features': 'sqrt',
               'min_samples_leaf': 2,
               'min_samples_split': 3,
               'n_estimators': 670}

# Last expression of the cell so the search result is actually displayed.
grid_search.best_params_


In [None]:
# Remove the passenger identifier from the training features so they match the
# columns of X_test. errors='ignore' makes the cell safe to re-run: the original
# in-place drop raised KeyError the second time.
X_train = X_train.drop(columns=['PassengerId'], errors='ignore')

In [None]:
print(X_train.shape)
print(y_train.shape)

# Model label -> accuracy on the TRAINING data (score(X_train, y_train)).
# These numbers measure fit, not generalisation.
models_accuracy = {}

# Tuned Random Forest (seeded so results are reproducible)
random_forest_tuned = RandomForestClassifier(
    bootstrap=False,
    max_depth=100,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=3,
    n_estimators=670,
    random_state=42,
)

# Random Forest (seeded so results are reproducible)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Support Vector Machines
svc = SVC()

# K nearest neighbours
knn = KNeighborsClassifier(n_neighbors=3)

# Hard-voting ensemble of the two forests
vclf = VotingClassifier(estimators=[('rf', random_forest), ('rft', random_forest_tuned)], voting='hard')

# Fit, predict and score every model with one loop instead of five copy-pasted
# stanzas. Order matches the original cell, so y_pred ends up holding the
# voting classifier's test predictions.
models_to_fit = [
    ('random_forest_tuned', random_forest_tuned),
    ('random_forest', random_forest),
    ('support vector machines', svc),
    ('KNeighborsClassifier', knn),
    ('VotingClassifier', vclf),
]
for model_label, model in models_to_fit:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = model.score(X_train, y_train)
    models_accuracy[model_label] = score

models_accuracy

In [None]:
# Display the first rows of the test feature matrix.
X_test.head()

In [None]:
# Display summary statistics of the test feature matrix.
X_test.describe()

In [None]:
def predict_your_chances(age, fare, family, child, female, class_1, class_2):
    """Report the survival probability the fitted ``random_forest`` assigns to one passenger.

    The seven arguments are packed, in the order given, into a single-row
    feature array and passed to ``random_forest.predict_proba``.

    Returns:
        A string ``'Your chances of survival: <p>'`` where ``<p>`` is the
        second entry of the first row returned by ``predict_proba``.
    """
    feature_row = [age, fare, family, child, female, class_1, class_2]
    your_data = np.array([feature_row])
    class_probabilities = random_forest.predict_proba(your_data)
    survival_probability = class_probabilities[0][1]
    return f'Your chances of survival: {survival_probability}'

# Example query; keyword arguments make each feature value explicit.
print(predict_your_chances(age=19, fare=40, family=1, child=0, female=0, class_1=0, class_2=1))

This notebook was created to practice data science with the help of https://www.kaggle.com/omarelgabry/a-journey-through-titanic