In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Data collection

In [None]:
# Load the Titanic training set; `df` is the working frame that the cleaning
# cells below modify step by step.
df = pd.read_csv('../input/titanic/train.csv')
df.head()

# 2. EDA and Data manipulation

In [None]:
# Column dtypes and non-null counts, to spot columns with missing data.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
def get_missingval_percent(df):
    """Print, for every column of ``df``, the percentage of rows that are missing.

    One line is printed per column, in column order. Nothing is returned.
    """
    total_rows = len(df)
    for column in df.columns:
        missing_share = (df[column].isna().sum() / total_rows) * 100
        print('Missing values percentage of column', column, ': ', missing_share)
        
# Report the missing-value percentage of every column of the freshly loaded frame.
get_missingval_percent(df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 

# Wide figure for a horizontal box plot of ticket fares (outlier check).
plt.figure(figsize = (15,5))
# Pass Fare explicitly as the x variable. A bare positional argument triggers a
# FutureWarning on older seaborn releases and is interpreted as `data` (not `x`)
# by newer ones, so the keyword keeps the plot the same across versions.
sns.boxplot(x=df['Fare'])

In [None]:
# Age distribution before the missing values are filled in.
df['Age'].hist()

In [None]:
# Count of missing Age values before imputation.
df['Age'].isna().sum()

In [None]:
# Fill missing ages with the median age.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on `df['Age']`: that form is chained assignment on an intermediate object,
# which warns on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
# Should now be 0.
df['Age'].isna().sum()

In [None]:
# Age distribution after median imputation, for comparison with the earlier histogram.
df['Age'].hist()

In [None]:
# Remove the Cabin column from the working frame, then list what is left.
df.drop(columns=['Cabin'], inplace=True)
df.columns

In [None]:
# Number of rows with no Embarked value.
print(df['Embarked'].isna().sum())

In [None]:
# Frequency of each Embarked value.
df['Embarked'].hist()

In [None]:
# Fill missing Embarked values with the most frequent value.
# Assign the result back to the column instead of calling fillna(inplace=True)
# on `df['Embarked']`: that form is chained assignment on an intermediate
# object, which warns on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
print('Null values: ', df['Embarked'].isna().sum())


In [None]:
# Mean of Survived for each Embarked x Sex combination
# (pivot_table aggregates with the mean by default).
df.pivot_table(values='Survived', index='Embarked', columns=['Sex'])

In [None]:
# The Embarked x Sex table of mean Survived, drawn as a grouped bar chart.
df.pivot_table(values='Survived', index='Embarked', columns=['Sex']).plot(kind='bar')

In [None]:
# Re-inspect dtypes and non-null counts after handling Age, Cabin and Embarked.
df.info()

In [None]:
# How often each Ticket value occurs.
df['Ticket'].value_counts()

In [None]:
# One indicator column per distinct value of Sex.
dummy = pd.get_dummies(df['Sex'])
dummy

In [None]:
# Swap the original Sex column for its indicator columns.
df.drop(columns='Sex', inplace=True)
df = pd.concat((df, dummy), axis=1)
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the Embarked strings with integer codes.
# NOTE(review): integer codes give this nominal column an arbitrary numeric
# order, which the linear model fitted later will treat as a magnitude —
# consider one-hot encoding as was done for Sex; confirm intent.
le = LabelEncoder()

df['Embarked'] = le.fit_transform(df['Embarked'])
df.head()

In [None]:
# Confirm the column dtypes after encoding Sex and Embarked.
df.info()

In [None]:
# Drop columns that should not be model inputs. Name and Ticket are free-text,
# and PassengerId is only a row identifier: leaving it in would make it a
# feature, because X is built below as "everything except Survived".
df.drop(['PassengerId', 'Name', 'Ticket'], inplace = True, axis=1)
df.head()

In [None]:
# Target is the Survived column; every remaining column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

# 3. Train-Test-Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the Survived class
# ratio the same in both parts; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4243, stratify=y)

In [None]:
from collections import Counter

# Tally the labels once per split, then show the raw counts and the share of
# label 0 in each split (the two shares should match because of stratification).
train_counts = Counter(y_train)
test_counts = Counter(y_test)

print(train_counts)
print(test_counts)

print('Distribution of survival values in train data - ', train_counts[0] / len(y_train))
print('Distribution of survival values in test data - ', test_counts[0] / len(y_test))

# 4. ML modeling and metrics

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline linear classifier.
# max_iter is raised from the default of 100: the features are not scaled, so
# the default solver may stop with a ConvergenceWarning before it has converged.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# Accuracy on the training split, then on the held-out split.
print('Training Logistic',lr.score(X_train, y_train))
predict = lr.predict(X_test)
# accuracy_score expects (y_true, y_pred); the value is unchanged, but the
# order now matches the API and stays correct if the metric is later swapped
# for an asymmetric one.
print('Testing Logistic',accuracy_score(y_test, predict))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with depth-limited trees.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature subsets, and therefore the reported scores, are
# reproducible across runs.
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=7,
    min_weight_fraction_leaf=0.00001,
    random_state=4243,
)
rf.fit(X_train, y_train)

In [None]:
# Accuracy on the training split, then on the held-out split.
print('Training RandomForest',rf.score(X_train, y_train))
predict = rf.predict(X_test)
# accuracy_score expects (y_true, y_pred); the value is unchanged, but the
# order now matches the API and stays correct if the metric is later swapped
# for an asymmetric one.
print('Testing RandomForest',accuracy_score(y_test, predict))

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the binary logistic objective, fitted on the training split.
xgb = XGBClassifier(objective='binary:logistic')
xgb.fit(X_train, y_train)

In [None]:
# Accuracy on the training split, then on the held-out split.
print('Training XGBOOST',xgb.score(X_train, y_train))
predict = xgb.predict(X_test)
# accuracy_score expects (y_true, y_pred); the value is unchanged, but the
# order now matches the API and stays correct if the metric is later swapped
# for an asymmetric one.
print('Testing XGBOOST',accuracy_score(y_test, predict))

# 5. Validation

In [None]:
from sklearn.model_selection import cross_validate, StratifiedKFold

# 10-fold cross-validation of the logistic regression on the full data set.
# Use classification scorers: 'r2' and 'neg_mean_squared_error' are regression
# metrics and are not meaningful for 0/1 class predictions.
score = cross_validate(lr, X, y, cv=10, scoring=('accuracy', 'f1', 'roc_auc'))

In [None]:
# Show the dict returned by cross_validate (timings plus per-fold test scores).
score

In [None]:
# 10-fold stratified splitter used by the manual cross-validation loop below.
skf = StratifiedKFold(n_splits=10)

In [None]:
# Per-fold score accumulators, one independent list per model.
lr_score, rf_score, xgb_score = [], [], []

def get_score(train, test, fold_no, model):
    """Fit a fresh copy of ``model`` on one fold and return its held-out score.

    Parameters
    ----------
    train, test : DataFrame
        The fold's partitions. Each must contain the ``Survived`` target
        column; every other column is used as a feature.
    fold_no : int
        Accepted for compatibility with existing callers; not used.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``score(X, y)``.

    Returns
    -------
    Whatever ``score`` returns on ``test`` after fitting on ``train``.
    """
    from copy import deepcopy

    # Fit a copy so the caller's estimator is left untouched. Fitting the
    # passed object directly would silently overwrite a model that was
    # already trained elsewhere (e.g. on the train/test split) with one
    # trained on this fold.
    fold_model = deepcopy(model)

    x_train = train.drop(['Survived'], axis=1)
    y_train = train.Survived
    x_test = test.drop(['Survived'], axis=1)
    y_test = test.Survived

    fold_model.fit(x_train, y_train)
    return fold_model.score(x_test, y_test)
  

# Manual stratified 10-fold evaluation: for each fold, score the three models
# in turn (logistic regression, random forest, XGBoost) and record the result.
fold_no = 1
for train_index, test_index in skf.split(X, y):
    train, test = df.iloc[train_index, :], df.iloc[test_index, :]
    for scores, estimator in ((lr_score, lr), (rf_score, rf), (xgb_score, xgb)):
        scores.append(get_score(train, test, fold_no, estimator))
    fold_no += 1

In [None]:
# Per-fold scores of the logistic regression from the manual CV loop.
lr_score

In [None]:
# Per-fold scores of the random forest from the manual CV loop.
rf_score

In [None]:
# Per-fold scores of the XGBoost model from the manual CV loop.
xgb_score

# 6. Bonus

In [None]:
! pip install data-purifier

In [None]:
import datapurifier as dp
from datapurifier import Mleda, MlReport

# Reload the raw training data for the data-purifier demo. This rebinds `df`,
# replacing the cleaned and encoded frame built earlier, so earlier cells that
# expect the processed frame will behave differently if re-run after this point.
df = pd.read_csv('../input/titanic/train.csv')
df.head()

In [None]:
# Build data-purifier's Mleda object on the raw frame.
# NOTE(review): presumably this renders the library's EDA output — confirm against its docs.
ed = Mleda(df)

In [None]:
# Build data-purifier's MlReport object on the raw frame.
# NOTE(review): the name `re` would shadow the standard-library `re` module if
# that module is ever imported in this notebook.
re = MlReport(df)