In [89]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

import warnings
# Install a blanket "ignore" filter: every warning category is silenced from here on.
# NOTE(review): this also hides library notices (e.g. deprecation or solver
# convergence warnings) that later cells may raise — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, file_name) for file_name in file_names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [36]:
# Read the Titanic training split and preview its first five rows.
train_csv_path = '/kaggle/input/titanic/train.csv'
data_train = pd.read_csv(train_csv_path)
data_train.head(n=5)

In [37]:
# Read the Titanic test split and preview its first five rows.
# NOTE(review): none of the cells visible in this notebook use data_test again.
test_csv_path = '/kaggle/input/titanic/test.csv'
data_test = pd.read_csv(test_csv_path)
data_test.head(n=5)

In [38]:
# (rows, columns) of the training frame as loaded, before any cleaning.
data_train.shape

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data_train.describe()

This is the description of our dataset, and it looks fine. Now let's find the null values present in it.

In [42]:
# Number of missing values per column, before imputation.
data_train.isna().sum()

Null values were found in the Age, Cabin, and Embarked columns.

In [43]:
# Impute Age with its median and Embarked with its most frequent value.
age_median = data_train['Age'].median()
most_frequent_port = data_train['Embarked'].mode()[0]
data_train['Age'] = data_train['Age'].fillna(age_median)
data_train['Embarked'] = data_train['Embarked'].fillna(most_frequent_port)

In [44]:
# Re-count missing values per column to confirm the Age/Embarked imputation.
data_train.isna().sum()

The missing values in Age and Embarked were filled using fillna() with the median() and the mode(), respectively. The missing values in Cabin are not filled because the Cabin column is going to be dropped.

In [45]:
# Remove the columns that are not used as features (Cabin's missing values
# were deliberately left unfilled because the column is dropped here).
# Note: re-running this cell raises KeyError because the columns are already gone.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
data_train = data_train.drop(columns=unused_columns)
data_train.head()

Dropped all the unwanted columns.

In [46]:
# (rows, columns) of the training frame after dropping the unused columns.
data_train.shape

Now, the shape of the dataset is (891,8)

In [47]:
# Column dtypes and non-null counts; used here to spot remaining object columns.
data_train.info()

There is still a problem in our dataset: it contains object (string) columns, which the models cannot handle because they accept only int/float values.

In [54]:
# Use one encoder per column so each fitted mapping (classes_) stays available,
# e.g. for inverse_transform or for encoding another frame the same way.
# Refitting a single encoder on the second column would discard the first mapping.
sex_encoder = LabelEncoder()
embarked_encoder = LabelEncoder()
sex = sex_encoder.fit_transform(data_train['Sex'])
embarked = embarked_encoder.fit_transform(data_train['Embarked'])

# Backward-compatible alias: the single shared encoder `lab` used to be fit on
# Embarked last, so it ended up holding the Embarked mapping.
lab = embarked_encoder

In [55]:
# Replace the text columns with their integer codes (column order is unchanged).
data_train = data_train.assign(Sex=sex, Embarked=embarked)

In [56]:
# Preview the first rows to confirm Sex and Embarked now hold integer codes.
data_train.head()

* Encoded all the object data to integer data using LabelEncoder()


Let's check it again.

In [57]:
# Re-check dtypes: no object columns should remain after label encoding.
data_train.info()

Encoded all the object values.

# Now our dataset is clean and ready for further processing

Checking the balance of the dataset based on our label i.e. Survived

In [58]:
# Bar chart of how many rows fall in each Survived class.
fig, ax = plt.subplots()
sns.countplot(data=data_train, x='Survived', ax=ax)
plt.show()

It is clear that the dataset is reasonably balanced with respect to the label.

Now, checking for multicollinearity using a heatmap

In [59]:
# Pairwise Pearson correlation between all columns (pandas' default method).
data_corr = data_train.corr(method='pearson')

In [62]:
# Annotated heatmap of the correlation matrix on a 15x10 inch figure.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(data=data_corr, annot=True, ax=ax)
plt.show()

There is no multicollinearity between the columns

* Separating the label and features

In [71]:
# Split the frame into the target column and the remaining feature columns.
# (x_label holds the target, despite its "x_" prefix.)
target_column = 'Survived'
x_label = data_train[target_column]
x_feature = data_train.drop(columns=[target_column])

Now, using the StandardScaler() on the Features

In [73]:
# Standardize every feature column to zero mean and unit variance and show the result.
# NOTE(review): the scaler here is fit on all rows, including those that later
# end up in the held-out split.
scaler = StandardScaler()
scaler.fit(x_feature)
x_scaled = scaler.transform(x_feature)
x_scaled

# - Training the model

In [75]:
# Hold out 25% of the rows for evaluation (fixed seed for a repeatable split).
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_feature, x_label, test_size=0.25, random_state=9
)

# Standardize AFTER splitting: the scaler learns mean/std from the training rows
# only and is then applied to both splits, so the held-out rows never leak into
# the preprocessing. Previously the raw, unscaled features were split and the
# scaled array was never passed to any model.
split_scaler = StandardScaler().fit(x_train_raw)
x_train = split_scaler.transform(x_train_raw)
x_test = split_scaler.transform(x_test_raw)

* Selecting the best Model for our dataset

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,roc_auc_score

# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep the old name working on every version by falling back to the
# replacement API, RocCurveDisplay.from_estimator.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay

    def plot_roc_curve(estimator, X, y, **kwargs):
        """Compatibility shim for the removed sklearn.metrics.plot_roc_curve.

        Forwards to RocCurveDisplay.from_estimator and returns the resulting
        display object, so callers can keep using its ``ax_`` attribute and
        the ``ax=`` keyword exactly as before.
        """
        return RocCurveDisplay.from_estimator(estimator, X, y, **kwargs)

In [77]:
# Candidate classifiers.
# - The tree-based models are randomized, so they get a fixed seed (the same
#   value used for the train/test split) to make scores repeatable across runs.
# - LogisticRegression gets a larger iteration budget: warnings are silenced
#   globally in this notebook, so a non-converged solver would go unnoticed.
lr = LogisticRegression(max_iter=1000)
kn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=9)
rf = RandomForestClassifier(random_state=9)

*Defining all the models*

In [78]:
# Train every candidate classifier on the training split, in the same order as before.
for candidate in (lr, kn, dt, rf):
    candidate.fit(x_train, y_train)

print("ALL MODELS ARE TRAINED")

In [87]:
# Report each model's mean accuracy on the training split.
train_score_rows = (
    ('Logistic Regression Score : ', lr),
    ('KNearest Neighbor Score : ', kn),
    ('Decision Tree Score : ', dt),
    ('Random Forest Score : ', rf),
)
for caption, fitted_model in train_score_rows:
    print(caption, fitted_model.score(x_train, y_train))

***These are the scores of each model on the training data***

In [82]:
# ROC curves on the training split: the first call creates the axes,
# the remaining models are drawn onto the same axes for comparison.
disp = plot_roc_curve(dt, x_train, y_train)
shared_axes = disp.ax_
for other_model in (kn, lr, rf):
    plot_roc_curve(other_model, x_train, y_train, ax=shared_axes)
plt.legend(loc='lower right', prop={'size': 10})
plt.show()

*These are the ROC curves of each model on the training data; Decision Tree and Random Forest give the best scores on the training data.*

In [88]:
# Report each model's mean accuracy on the held-out split.
test_score_rows = (
    ('Logistic Regression Score : ', lr),
    ('KNearest Neighbor Score : ', kn),
    ('Decision Tree Score : ', dt),
    ('Random Forest Score : ', rf),
)
for caption, fitted_model in test_score_rows:
    print(caption, fitted_model.score(x_test, y_test))

***These are the scores of each model on the test data***

In [84]:
# ROC curves on the held-out split: the first call creates the axes,
# the remaining models are drawn onto the same axes for comparison.
disp = plot_roc_curve(dt, x_test, y_test)
shared_axes = disp.ax_
for other_model in (kn, lr, rf):
    plot_roc_curve(other_model, x_test, y_test, ax=shared_axes)
plt.legend(loc='lower right', prop={'size': 10})
plt.show()

*On the test data, it is clear that Random Forest and Logistic Regression have the top two scores. The gap between the Decision Tree's training and test scores is also clearly visible.*

**- Model Instantiating & Training**

In [92]:
# Build the final model as a fresh, seeded Random Forest and train it on the
# training split. Instantiating it here (instead of refitting the forest from
# the comparison cells) makes this section self-contained, and the fixed seed
# makes the predictions and the confusion matrix reproducible.
rf = RandomForestClassifier(random_state=9)
rf.fit(x_train, y_train)

The Random Forest Classifier is trained on the training split.

In [91]:
# Predicted Survived class for every row of the held-out split.
y_pred = rf.predict(X=x_test)
y_pred

Prediction using Random Forest Classifier

* Now, Creating a confusion matrix

In [93]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cfm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cfm

1. False Positive : 18
2. False Negative : 29

In [94]:
# Per-class precision / recall / F1 plus overall accuracy, to two decimals.
report_text = classification_report(y_test, y_pred, digits=2)
print(report_text)

**So, this is the classification report, and according to it the accuracy of our model is 79%.**