In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Exploring Dataset

In [2]:
# Titanic passenger CSV from the Data Science Dojo datasets repository.
url = (
    'https://raw.githubusercontent.com/'
    'datasciencedojo/datasets/master/titanic.csv'
)

# Read the remote file into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.shape  # (number of rows, number of columns) of the loaded dataset

(891, 12)

In [4]:
df.isnull().sum()  # Number of missing values in each column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Cleaning

In [5]:
# Age: fill the gaps with the median, which is less sensitive to outliers than the mean.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Embarked: fill the gaps with the most frequent value
# (mode() returns a Series, so [0] picks its first entry).
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Cabin has too many missing values to be useful; Ticket, Name and PassengerId
# are dropped as unimportant for this analysis.
# errors='ignore' skips columns that are already gone, so re-running this cell
# no longer raises a KeyError.
df = df.drop(columns=['Cabin','Ticket','Name','PassengerId'], errors='ignore')

print("Missing values after taking care of them..")
print(df.isnull().sum())

Missing values after taking care of them..
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [6]:
# Show the first rows of the frame as it stands at this point in the notebook.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### Encoding
Converting text into numbers so that our model can understand it.
There are 2 types of encoding:
1) Label Encoding: converts categories to numbers (good for binary features). If there are only 2 categories it acts like a switch (0/1); if there are more than 2 categories it imposes an ordering on them.
2) One-Hot Encoding: creates a new column for each category. Good for categories with no natural order.

In [7]:
# Sex -> label encoding (binary feature, mapping chosen by us).
# The dtype guard keeps the cell safe to re-run: pushing an already-encoded
# 0/1 column through this dict would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'male' : 0, 'female' : 1})

# Embarked -> one-hot encoding, since it has more than 2 categories with no order.
# get_dummies creates one column per category; drop_first=True drops the first
# one because n-1 columns are enough to identify n categories, and dtype=int
# emits 0/1 integers directly instead of casting each dummy column afterwards.
# The membership guard skips the step when the column was already encoded
# (a second get_dummies call would raise a KeyError).
if 'Embarked' in df.columns:
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True, dtype=int)

print("Data after encoding...")
print(df.head().to_string())

Data after encoding...
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked_Q  Embarked_S
0         0       3    0  22.0      1      0   7.2500           0           1
1         1       1    1  38.0      1      0  71.2833           0           0
2         1       3    1  26.0      0      0   7.9250           0           1
3         1       1    1  35.0      1      0  53.1000           0           1
4         0       3    0  35.0      0      0   8.0500           0           1


In [8]:
# Correlation of every column with the target, strongest positive first.
print("Correlation check...")
print(
    df.corr()
      .loc[:, 'Survived']
      .sort_values(ascending=False)
)

Correlation check...
Survived      1.000000
Sex           0.543351
Fare          0.257307
Parch         0.081629
Embarked_Q    0.003650
SibSp        -0.035322
Age          -0.064910
Embarked_S   -0.149683
Pclass       -0.338481
Name: Survived, dtype: float64


The correlation between Survived and Sex is 0.54, which means that as the encoded value goes up (0 = male, 1 = female) the chance of survival increases. This makes sense, as women and children were given priority.

## Splitting

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Separate the input features from the target column.
X = df.drop(columns='Survived')
Y = df['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator (scikit-learn convention), so build and train in one step.
model = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)
print("Model Training completed (Decision Tree)")

# Score the tree on the held-out rows.
prediction = model.predict(X_test)
print(f"Accuracy score: {accuracy_score(Y_test,prediction)*100:.2f}%")

Model Training completed (Decision Tree)
Accuracy score: 78.77%


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree forest on the train split created in the previous cell;
# fit() returns the estimator (scikit-learn convention), so build and train in one step.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, Y_train)

# Predict on the held-out rows and report accuracy with the same metric as the tree.
rf_pred = rf_model.predict(X_test)

print("Model Training Completed (Random Forest)")
print(f"Random_forest accuracy: {accuracy_score(Y_test,rf_pred)*100:.2f}%")

Model Training Completed (Random Forest)
Random_forest accuracy: 79.89%
