In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# **Data Description**
##### **Survived** : Survival, 0 = No, 1 = Yes. This is the **target variable**.

##### **Pclass** : Ticket class , 1 = 1st, 2 = 2nd, 3 = 3rd.

##### **Sex** : Gender.

##### **Age** : Age in years.

##### **SibSp** : # of siblings / spouses aboard the Titanic.

##### **Parch** : # of parents / children aboard the Titanic.

##### **Ticket** : Ticket number.

##### **Fare** : Passenger fare.

##### **Cabin** : Cabin number.

##### **Embarked** : Port of Embarkation.

# **Importing the train and test data**.

In [5]:
# Read the Titanic train and test splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# **Exploring the dataset**.

In [6]:
# Preview the first ten rows of the training data.
train.head(n=10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
# Preview the first ten rows of the test data (it has no Survived column).
test.head(n=10)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [10]:
# train.info()  # disabled; per-column null counts come from train.isnull().sum() below

In [11]:
# test.info()  # disabled; per-column null counts come from test.isnull().sum() below

* We have 177 null values in Age, 687 in Cabin and 2 in Embarked.

In [12]:
# Count the missing values in each column of the training data.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

* We have 86 null values in Age, 327 in Cabin and 1 in Fare.

In [13]:
# Count the missing values in each column of the test data.
test.isna().sum()


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# **Exploratory Data Analysis**
* We try to apply a bit of EDA on some features to get a better understanding through amazing visuals.

#### The visual below shows the ages of male and female passengers, split by whether they survived.

In [None]:
# Grouped bar chart: Age on the y-axis against Survived, coloured by Sex.
# The dark template only needs to be applied once, together with the bar mode.
(
    px.bar(
        data_frame=train,
        x='Survived',
        y='Age',
        color='Sex',
        color_discrete_sequence=px.colors.diverging.Temps,
    )
    .update_layout(template="plotly_dark", barmode='group')
)

#### The visual below shows which passengers survived, along with their ages and the passenger class (Pclass) they belonged to.
* Most passengers in class 3 did not survive (a few survived, but many lost their lives; hover over the visual to see for yourself).
* Passengers in class 1 survived the most (a few lost their lives, but the majority survived; hover over the visual to see for yourself).

In [306]:
# Scatter of Age against Pclass; the colour scale encodes Survived.
(
    px.scatter(
        data_frame=train,
        x='Age',
        y='Pclass',
        color='Survived',
        color_continuous_scale=px.colors.diverging.Temps,
    )
    .update_layout(template="plotly_dark")
)

#### The visual below shows how much passengers of different ages paid for passenger classes (Pclass) 1, 2 and 3.

In [307]:
# Scatter of Age against Fare; the colour scale encodes Pclass.
(
    px.scatter(
        data_frame=train,
        x='Age',
        y='Fare',
        color='Pclass',
        color_continuous_scale=px.colors.sequential.BuGn,
    )
    .update_layout(template="plotly_dark")
)

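#### The visual below shows how the survivors are distributed across the number of siblings/spouses aboard (SibSp).
* Passengers with no siblings or spouses aboard make up the largest share of survivors. Note that the pie shows survivor counts per SibSp value, not survival rates.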

In [308]:
# Pie chart with one slice per SibSp value, sized by the sum of Survived.
(
    px.pie(
        data_frame=train,
        names='SibSp',
        values='Survived',
        color_discrete_sequence=px.colors.diverging.Temps,
    )
    .update_layout(template="plotly_dark")
)

#### The visual below helps with outlier detection; from it we can choose the best imputation technique to apply to Fare and Age (if need be).
* Age does not have many outliers, but Fare, on the other hand, is having a picnic with outliers -_-.

In [309]:
# Side-by-side box plots of Age and Fare for spotting outliers.
(
    px.box(
        data_frame=train,
        y=['Age', 'Fare'],
        color_discrete_sequence=px.colors.diverging.Temps,
    )
    .update_layout(template="plotly_dark")
)

#### Histogram tells us the distribution the data is following.
* The histogram also shows that Fare is clearly not Gaussian: it is right-skewed.
* Right-skewed means the long tail, and therefore the outliers, lies on the high-value (right) side.

In [310]:
# Histogram of Fare, coloured by Survived.
(
    px.histogram(
        data_frame=train,
        x='Fare',
        color='Survived',
        color_discrete_sequence=px.colors.diverging.Temps,
    )
    .update_layout(template="plotly_dark", bargap=0.1)
)

#### Histogram tells us the distribution the data is following.
* This histogram is representing that age is nearly following gaussian distribution.

In [311]:
# Histogram of Age, coloured by Survived and normalised to probabilities.
(
    px.histogram(
        data_frame=train,
        x='Age',
        color='Survived',
        color_discrete_sequence=px.colors.diverging.Temps,
        histnorm='probability',
    )
    .update_layout(template="plotly_dark", bargap=0.05)
)

## **Data Cleaning**
* In this section we will deal with imputation using statistical methods.

In [312]:
# Impute missing values in the training data.
# Each filled column is assigned back to `train` instead of calling
# `train.<col>.fillna(..., inplace=True)`: that form fills an intermediate
# Series (chained assignment), which pandas deprecates and which does not
# update `train` when Copy-on-Write is active.

# Age: replace nulls with the mean age.
train['Age'] = train['Age'].fillna(train['Age'].mean())

# Cabin: replace nulls with the most frequent cabin value.
train['Cabin'] = train['Cabin'].fillna(train['Cabin'].mode()[0])

# Embarked: replace nulls with the most frequent port of embarkation.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

* Applying everything on test dataset similar to train dataset.

In [323]:
# Impute missing values in the test data.
# The fill values are taken from the training data so that both splits are
# preprocessed with the same statistics. (Mean/mode imputation of `train` does
# not change its mean or mode, so this works whether or not `train` has already
# been imputed.)
# Columns are assigned back explicitly; `test.<col>.fillna(..., inplace=True)`
# is chained assignment and does not update `test` under Copy-on-Write.

# Age: mean age, as for the training data.
test['Age'] = test['Age'].fillna(train['Age'].mean())

# Cabin: most frequent cabin value, as for the training data.
test['Cabin'] = test['Cabin'].fillna(train['Cabin'].mode()[0])

# Fare: median rather than mean, because the Fare box plot shows many outliers.
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

# **Feature Engineering**
* In this section we select the best features for our model by using the information from the visuals.

In [324]:
# Numeric feature matrix and target for training.
# `.copy()` makes xtrain an independent frame: later cells assign columns to it,
# and doing that on a bare slice of `train` is flagged as SettingWithCopy.
# NOTE(review): PassengerId looks like a row identifier rather than a predictive
# feature; it is kept here because the submission cell reads xtest.PassengerId
# and the train/test feature columns must match.
xtrain = train[['Age','Pclass','Fare','SibSp','Parch','PassengerId']].copy()
ytrain = train.Survived
xtrain.head(5)

In [325]:
# Same feature columns, in the same order, for the test data.
# `.copy()` makes xtest an independent frame so that later column assignments
# do not operate on a slice of `test`.
xtest = test[['Age','Pclass','Fare','SibSp','Parch','PassengerId']].copy()
xtest.head(5)

#### Applying log transformation on fare on both train and test dataset.
* Adding 1 just in case if a value is 0 then log(0) will upset this cell :(

In [326]:
# log(1 + Fare) for both splits; log1p handles Fare == 0.
# The transform is always computed from the untransformed `train`/`test` Fare
# column and the result is bound to a new frame via `assign`, so re-running
# this cell cannot apply the logarithm twice and no slice is written in place.
xtrain = xtrain.assign(Fare=np.log1p(train['Fare']))
xtest = xtest.assign(Fare=np.log1p(test['Fare']))

* Label-encoding the Embarked and Sex features of train and test (each category is mapped to an integer code).

In [327]:
# Fit one label encoder per categorical column on the training data, then add
# the integer-coded columns to the training feature matrix.
emb = LabelEncoder().fit(train['Embarked'])
sex = LabelEncoder().fit(train['Sex'])
xtrain['Embarked'] = emb.transform(train['Embarked'])
xtrain['Sex'] = sex.transform(train['Sex'])

In [328]:
# Encode the test categoricals with the encoders fitted on the training data
# (`emb`, `sex`). Fitting fresh encoders on the test split would derive the
# integer codes from the test categories, which only matches the training
# codes if both splits contain exactly the same set of categories.
# `transform` raises ValueError on a category unseen during fitting, so a
# mismatch surfaces here instead of silently shifting the codes.
embt, sext = emb, sex  # old names kept as aliases of the train-fitted encoders
xtest['Embarked'] = embt.transform(test['Embarked'])
xtest['Sex'] = sext.transform(test['Sex'])

# **Logistic Regression Model**

In [330]:
# Logistic-regression classifier constructed with no arguments, i.e. with
# scikit-learn's default settings.
lr = LogisticRegression()

In [331]:
# Fit the logistic regression on the full training feature matrix.
lr.fit(X=xtrain, y=ytrain)

In [332]:
# Predicted Survived labels for the test passengers.
ypred = lr.predict(X=xtest)

In [334]:
# Estimate accuracy on a stratified hold-out split of the labelled data.
# The test set has no Survived column, and scoring `lr` on (xtest, ypred)
# compares the model's predictions with themselves, which is always 1.0.
# A fresh model is fitted on the reduced split so `lr` (trained on all of
# xtrain and used for the submission) is left untouched.
xfit, xval, yfit, yval = train_test_split(
    xtrain, ytrain, test_size=0.2, random_state=42, stratify=ytrain
)
LogisticRegression().fit(xfit, yfit).score(xval, yval)

# **Decision Tree Classifier Model**

In [335]:
# Decision tree on the same features. random_state is fixed because the
# splitter permutes features randomly, so an unseeded tree (and therefore its
# predictions) can differ between runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(xtrain,ytrain)

In [336]:
# Decision-tree predictions of Survived for the test passengers.
dt_ypred = dt.predict(X=xtest)

In [337]:
# Estimate accuracy on a stratified hold-out split of the labelled data.
# Scoring `dt` on (xtest, dt_ypred) compares the model's predictions with
# themselves and is always 1.0, so it says nothing about model quality.
# A fresh tree is fitted on the reduced split so `dt` is left untouched.
xfit, xval, yfit, yval = train_test_split(
    xtrain, ytrain, test_size=0.2, random_state=42, stratify=ytrain
)
DecisionTreeClassifier(random_state=42).fit(xfit, yfit).score(xval, yval)

## **Submission**

In [339]:
# The two submission columns: the passenger ids from the test features and the
# logistic-regression predictions wrapped in a Series named 'Survived'.
PassengerId = xtest['PassengerId']
Survived = pd.Series(data=ypred, name='Survived')

In [340]:
# Place the id and prediction Series side by side as columns of one frame.
submission = pd.concat(objs=[PassengerId, Survived], axis='columns')

In [342]:
# Write only the PassengerId and Survived columns. With the default index=True,
# to_csv also writes the row index as an extra unnamed first column.
# NOTE(review): presumably the submission format expects exactly these two
# columns; confirm against the competition's sample submission.
submission.to_csv('submission.csv', index=False)