## Problem Statement
### Probability of survival of a person on the Titanic based on gender, age and passenger class (the models below also use the `Parch`, `fair` and `Embarked` columns).


# Imports

In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Data Analysis

In [37]:
## Load the passenger records from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="titanic.csv")

In [38]:
## Show the first 5 rows (all columns) of the freshly loaded dataset.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,fair,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
## Remove the columns that are not used as model inputs further down.
data = data.drop(columns=["Cabin", "Name", "Ticket", "PassengerId", "SibSp"])

In [40]:
## Confirm which columns remain after the drop.
data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,fair,Embarked
0,0,3,male,22.0,0,7.25,S
1,1,1,female,38.0,0,71.2833,C
2,1,3,female,26.0,0,7.925,S
3,1,1,female,35.0,0,53.1,S
4,0,3,male,35.0,0,8.05,S


In [41]:
## Count the missing values in each column.
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Parch         0
fair          0
Embarked      2
dtype: int64

In [42]:
## Replace missing ages with the median of the ages that are present.
age_median = data['Age'].median()
data['Age'] = data['Age'].fillna(age_median)

In [43]:
## Replace missing 'Embarked' values with the most frequent value of the column.
embarked_mode = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(embarked_mode)

In [44]:
## Verify that no missing values are left after the imputation.
data.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Parch       0
fair        0
Embarked    0
dtype: int64

In [45]:
## Encode 'Sex' as an integer feature: 'male' -> 0, 'female' -> 1.
data.loc[data["Sex"] == "male", "Sex"] = 0
data.loc[data["Sex"] == "female", "Sex"] = 1
## The column originally held strings, so depending on the pandas version it can
## keep the generic object dtype after the assignments above. Cast explicitly so
## describe(), corr() and the models always receive a numeric column.
data["Sex"] = data["Sex"].astype("int64")

In [46]:
## Convert the remaining text column(s) to integer codes with a LabelEncoder.
cat_num = ['Embarked']
le = LabelEncoder()
for column in cat_num:
    encoded = le.fit_transform(data[column])
    data[column] = encoded

In [47]:
## Inspect the first 5 rows after encoding 'Sex' and 'Embarked'.
data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,fair,Embarked
0,0,3,0,22.0,0,7.25,2
1,1,1,1,38.0,0,71.2833,0
2,1,3,1,26.0,0,7.925,2
3,1,1,1,35.0,0,53.1,2
4,0,3,0,35.0,0,8.05,2


In [48]:
## Number of rows for each value of the target column 'Survived'.
data.groupby('Survived').size()

Survived
0    549
1    342
dtype: int64

In [49]:
## Summary statistics per column: count, mean, std, min, quartiles and max.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,fair,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,29.361582,0.381594,32.204208,1.536476
std,0.486592,0.836071,0.47799,13.019697,0.806057,49.693429,0.791503
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,7.9104,1.0
50%,0.0,3.0,0.0,28.0,0.0,14.4542,2.0
75%,1.0,3.0,1.0,35.0,0.0,31.0,2.0
max,1.0,3.0,1.0,80.0,6.0,512.3292,2.0


In [50]:
## Pairwise Pearson correlation between all columns.
data.corr(method="pearson")

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,fair,Embarked
Survived,1.0,-0.338481,0.543351,-0.06491,0.081629,0.257307,-0.167675
Pclass,-0.338481,1.0,-0.1319,-0.339898,0.018443,-0.5495,0.162098
Sex,0.543351,-0.1319,1.0,-0.081163,0.245489,0.182333,-0.108262
Age,-0.06491,-0.339898,-0.081163,1.0,-0.172482,0.096688,-0.018754
Parch,0.081629,0.018443,0.245489,-0.172482,1.0,0.216225,0.039798
fair,0.257307,-0.5495,0.182333,0.096688,0.216225,1.0,-0.224719
Embarked,-0.167675,0.162098,-0.108262,-0.018754,0.039798,-0.224719,1.0


# Data Analysis (repeated from a fresh load of the CSV)

In [51]:
## Load the CSV file again; this rebinds `data`, so the preprocessing steps are re-applied in the cells below.
data = pd.read_csv(filepath_or_buffer="titanic.csv")

In [52]:
## Show the first 5 rows (all columns) of the reloaded dataset.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,fair,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
## Remove the columns that are not used as model inputs further down.
data = data.drop(columns=["Cabin", "Name", "Ticket", "PassengerId", "SibSp"])

In [54]:
## Confirm which columns remain after the drop.
data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,fair,Embarked
0,0,3,male,22.0,0,7.25,S
1,1,1,female,38.0,0,71.2833,C
2,1,3,female,26.0,0,7.925,S
3,1,1,female,35.0,0,53.1,S
4,0,3,male,35.0,0,8.05,S


In [55]:
## Count the missing values in each column.
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Parch         0
fair          0
Embarked      2
dtype: int64

In [56]:
## Replace missing ages with the median of the ages that are present.
age_median = data['Age'].median()
data['Age'] = data['Age'].fillna(age_median)

In [57]:
## Replace missing 'Embarked' values with the most frequent value of the column.
embarked_mode = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(embarked_mode)

In [58]:
## Verify that no missing values are left after the imputation.
data.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Parch       0
fair        0
Embarked    0
dtype: int64

In [59]:
## Encode 'Sex' as an integer feature: 'male' -> 0, 'female' -> 1.
data.loc[data["Sex"] == "male", "Sex"] = 0
data.loc[data["Sex"] == "female", "Sex"] = 1
## The column originally held strings, so depending on the pandas version it can
## keep the generic object dtype after the assignments above. Cast explicitly so
## describe(), corr() and the models always receive a numeric column.
data["Sex"] = data["Sex"].astype("int64")

In [60]:
## Convert the remaining text column(s) to integer codes with a LabelEncoder.
cat_num = ['Embarked']
le = LabelEncoder()
for column in cat_num:
    encoded = le.fit_transform(data[column])
    data[column] = encoded

In [61]:
## Inspect the first 5 rows after encoding 'Sex' and 'Embarked'.
data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,fair,Embarked
0,0,3,0,22.0,0,7.25,2
1,1,1,1,38.0,0,71.2833,0
2,1,3,1,26.0,0,7.925,2
3,1,1,1,35.0,0,53.1,2
4,0,3,0,35.0,0,8.05,2


In [62]:
## Number of rows for each value of the target column 'Survived'.
data.groupby('Survived').size()

Survived
0    549
1    342
dtype: int64

In [63]:
## Summary statistics per column: count, mean, std, min, quartiles and max.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,fair,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,29.361582,0.381594,32.204208,1.536476
std,0.486592,0.836071,0.47799,13.019697,0.806057,49.693429,0.791503
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,7.9104,1.0
50%,0.0,3.0,0.0,28.0,0.0,14.4542,2.0
75%,1.0,3.0,1.0,35.0,0.0,31.0,2.0
max,1.0,3.0,1.0,80.0,6.0,512.3292,2.0


In [64]:
## Pairwise Pearson correlation between all columns.
data.corr(method="pearson")

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,fair,Embarked
Survived,1.0,-0.338481,0.543351,-0.06491,0.081629,0.257307,-0.167675
Pclass,-0.338481,1.0,-0.1319,-0.339898,0.018443,-0.5495,0.162098
Sex,0.543351,-0.1319,1.0,-0.081163,0.245489,0.182333,-0.108262
Age,-0.06491,-0.339898,-0.081163,1.0,-0.172482,0.096688,-0.018754
Parch,0.081629,0.018443,0.245489,-0.172482,1.0,0.216225,0.039798
fair,0.257307,-0.5495,0.182333,0.096688,0.216225,1.0,-0.224719
Embarked,-0.167675,0.162098,-0.108262,-0.018754,0.039798,-0.224719,1.0


In [65]:
## Separate the target from the predictors: 'Survived' is the label,
## every other remaining column is used as a feature.
y = data['Survived']
X = data.drop(columns='Survived')

# Data Split

In [66]:
## Split features and target into training and test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,  ## hold out 20% of the rows for testing
    random_state=156,  ## fixed seed so the split is the same on every run
)

# Model Fitting

In [67]:
## Fit two classifiers on the same training split so their test accuracy can be compared.

## Logistic regression. The features are not scaled anywhere in this notebook,
## which can keep the solver from converging within scikit-learn's default
## iteration budget, so a larger max_iter is given explicitly.
model = LogisticRegression(max_iter=1000)

## Decision tree. scikit-learn permutes the features randomly while searching
## for splits, so random_state is pinned (same seed as the train/test split)
## to make the fitted tree and its score reproducible between runs.
model2 = DecisionTreeClassifier(random_state=156)

result = model.fit(X_train, y_train)  ## fit() returns the fitted estimator itself
result2 = model2.fit(X_train, y_train)

In [68]:
## Predicted 'Survived' labels from the logistic regression model for the test rows.
model.predict(X=X_test)

array([1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0], dtype=int64)

In [69]:
## Predicted 'Survived' labels from the decision tree model for the test rows.
model2.predict(X=X_test)

array([0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0], dtype=int64)

# Model Scores

In [70]:
## Score each fitted classifier on the held-out test split
## (score() is the mean accuracy for scikit-learn classifiers).
score, score2 = (clf.score(X_test, y_test) for clf in (model, model2))
print("Score using Logistic Regression - ", score)
print("Score using Decision Tree Classifier - ", score2)

Score using Logistic Regression -  0.7877094972067039
Score using Decision Tree Classifier -  0.8100558659217877
