# Titanic 

Goal: predict survival on the Titanic  

Here we are looking into how to apply Logistic Regression to the Titanic dataset.

# 1. Collect and understand the data

In [22]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [23]:
# get titanic training file as a DataFrame
# titanic = pd.read_csv("Data/train.csv")

In [24]:
# df=pd.read_excel(r"path/train.xlsx")

In [25]:
# Load the Titanic training set into a DataFrame.
# The path is relative to the notebook's working directory (the folder that
# contains train.csv), so the notebook runs on any machine rather than only
# where the author's absolute user-profile path exists.
df = pd.read_csv("train.csv")

In [26]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [27]:
# Preprocessing plan
#
# 1. Check for missing (NaN) values and fill them in.
# 2. Normalize the numerical columns to the 0-1 range with MinMaxScaler.
# 3. Encode the categorical columns as numbers (map / LabelEncoder).

In [28]:
# Count the missing values in every column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [29]:
# Summary statistics for the ticket fare.
df.loc[:, "Fare"].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [30]:
# Summary statistics for passenger age (count excludes missing values).
df.loc[:, 'Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

**Handling NaN values**

In [31]:
# Fill missing embarkation ports with the most frequent port (the mode).
df['Embarked'] = df['Embarked'].fillna(value=df['Embarked'].mode().iloc[0])

In [32]:
# Fill missing ages with the mean of the observed ages.
df['Age'] = df['Age'].fillna(value=df['Age'].mean())

In [33]:
# Look at the first five rows again now that Embarked and Age are filled in.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [68]:
# Re-count missing values per column to verify the imputation.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

**normalizing the data**

In [34]:
# Min-max scaling: rescale Age and Fare to the [0, 1] range so both numeric
# features are on a comparable scale.
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

scaled_cols = ['Age', 'Fare']
scaler = MinMaxScaler()

# Fit ONE scaler on both columns at once. MinMaxScaler scales each column
# independently, so the resulting values are the same as fitting the columns
# one after the other, but the fitted `scaler` now keeps the min/max of Age
# *and* Fare. Refitting the same scaler on Fare after Age would overwrite the
# Age statistics and make the scaler unusable for transforming new passengers.
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,0.271174,1,0,A/5 21171,0.014151,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.472229,1,0,PC 17599,0.139136,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,0.321438,0,0,STON/O2. 3101282,0.015469,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.434531,1,0,113803,0.103644,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,0.434531,0,0,373450,0.015713,,S


**encoding the data = converting categorical data into numerical data**

In [75]:
# Encode Sex as an integer: male -> 0, female -> 1.
# Gotcha: run this cell once per fresh load; mapping already-encoded values
# finds no matching keys and turns the column into NaN.
df['Sex'] = df['Sex'].map(dict(male=0, female=1))

In [82]:
# Encode the embarkation port as an integer: S -> 0, C -> 1, Q -> 2.
# Gotcha: run this cell once per fresh load; mapping already-encoded values
# finds no matching keys and turns the column into NaN.
df['Embarked'] = df['Embarked'].map(dict(S=0, C=1, Q=2))

In [83]:
# (rows, columns) of the frame after imputation, scaling and encoding.
df.shape

(891, 12)

In [84]:
# Check each column's dtype and non-null count after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    int64  
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 83.7+ KB


In [85]:
# First five rows of the frame with Sex and Embarked now numeric.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,0.271174,1,0,A/5 21171,0.014151,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0.472229,1,0,PC 17599,0.139136,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,0.321438,0,0,STON/O2. 3101282,0.015469,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,0.434531,1,0,113803,0.103644,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,0.434531,0,0,373450,0.015713,,0


# Choosing the columns that matter for the result


**Pclass
Sex
Age
SibSp
Parch
Fare
Embarked**

In [94]:
# Columns used as model inputs.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# X: feature matrix as a NumPy array; y: the Survived target as a NumPy array.
X = df.loc[:, features].to_numpy()
y = df.loc[:, 'Survived'].to_numpy()

# Sanity check: show the first ten rows of each array.
print('Features:', X[:10], '\nLabels:', y[:10], sep='\n')

Features:
[[3.         0.         0.27117366 1.         0.         0.01415106
  0.        ]
 [1.         1.         0.4722292  1.         0.         0.13913574
  1.        ]
 [3.         1.         0.32143755 0.         0.         0.01546857
  0.        ]
 [1.         1.         0.43453129 1.         0.         0.1036443
  0.        ]
 [3.         0.         0.43453129 0.         0.         0.01571255
  0.        ]
 [3.         0.         0.36792055 0.         0.         0.0165095
  2.        ]
 [1.         0.         0.67328474 0.         0.         0.10122886
  0.        ]
 [3.         0.         0.01985423 3.         1.         0.04113566
  0.        ]
 [3.         1.         0.33400352 0.         2.         0.02173075
  0.        ]
 [2.         1.         0.17064589 1.         0.         0.05869429
  1.        ]]

Labels:
[0 1 1 1 0 0 0 0 1 1]


In [95]:
from sklearn.model_selection import train_test_split

In [96]:
# Hold out 30% of the rows for testing. random_state pins the shuffle so the
# split, and the model trained on it, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [97]:
from sklearn.linear_model import LogisticRegression

In [98]:
# Logistic-regression classifier; max_iter=500 sets the solver's iteration limit.
model_log = LogisticRegression(max_iter=500)

In [99]:
# Fit the classifier on the training split only; the test split stays unseen.
model_log.fit(X_train, y_train) # training

In [101]:
# Print the estimator's text representation.
print(model_log)

LogisticRegression(max_iter=500)


In [102]:
# Show the current working directory (relative paths resolve against it).
# os.getcwd() is plain Python, so it does not depend on IPython's automagic
# `pwd`, which is unavailable outside IPython or when a variable is named `pwd`.
import os

os.getcwd()

'C:\\Users\\Lenovo\\Muskan\\Ml\\Machine-Learning-Projects\\Logistic_Regression'

In [103]:
import joblib

In [104]:
# Persist the fitted classifier to 'Titanic_model.pkl' (relative path).
# NOTE: only the estimator is saved; the imputation, scaling and encoding
# applied to df are not part of this file and must be repeated before predicting.
joblib.dump(value=model_log, filename='Titanic_model.pkl')

['Titanic_model.pkl']