# Supervised ML - Predict Titanic Survival

![image.png](attachment:image.png)

## Logistic Regression - classification with binary response variable

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

#### Load Training Dataset - Titanic_TrainingDataset.csv

In [None]:
# Read the training CSV into a DataFrame, then display its summary statistics.
train = pd.read_csv(filepath_or_buffer="Titanic_TrainingDataset.csv")
train.describe()

In [None]:
# Preview the first five rows of the raw training data.
train.head(n=5)

### Select the columns we need for building model + clean up the data

In [None]:
# Keep only the target (Survived) and the four predictor columns.
# .copy() makes df an independent frame, so the column assignments in the
# following cells modify df itself and do not raise pandas'
# SettingWithCopyWarning (which warnings.filterwarnings("ignore") would hide).
df = train[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']].copy()

In [None]:
# Encode Sex as an integer flag: "male" -> 1, anything else (female) -> 0.
# The flag is computed from the untouched train["Sex"] column (df shares
# train's index), so re-running this cell always gives the same result.
# Computing it from df["Sex"] would turn every value into 0 on a second run,
# because an already-encoded 1 is not equal to "male".
df["Sex"] = (train["Sex"] == "male").astype("int64")

In [None]:
# Fill missing Age and Fare values with the median of the respective column
# (the median is more immune to outliers than the mean).
for column in ("Age", "Fare"):
    df[column] = df[column].fillna(df[column].median())

# Count the missing values left in each column.
df.isnull().sum()

In [None]:
# Preview the first seven rows of the cleaned frame.
df.head(n=7)

In [None]:
# Separate the response variable (Y) from the predictors (X).
Y = df["Survived"]
X = df.drop(columns="Survived")

## What happens when we don't have a separate hold-out test dataset?

 - We take our dataset and split it into training data (70%) and testing data (30%)
 - We will fit the model on 70% of the data and test its performance on the 30% data set

## When you don't have a separate hold-out test dataset - use sklearn's train_test_split()

In [None]:
# Hold out 30% of the rows for testing; the remaining 70% is used for training.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=25,
)

### Train the model using sklearn.linear_model.LogisticRegression algorithm

In [None]:
# Define a logistic regression model with default settings and fit it on the
# training split. fit() returns the estimator itself, so titanicmodel and
# logreg refer to the same fitted object.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
titanicmodel = logreg

In [None]:
# Predict the response variable for every row of the held-out test predictors.
Y_pred = titanicmodel.predict(X=X_test)

## Let's check how accurate (reliable) our model is

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
 #compare with the actual y values, y_test (hold outs) with predicted y
# Accuracy on the hold-out set, expressed as a percentage rounded to two decimals.
accuracypct = round(
    100 * accuracy_score(y_true=y_test, y_pred=Y_pred),
    2,
)
print(accuracypct)

## Let's predict the outcome using individual values

In [None]:
# Two hypothetical passengers (one per list position), described with the same
# predictor columns, in the same order, as the training features.
data = dict(
    Pclass=[3, 2],
    Sex=[1, 0],  # 1 = male, 0 = female (same encoding as the training data)
    Age=[40, 35],
    Fare=[7.98, 8.99],
)
passengers = pd.DataFrame(data=data)
passengers

In [None]:
# Predicted Survived label for each of the hypothetical passengers.
predictions = titanicmodel.predict(X=passengers)
print(predictions)

# Evaluate Model

- Classification accuracy is the **easiest classification metric to understand**
- But, it does not tell you the **underlying distribution** of response values
- And, it does not tell you what **"types" of errors** your classifier is making

![image.png](attachment:image.png)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, Y_pred)
confusion_matrix

In [None]:
confusion_matrix.sum()

In [None]:
from sklearn.metrics import classification_report
# Build the text classification report for the hold-out predictions and print it.
report = classification_report(y_true=y_test, y_pred=Y_pred)
print(report)