# Logistic Regression Example - Titanic Dataset

Predict survival based on title, passenger class, gender, embarkation and fare band

Steps
* Load data into pandas
* Clean data (select columns), remove any rows with missing values
* Encode data (convert string columns into numbers, as required by the model): one-hot encoding for title, gender and embarked; ordinal encoding for passenger class and fare band
* Encode label column (Died ->0, Survived ->1)
* Split data into training and test sections
* Build logistic regression model, fit on training data and predict on test data
* Evaluate model with a confusion matrix

In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report


In [None]:
# Location of the Titanic passenger CSV file
titanic_url = 'https://raw.githubusercontent.com/MarkWilcock/CourseDatasets/main/Misc%20Datasets/Titanic%20Passenger.csv'
# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=titanic_url)
# Preview the first 5 rows
df.head(n=5)

In [None]:
# Keep only the label and the feature columns used by the model, and give
# them short snake_case names. The mapping pairs each source column with its
# new name explicitly, so the rename does not depend on column order.
column_names = {
    'Survival': 'survival',
    'Title': 'title',
    'Passenger Class': 'pass_class',
    'Gender': 'gender',
    'Embarked': 'embarked',
    'FareBand': 'fareband',
}
# dropna() removes any row with a missing value in the selected columns,
# which is the cleaning step this notebook describes but did not perform.
df_slim = (
    df.loc[:, list(column_names)]
    .rename(columns=column_names)
    .dropna()
)
df_slim.head()

In [None]:
# Some quick data quality checks.
# Each result is printed explicitly: a notebook cell only displays the value
# of its last expression, so the earlier results would otherwise be discarded.
print(df_slim.describe())
print(df_slim.loc[:, 'embarked'].unique())
print(df_slim.loc[:, 'fareband'].unique())

Encode the categorical columns with a one-hot encoder

In [None]:
# Columns to one-hot encode. 'pass_class' and 'fareband' are not listed here
# because they are handled by the ordinal encoder instead.
category_columns = [
    'title',
    'gender',
    'embarked',
]
# sparse_output=False requests a dense array rather than a sparse matrix
categorical_encoders = OneHotEncoder(sparse_output=False)

Encode the ordinal columns with an ordinal encoder

In [None]:
# Explicit category orderings: a value's position in its list becomes its
# integer code, so the codes follow the natural order of the categories.
fareband_values = ['0 - 10', '10 - 20', '20 - 30', '30 - above']
passenger_class_values = ['1st', '2nd', '3rd']
# The order of the lists in `categories` must match the order of the columns
# handed to this encoder (passenger class first, then fare band).
ordinal_encoders = OrdinalEncoder(
    categories=[passenger_class_values, fareband_values],
)

In [None]:
# Route each group of columns to its encoder; any column not listed
# (including the 'survival' label) is dropped from the feature matrix.
# set_output(transform='pandas') makes the transformer return a DataFrame.
ct = ColumnTransformer(
    transformers=[
        ('cat', categorical_encoders, category_columns),
        ('ord', ordinal_encoders, ['pass_class', 'fareband']),
    ],
    remainder='drop',
).set_output(transform='pandas')
# Fit the encoders and build the encoded feature matrix in one step
X = ct.fit_transform(df_slim)

In [None]:
# Show the names of the encoded feature columns produced by the transformer
X.columns
# Optional extra check (left disabled): list the distinct ordinal codes
# assigned to passenger class.
# X['ord__pass_class'].unique()

In [None]:
# Convert the survival strings into integer class labels for the model
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_slim['survival'])
# Peek at the first five encoded labels
y[:5]

In [None]:
# Quick check: decode the integer labels 0 and 1 back into the original
# survival strings to confirm which class each code stands for.
list(label_encoder.inverse_transform([0,1]))

In [None]:
# Split into train and test datasets: 20% of the rows are held out for
# testing, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a logistic regression on the training split (fit() returns the fitted
# estimator itself), then predict labels for the held-out test split.
model = LogisticRegression().fit(X_train, y_train)
predictions = model.predict(X_test)


In [None]:
# Evaluate the test-set predictions using standard metrics:
# the per-class report first, then the F1 score to three decimals.
print(
    'Classification Report\n',
    classification_report(y_test, predictions),
)
print(f'f1 score\n {f1_score(y_test, predictions):3.3f}')


Understand how well the model is performing with a [confusion matrix](https://en.wikipedia.org/wiki/Confusion_matrix)

In [None]:
# Confusion matrix for the test split; by scikit-learn convention rows are
# the true labels and columns are the predicted labels.
confusion_matrix(y_test, predictions)