### Module Importation and DataFrame Setup

In [1]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
# Read the cleaned CSV file from the Dataset folder into a Pandas DataFrame
dfSurvivals = pd.read_csv(
    Path("../CSV_OUTPUT/Base_Cleaned_DS_CSV.csv")
)

In [3]:
# Review the DataFrame
dfSurvivals.head()

Unnamed: 0,Passenger Id,Title,First Name,Last Name,Sex,Age,Sibling/Spouse Aboard,Parent/Children Aboard,Passenger Class,Fare,Embarkation Port,Survival Boat,Body Number,Survived,age_group,family_size
0,1,Miss,Elisabeth Walton,Allen,female,29.0,0,0,1,211.3375,S,2,S,1,Adults,0
1,2,Master,Hudson Trevor,Allison,male,0.9167,1,2,1,151.55,S,11,S,1,Children,3
2,3,Miss,Helen Loraine,Allison,female,2.0,1,2,1,151.55,S,DNS,BNR,0,Children,3
3,4,Mr,Hudson Joshua Creighton,Allison,male,30.0,1,2,1,151.55,S,DNS,135,0,Adults,3
4,5,Mrs,Hudson J C (Bessie Waldo Daniels),Allison,female,25.0,1,2,1,151.55,S,DNS,BNR,0,Adults,3


In [4]:
# Print DataFrame information to ensure no null values and check datatypes
dfSurvivals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1308 entries, 0 to 1307
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Passenger Id            1308 non-null   int64  
 1   Title                   1308 non-null   object 
 2   First Name              1308 non-null   object 
 3   Last Name               1308 non-null   object 
 4   Sex                     1308 non-null   object 
 5   Age                     1308 non-null   float64
 6   Sibling/Spouse Aboard   1308 non-null   int64  
 7   Parent/Children Aboard  1308 non-null   int64  
 8   Passenger Class         1308 non-null   int64  
 9   Fare                    1308 non-null   float64
 10  Embarkation Port        1308 non-null   object 
 11  Survival Boat           1308 non-null   object 
 12  Body Number             1308 non-null   object 
 13  Survived                1308 non-null   int64  
 14  age_group               1308 non-null   

### Preprocess Data

In [5]:
# Change Sex Column
result = OneHotEncoder().fit_transform(dfSurvivals["Sex"].values.reshape(-1, 1)).toarray()
dfSurvivals[["Female", "Male"]] = pd.DataFrame(result, index = dfSurvivals.index)
dfSurvivals.drop(["Sex"], axis=1, inplace=True)

In [6]:
# Apply categorical encoding to the Embarkation Port, Surival Boat, and age_group columns
labelencoder_X = LabelEncoder()

dfSurvivals["Embarkation Port"] = labelencoder_X.fit_transform(dfSurvivals["Embarkation Port"])
dfSurvivals["Survival Boat"] = labelencoder_X.fit_transform(dfSurvivals["Survival Boat"])
dfSurvivals["age_group"] = labelencoder_X.fit_transform(dfSurvivals["age_group"])

In [7]:
# Drop string columns that will not be used in model
dfSurvivals.drop(["Title","First Name","Last Name", "Body Number"], axis=1, inplace=True)

In [8]:
# Ensure only numerical values remain
dfSurvivals.dtypes

Passenger Id                int64
Age                       float64
Sibling/Spouse Aboard       int64
Parent/Children Aboard      int64
Passenger Class             int64
Fare                      float64
Embarkation Port            int32
Survival Boat               int32
Survived                    int64
age_group                   int32
family_size                 int64
Female                    float64
Male                      float64
dtype: object

### Split the data into X and y and then into testing and training sets

In [9]:
# Split the data into X (features) and y (target)

# Set the y variable, which is the target
y = dfSurvivals['Survived']

# Set the X variable, which includes all features escept the target
X = dfSurvivals.drop(columns=['Survived'])

In [10]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Fit a logistic regression classifier

In [11]:
# Declare a logistic regression model
logistic_regression_model = LogisticRegression(random_state=1)

In [12]:
# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Create the predicted values for the testing and the training data

In [13]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

In [14]:
#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

### Print a confusion matrix for the testing data

In [15]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

In [16]:
# Print the confusion matrix for the testing data
print(test_matrix)

[[188   7]
 [ 21 111]]


### Print the testing classification report

In [17]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

In [18]:
# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.90      0.96      0.93       195
           1       0.94      0.84      0.89       132

    accuracy                           0.91       327
   macro avg       0.92      0.90      0.91       327
weighted avg       0.92      0.91      0.91       327



**Question**: How does the performance of the training and test dataset compare?

**Answer**: Looking at the two classification reports for the training and test data, it looks as if model performance actually improved on the testing data. All metrics (precision, recall, f1-score, and accuracy) were 1-6% better on the testing data.