# ***Titanic Prediction***
---------

**Created by:** *Josue Arellano Barba*

**Date:** *September 17th, 2023*

In [1]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Loading the data
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df, df["Survived"].values, test_size=0.25, random_state=42)

In [4]:
# Dropping unnecessary columns
cols_to_drop = ["PassengerId", "Name", "Ticket"]
X_train = X_train.drop(cols_to_drop, axis=1)
X_test = X_test.drop(cols_to_drop, axis=1)

In [5]:
# Encoding categorical variables
encoder = LabelEncoder()
X_train["Sex"] = encoder.fit_transform(X_train["Sex"])
X_test["Sex"] = encoder.transform(X_test["Sex"])
X_train["Embarked"] = encoder.fit_transform(X_train["Embarked"])
X_test["Embarked"] = encoder.transform(X_test["Embarked"])

In [6]:
# Filling in missing values
X_train["Age"] = X_train["Age"].fillna(X_train["Age"].mean())
X_test["Age"] = X_test["Age"].fillna(X_test["Age"].mean())

In [7]:
# Extracting information from the 'Cabin' column
X_train["Cabin_Letter"] = X_train["Cabin"].str.extract('([A-Za-z])')
X_train["Cabin_Number"] = X_train["Cabin"].str.extract('(\d+)')
X_test["Cabin_Letter"] = X_test["Cabin"].str.extract('([A-Za-z])')
X_test["Cabin_Number"] = X_test["Cabin"].str.extract('(\d+)')

In [8]:
# Filling missing values in the new columns
X_train["Cabin_Letter"].fillna('N', inplace=True)  # 'N' for missing values
X_train["Cabin_Number"].fillna(0, inplace=True)    # 0 for missing values
X_test["Cabin_Letter"].fillna('N', inplace=True)
X_test["Cabin_Number"].fillna(0, inplace=True)

In [9]:
# Encoding the Cabin Letter column
X_train["Cabin_Letter"] = encoder.fit_transform(X_train["Cabin_Letter"])
X_test["Cabin_Letter"] = encoder.transform(X_test["Cabin_Letter"])

In [10]:
# Scaling the features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train.drop(["Cabin"], axis=1))  # Dropping the original 'Cabin' column
X_test = scaler.transform(X_test.drop(["Cabin"], axis=1))

In [11]:
# Training the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [12]:
# Making predictions
y_pred = model.predict(X_test)

In [13]:
score_test = model.score(X_test, y_test)
print("Accuracy Score on Test Set: {:.2f}%".format(score_test * 100))

Accuracy Score on Test Set: 100.00%


In [14]:
# Calculating the classification report
classification_report = classification_report(y_test, y_pred, target_names=["Not Survived", "Survived"])
print("Classification Report:\n", classification_report)

Classification Report:
               precision    recall  f1-score   support

Not Survived       1.00      1.00      1.00       134
    Survived       1.00      1.00      1.00        89

    accuracy                           1.00       223
   macro avg       1.00      1.00      1.00       223
weighted avg       1.00      1.00      1.00       223



## Classification Report

---------

<br>
<center>

|    | **Precision** | **Recall** | **F1-Score** | **Support** |
|:---|:----------|:-------|:---------|:--------|
| Not Survived | 1.00 | 1.00 | 1.00 | 134 |
| Survived | 1.00 | 1.00 | 1.00 | 89 |
| Accuracy | | | 1.00 | 223 |
| Macro avg | 1.00 | 1.00 | 1.00 | 223 |
| Weighted avg | 1.00 | 1.00 | 1.00 | 223 |

</center>