# Titanic Survival Prediction

This notebook builds a machine learning model to predict which passengers survived the Titanic shipwreck.

The approach includes:
- Feature engineering with age categories and family information
- Data preprocessing and missing value handling
- Logistic Regression modeling
- Performance evaluation using a confusion matrix and an ROC curve

In [None]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report, RocCurveDisplay

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition CSV files in one pass: the labelled training set,
# the unlabelled test set, and the gender-based sample submission.
train_data, test_data, submitted_data = map(
    pd.read_csv,
    (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
        "/kaggle/input/titanic/gender_submission.csv",
    ),
)

In [None]:
# Preview the first rows of the training data as loaded, before any columns
# are dropped or engineered.
train_data.head()

In [None]:
train_data.info()  # Column dtypes and non-null counts; a count below the row total marks a column with missing values

# Data Preprocessing

In [None]:
# Discard columns that are not used as model inputs. PassengerId is removed
# from the training frame only; the test frame keeps it because the submission
# file is built from test_data.PassengerId.
train_data.drop(columns=["PassengerId", "Name", "Cabin"], inplace=True)
test_data.drop(columns=["Name", "Cabin"], inplace=True)

In [None]:
# Percentage of missing entries per column: the mean of the boolean NaN mask
# is the fraction of missing rows, scaled here to percent.
train_data.isna().mean() * 100

# Feature Engineering

In [None]:
# Family_Size = siblings/spouses aboard + parents/children aboard.
# The passenger themself is not counted, so a solo traveller has size 0.
for frame in (train_data, test_data):
    frame["Family_Size"] = frame["SibSp"] + frame["Parch"]

In [None]:
# Create 'Is_Alone' feature
def isAlone(value):
    """
    Flag whether a passenger travels without family.

    Parameters:
    value: The passenger's family size (number of relatives aboard)

    Returns:
    int: 1 when the family size equals 0 (travelling alone), otherwise 0
    """
    return 1 if value == 0 else 0

In [None]:
# Derive the Is_Alone flag from Family_Size on both frames.
for frame in (train_data, test_data):
    frame["Is_Alone"] = frame["Family_Size"].apply(isAlone)

In [None]:
# Create age groups feature
def categorize_age(age):
    """
    Map a passenger's age onto a named age band.

    Parameters:
    age (float): The age of the passenger; may be missing (NaN/None)

    Returns:
    str: "Unknown" for a missing age, otherwise the first band whose inclusive
         upper bound covers the age: "Child" (<= 12), "Teenager" (<= 18),
         "Young Adult" (<= 35), "Adult" (<= 60), and "Senior" above that
    """
    if pd.isna(age):
        return "Unknown"

    # Inclusive upper bounds, checked in ascending order so the first match wins.
    upper_bounds = (
        (12, "Child"),
        (18, "Teenager"),
        (35, "Young Adult"),
        (60, "Adult"),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return "Senior"

In [None]:
# Bucket ages into categories on both frames. This runs before Age is imputed,
# so passengers with a missing age land in the "Unknown" category.
for frame in (train_data, test_data):
    frame["Age_Category"] = frame["Age"].apply(categorize_age)

In [None]:
# Add a log-scaled fare. log1p computes log(1 + x), so a fare of 0 maps to 0
# instead of -inf.
for frame in (train_data, test_data):
    frame["Log_Fare"] = np.log1p(frame["Fare"])

# Handle Missing Values and Further Preprocessing

In [None]:
# SibSp and Parch are now summarised by Family_Size, and Ticket is not used as
# a feature, so remove all three from both frames.
superseded_columns = ["SibSp", "Parch", "Ticket"]
for frame in (train_data, test_data):
    frame.drop(columns=superseded_columns, inplace=True)

In [None]:
# Preview the training frame after feature engineering and column removal.
train_data.head()

In [None]:
# List the distinct Embarked values; a NaN in the result indicates missing ports.
train_data["Embarked"].unique()

In [None]:
# Handle missing 'Embarked' values.
# Training rows without a port are simply dropped.
train_data.dropna(subset=["Embarked"], inplace=True)

# Test rows must never be dropped: the submission file is built with one
# prediction per row of test_data, so removing rows would leave passengers
# without a prediction. Impute with the most frequent training port instead.
most_common_port = train_data["Embarked"].mode()[0]
test_data["Embarked"] = test_data["Embarked"].fillna(most_common_port)

In [None]:
# Fill missing 'Age' values with the mean age of the training set.
# The same training statistic is applied to both frames so the test set is
# preprocessed exactly like the data the model is fitted on (the Fare
# imputation likewise uses a training-set statistic).
train_age_mean = train_data["Age"].mean()
train_data["Age"] = train_data["Age"].fillna(train_age_mean)
test_data["Age"] = test_data["Age"].fillna(train_age_mean)

In [None]:
# Impute missing test fares with the training-set median, and fill the derived
# Log_Fare column with the log1p of that same median so both stay consistent.
train_fare_median = train_data["Fare"].median()
test_data["Fare"] = test_data["Fare"].fillna(train_fare_median)
test_data["Log_Fare"] = test_data["Log_Fare"].fillna(np.log1p(train_fare_median))

In [None]:
# Encode Sex as an integer flag (male -> 0, female -> 1) on both frames.
# Any value missing from the mapping becomes NaN, so this cell is not safe to
# run twice on the same frames.
sex_codes = {"male": 0, "female": 1}
for frame in (train_data, test_data):
    frame["Sex"] = frame["Sex"].map(sex_codes)

In [None]:
# One-hot encode categorical variables.
# Encoding each frame independently lets get_dummies derive the levels (and the
# level removed by drop_first) from whatever values that frame happens to
# contain, so train and test could end up with different dummy columns. Pinning
# both frames to the category levels observed in the training data guarantees
# identical dummy columns and the same dropped baseline level.
categorical_columns = ["Embarked", "Age_Category"]
for column in categorical_columns:
    levels = sorted(train_data[column].dropna().unique())
    level_dtype = pd.CategoricalDtype(categories=levels)
    train_data[column] = train_data[column].astype(level_dtype)
    # A test value never seen in training becomes NaN and encodes as all zeros.
    test_data[column] = test_data[column].astype(level_dtype)

train_data = pd.get_dummies(train_data, columns=categorical_columns, drop_first=True)
test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)

In [None]:
# Confirm the final training columns, their dtypes, and that no nulls remain.
train_data.info()

In [None]:
# Same check for the test frame; its columns should cover every model feature.
test_data.info()

# Model Training and Evaluation

In [None]:
# Model inputs: the fixed numeric columns followed by every one-hot column
# produced for Embarked and Age_Category, in training-frame column order.
dummy_prefixes = ("Embarked_", "Age_Category_")
features = ["Pclass", "Sex", "Age", "Fare", "Family_Size", "Is_Alone", "Log_Fare"] + [
    name for name in train_data.columns if name.startswith(dummy_prefixes)
]

In [None]:
# Separate the target vector (Survived) from the feature matrix.
y = train_data["Survived"]
X = train_data.loc[:, features]

In [None]:
# Hold out 20% of the labelled data for validation. The split is stratified on
# the target so both parts keep the same class balance, and the fixed seed
# makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=14,
    stratify=y,
)

In [None]:
# Standardise features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train logistic regression model with scikit-learn's default hyperparameters
# on the standardised training split.
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

In [None]:
# Predict survival labels for the held-out validation split.
y_pred = logreg.predict(X_test_scaled)

# Visualizing the model's predictions versus actual outcomes

In [None]:
# Create confusion matrix for the validation split.
cm = confusion_matrix(y_test, y_pred, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=logreg.classes_)

# Draw on an explicitly created Axes. When no `ax` is passed, disp.plot() opens
# a figure of its own, which previously left the figure created with figsize
# blank and drew the matrix at the default size instead.
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=cm_ax)
cm_ax.set_title("Confusion Matrix for Validation Set")
cm_fig.savefig('confusion_matrix.png')
plt.show()

In [None]:
# Fraction of validation passengers whose survival label was predicted correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Logistic Regression Model Accuracy: {accuracy:.4f}')

In [None]:
# Per-class precision, recall, F1 and support on the validation split,
# printed beneath a heading line.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Generate ROC curve for the validation split.
# Pass an explicit Axes: without `ax`, from_estimator opens a figure of its own,
# which previously left the figure created with figsize blank and drew the
# curve at the default size instead.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
Logreg_roc = RocCurveDisplay.from_estimator(logreg, X_test_scaled, y_test, ax=roc_ax)
roc_ax.set_title("ROC Curve for Validation Set")
roc_fig.savefig('roc_curve.png')
plt.show()

In [None]:
# Prepare test data: pick the model's feature columns from the competition
# test frame and apply the already-fitted scaler (transform only, no refit).
# NOTE(review): this rebinds X_test / X_test_scaled, which until here held the
# validation split. Cells that pair X_test_scaled with y_test (e.g. the ROC
# curve) cannot be re-run after this one without first re-running the split
# and scaling cells.
X_test = test_data.loc[:, features]
X_test_scaled = scaler.transform(X_test)

In [None]:
# Predict survival labels for the competition test set (X_test_scaled now holds
# the scaled test features, not the validation split).
test_predictions = logreg.predict(X_test_scaled)

In [None]:
# Build the submission table (one row per test passenger: id plus predicted
# label) and write it without the DataFrame index.
output = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Survived": test_predictions,
    }
)
output.to_csv('submission.csv', index=False)