In [38]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from joblib import dump, load
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader

In [39]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values and build the feature matrix used by the model.

    Works on a copy, so the caller's frame is left untouched.

    Steps:
      * fill missing ``Age`` and ``Fare`` with the column median and missing
        ``Embarked`` with ``'S'``;
      * derive ``FamilySize`` (``SibSp + Parch + 1``) and ``IsAlone``;
      * one-hot encode ``Sex`` and ``Embarked``, dropping the first level of each.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the columns ``Pclass``, ``Sex``, ``Age``, ``SibSp``,
        ``Parch``, ``Fare`` and ``Embarked``; ``Survived`` is optional.

    Returns
    -------
    pd.DataFrame
        The columns ``Pclass``, ``Age``, ``Fare``, ``FamilySize``, ``IsAlone``,
        ``Sex_male``, ``Embarked_Q``, ``Embarked_S`` (in that order), followed
        by ``Survived`` when the input has it.
    """
    df = df.copy()

    # Assign the imputed column back instead of calling fillna(inplace=True) on
    # a column selection: the in-place form is deprecated and does not update
    # the frame when pandas Copy-on-Write is active.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Pin the category levels so the dummy columns are always the same three,
    # even when a frame happens not to contain every level. Without this,
    # drop_first removes whichever level sorts first in *this* frame and the
    # column selection below can raise KeyError.
    df['Sex'] = pd.Categorical(df['Sex'], categories=['female', 'male'])
    df['Embarked'] = pd.Categorical(df['Embarked'], categories=['C', 'Q', 'S'])
    df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

    features = ['Pclass', 'Age', 'Fare', 'FamilySize', 'IsAlone',
                'Sex_male', 'Embarked_Q', 'Embarked_S']
    return df[features + (['Survived'] if 'Survived' in df.columns else [])]

In [40]:
# Load the raw training data; `df_train` is also reused by the plotting cell below.
df_train = pd.read_csv('../data/train.csv')

# Build the feature matrix with the `preprocess` function defined in the cell
# above. That function must not be redefined here: an imputation-only
# redefinition would shadow the feature-engineering version for every later
# cell, and `df_proc` has to be assigned in this cell so the notebook survives
# Restart & Run All.
df_proc = preprocess(df_train)

# Stratify on the target so both splits keep the same survival ratio.
train, val = train_test_split(df_proc, test_size=0.2, random_state=42, stratify=df_proc['Survived'])
os.makedirs('../data/processed', exist_ok=True)
train.to_csv('../data/processed/train_processed.csv', index=False)
val.to_csv('../data/processed/val_processed.csv', index=False)

print("Processed datasets saved to 'train_processed.csv' and 'val_processed.csv'")

Processed datasets saved to 'train_processed.csv' and 'val_processed.csv'


In [41]:
# Fit the classifier on the processed training split saved earlier.
train = pd.read_csv('../data/processed/train_processed.csv')
X = train.drop('Survived', axis=1)
y = train['Survived']

# Fixed random_state keeps the fitted forest reproducible across runs.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# joblib.dump does not create parent directories; make sure the target folder
# exists, as is done for the other output folders in this notebook.
os.makedirs('../models', exist_ok=True)
dump(model, '../models/titanic_model.pkl')


['../models/titanic_model.pkl']

In [42]:
# Reload the persisted model and score it on the held-out validation split.
model = load('../models/titanic_model.pkl')
val = pd.read_csv('../data/processed/val_processed.csv')
y_val = val['Survived']
X_val = val.drop(columns='Survived')

preds = model.predict(X_val)

# `acc`, `cm` and `report` are kept as notebook-level names; the PDF report
# cell further down reads them.
acc, cm, report = (
    accuracy_score(y_val, preds),
    confusion_matrix(y_val, preds),
    classification_report(y_val, preds),
)

for label, value in (
    ("Accuracy:", acc),
    ("Confusion Matrix:\n", cm),
    ("Classification Report:\n", report),
):
    print(label, value)


Accuracy: 0.8044692737430168
Confusion Matrix:
 [[95 15]
 [20 49]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.84       110
           1       0.77      0.71      0.74        69

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [43]:
# Exploratory plots on the raw training data, written to ../visuals.
# Each plot gets its own explicit figure that is closed after saving, so no
# empty leftover figure is rendered as cell output and no figure stays open.
df = pd.read_csv('../data/train.csv')
os.makedirs('../visuals', exist_ok=True)

fig, ax = plt.subplots()
sns.countplot(x='Survived', hue='Sex', data=df, ax=ax)
ax.set_title('Survival by Gender')
fig.savefig('../visuals/survival_by_gender.png')
plt.close(fig)

fig, ax = plt.subplots()
sns.countplot(x='Survived', hue='Pclass', data=df, ax=ax)
ax.set_title('Survival by Class')
fig.savefig('../visuals/survival_by_class.png')
plt.close(fig)

fig, ax = plt.subplots()
sns.histplot(data=df, x='Age', hue='Survived', kde=True, bins=30, ax=ax)
ax.set_title('Age Distribution by Survival')
fig.savefig('../visuals/age_distribution.png')
plt.close(fig)

# Numeric encoding used only for the correlation heatmap; the integer codes
# for Sex and Embarked are arbitrary labels, not the model's one-hot features.
df_heatmap = df.copy()
df_heatmap['Sex'] = df_heatmap['Sex'].map({'male': 1, 'female': 0})
df_heatmap['Embarked'] = df_heatmap['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
df_heatmap['Age'] = df_heatmap['Age'].fillna(df_heatmap['Age'].median())
df_heatmap['Fare'] = df_heatmap['Fare'].fillna(df_heatmap['Fare'].median())
df_heatmap['Embarked'] = df_heatmap['Embarked'].fillna(0)  # missing port -> 'S' (code 0)

corr = df_heatmap[['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Parch', 'Embarked']].corr()
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation')
fig.savefig('../visuals/feature_correlation.png')
plt.close(fig)

<Figure size 640x480 with 0 Axes>

In [44]:
# Score the Kaggle test set with the fitted model and save the predictions.
df_test = pd.read_csv('../data/test.csv')
df_test_proc = preprocess(df_test)

if 'Survived' in df_test_proc.columns:
    df_test_proc = df_test_proc.drop('Survived', axis=1)

# Columns the model was fitted on, in fit order.
expected_cols = list(model.feature_names_in_)
missing_cols = [col for col in expected_cols if col not in df_test_proc.columns]

# A one-hot column can legitimately be absent when the test set lacks that
# category, and 0 is then the correct value. Any other missing feature means
# preprocessing did not run as expected; zero-filling it would silently produce
# meaningless predictions, so fail loudly instead.
dummy_prefixes = ('Sex_', 'Embarked_')
unrecoverable = [col for col in missing_cols if not col.startswith(dummy_prefixes)]
if unrecoverable:
    raise ValueError(
        f"preprocess() did not produce required model features: {unrecoverable}"
    )
for col in missing_cols:
    df_test_proc[col] = 0

df_test_proc = df_test_proc[expected_cols]  # Reorder to match the fitted model
df_test['Survived_Pred'] = model.predict(df_test_proc)

df_test[['PassengerId', 'Survived_Pred']].to_csv('../data/predictions.csv', index=False)
print("Predictions saved to 'predictions.csv'")


Predictions saved to 'predictions.csv'


In [49]:


# Make sure the output folder for the figures is present
os.makedirs('../visuals', exist_ok=True)

# Plots 1 and 2: survival counts broken down by gender and by passenger class
for column, axis_label, heading, out_path in [
    ('Sex', 'Sex', 'Survival by Gender', '../visuals/survival_by_gender.png'),
    ('Pclass', 'Passenger Class', 'Survival by Class', '../visuals/survival_by_class.png'),
]:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df_train, x=column, hue='Survived', ax=ax)
    ax.set_title(heading)
    ax.set_xlabel(axis_label)
    ax.set_ylabel('Count')
    ax.legend(title='Survived')
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)

# Plot 3: histogram of passenger ages with a KDE overlay
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df_train, x='Age', bins=30, kde=True, ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig('../visuals/age_distribution.png')
plt.close(fig)

# Plot 4: correlation heatmap restricted to numeric columns, so text columns
# never reach .corr()
numeric_df = df_train.select_dtypes(include='number')
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
fig.savefig('../visuals/feature_correlation.png')
plt.close(fig)

print("Visualizations saved in '../visuals' folder.")


Visualizations saved in '../visuals' folder.


In [50]:
# Assemble a PDF report: metrics on the first page, then one page per saved figure.
# Reads `acc`, `cm` and `report` computed by the evaluation cell.
os.makedirs('reports', exist_ok=True)

c = canvas.Canvas("reports/titanic_model_report.pdf", pagesize=letter)
width, height = letter

c.setFont("Helvetica-Bold", 18)
c.drawString(50, height - 50, "Titanic Classification Model Report")
c.setFont("Helvetica", 12)
c.drawString(50, height - 100, f"Accuracy: {acc:.4f}")

c.drawString(50, height - 130, "Confusion Matrix:")
# Tabular text is drawn in a monospaced font so its columns line up.
c.setFont("Courier", 10)
for i, row in enumerate(cm):
    c.drawString(70, height - 150 - i*15, str(row))

c.setFont("Helvetica", 12)
c.drawString(50, height - 200, "Classification Report:")
c.setFont("Courier", 10)
for i, line in enumerate(report.split('\n')):
    # Keep the leading spaces: they carry the table's column alignment.
    c.drawString(70, height - 220 - i*15, line.rstrip())

img_paths = [
    '../visuals/survival_by_gender.png',
    '../visuals/survival_by_class.png',
    '../visuals/age_distribution.png',
    '../visuals/feature_correlation.png'
]

# Bounding box for each figure: 500 pt wide, from y=200 up to 50 pt below the
# top edge. Without an explicit height the box takes the image's pixel height,
# and a tall image is then centred partly above the page.
image_box_height = height - 250

for img_path in img_paths:
    if os.path.exists(img_path):
        c.showPage()
        c.drawImage(ImageReader(img_path), 50, 200, width=500, height=image_box_height,
                    preserveAspectRatio=True, anchor='s', mask='auto')
        # showPage() resets the font, so set it again before the caption.
        c.setFont("Helvetica", 12)
        c.drawString(50, 180, os.path.basename(img_path).replace('_', ' ').replace('.png', '').title())
    else:
        print(f"⚠️ Warning: Image not found - {img_path}")

c.save()
print("PDF Report generated at: reports/titanic_model_report.pdf")


📄 PDF Report generated at: reports/titanic_model_report.pdf
