In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
# Load dataset
df = pd.read_csv('train.csv')

# Handle missing values
age_mean_10_50 = df[(df['Age'] >= 10) & (df['Age'] <= 50)]['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean_10_50)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop unwanted columns
df.drop(['Cabin', 'Ticket', 'PassengerId', 'Name', 'Fare'], axis=1, inplace=True)

# Encode Embarked (One-hot)
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# Scale Age
scaler = StandardScaler()
df[['Age']] = scaler.fit_transform(df[['Age']])

# Split into X and y
X = df.drop('Survived', axis=1)
y = df['Survived']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Load dataset
df = pd.read_csv('train.csv')

# Handle missing values
age_mean_10_50 = df[(df['Age'] >= 10) & (df['Age'] <= 50)]['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean_10_50)
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop unwanted columns
df.drop(['Cabin', 'Ticket', 'PassengerId', 'Name', 'Fare'], axis=1, inplace=True)

# Convert 'Sex' to numeric
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# One-hot encode 'Embarked'
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# Scale 'Age'
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['Age']] = scaler.fit_transform(df[['Age']])

# Define features and target
X = df.drop('Survived', axis=1)
y = df['Survived']

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

In [9]:
y_pred = model.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Accuracy
print("✅ Accuracy:", accuracy_score(y_test, y_pred))

# Confusion Matrix
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Classification Report
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))

# ROC AUC Score
print("\n📈 ROC AUC Score:", roc_auc_score(y_test, y_pred))

✅ Accuracy: 0.8044692737430168

🧮 Confusion Matrix:
 [[90 15]
 [20 54]]

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179


📈 ROC AUC Score: 0.7934362934362935


In [22]:
# Define the column names in the same order as your model
columns = X.columns.tolist()

# Example input: [Pclass, Sex, Age, SibSp, Parch, Embarked_Q, Embarked_S]
new_passenger = pd.DataFrame([[1, 0, 24, 0, 0, 0, 1]], columns=columns)

# Predict
prediction = model.predict(new_passenger)
probability = model.predict_proba(new_passenger)

print("Prediction (0 = did not survive, 1 = survived):", prediction[0])
print("Probability of survival:", round(probability[0][1] * 100, 2), "%")

Prediction (0 = did not survive, 1 = survived): 0
Probability of survival: 0.01 %


In [25]:
mean_age = df['Age'].mean()
std_age = df['Age'].std()

In [26]:
scaled_age_24 = (24 - mean_age) / std_age

In [27]:
columns = X.columns.tolist()

test_passenger_4 = pd.DataFrame([[1, 1, scaled_age_24, 0, 0, 0, 1]], columns=columns)

In [29]:
prediction4 = model.predict(test_passenger_4)
probability4 = model.predict_proba(test_passenger_4)

print("🧪 Test Case 4 – 24-year-old woman, 1st class")
print("Prediction (0 = did not survive, 1 = survived):", prediction4[0])
print("Probability of survival:", round(probability4[0][1] * 100, 2), "%")

🧪 Test Case 4 – 24-year-old woman, 1st class
Prediction (0 = did not survive, 1 = survived): 0
Probability of survival: 0.08 %
