In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the cleaned train/test CSVs (train first, then test)
print("Loading preprocessed Titanic data...")
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('titanic_train_cleaned.csv', 'titanic_test_cleaned.csv')
)

# Report (rows, columns) of each split as a quick sanity check
for split_name, split_frame in (("Train", train_data), ("Test", test_data)):
    print(f"{split_name} data shape: {split_frame.shape}")

Loading preprocessed Titanic data...
Train data shape: (334, 33)
Test data shape: (84, 33)


In [None]:
# Split each frame into its feature matrix (X) and its label column (y)
target_col = 'Survived'
X_train, y_train = train_data.drop(columns=target_col), train_data[target_col]
X_test, y_test = test_data.drop(columns=target_col), test_data[target_col]

# Summarise the dimensions the model will be trained and evaluated on
n_features = X_train.shape[1]
n_train_rows, n_test_rows = len(X_train), len(X_test)
print(f"\nFeatures: {n_features}")
print(f"Training samples: {n_train_rows}")
print(f"Test samples: {n_test_rows}")


Features: 32
Training samples: 334
Test samples: 84


In [None]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused unchanged on the test split.
print("\nApplying feature scaling...")
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Configure (but do not yet fit) the Logistic Regression estimator
print("\nTraining Logistic Regression model...")
model_params = {
    'random_state': 42,
    'max_iter': 1000,       # generous iteration budget so the solver can converge
    'solver': 'liblinear',  # good for small datasets
}
logistic_model = LogisticRegression(**model_params)


Applying feature scaling...

Training Logistic Regression model...


In [None]:
# Learn the model weights from the standardised training split.
# The call is left as the cell's last expression so the fitted estimator is displayed.
logistic_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predict class labels for both splits (train first, then test)
print("Making predictions...")
y_train_pred, y_test_pred = (
    logistic_model.predict(scaled_features)
    for scaled_features in (X_train_scaled, X_test_scaled)
)

Making predictions...


In [None]:
# Fraction of correctly labelled rows in each split (train first, then test)
train_accuracy, test_accuracy = (
    accuracy_score(true_labels, predicted_labels)
    for true_labels, predicted_labels in ((y_train, y_train_pred), (y_test, y_test_pred))
)

In [None]:
# Display results
print("\n" + "="*50)
print("LOGISTIC REGRESSION RESULTS")
print("="*50)
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Signed gap: positive means the model scores better on the data it was fitted on.
# Only that direction is a symptom of overfitting, so the check below must not
# use the absolute value.
accuracy_gap = train_accuracy - test_accuracy
print(f"Accuracy Difference: {abs(accuracy_gap):.4f}")

if accuracy_gap > 0.05:
    print("⚠️  Large gap suggests possible overfitting")
elif accuracy_gap < -0.05:
    # Test noticeably better than train is not overfitting; it usually points
    # at the split itself (small or unrepresentative test set, leakage).
    print("⚠️  Test accuracy is well above training accuracy - check how the split was made")
elif min(train_accuracy, test_accuracy) >= 1.0:
    # A perfect score on both splits is suspicious rather than reassuring:
    # the target may be derivable directly from one of the features.
    print("⚠️  Perfect accuracy on both splits - verify that no feature leaks the target")
else:
    print("✅ Good balance between training and test performance")


LOGISTIC REGRESSION RESULTS
Training Accuracy: 1.0000 (100.00%)
Test Accuracy: 1.0000 (100.00%)
Accuracy Difference: 0.0000
✅ Good balance between training and test performance


In [None]:
# Per-class precision / recall / F1 on the held-out split.
# Display names are listed in label order: first class, then second class.
class_display_names = ['Not Survived', 'Survived']
print("\nClassification Report (Test Set):")
test_report = classification_report(y_test, y_test_pred, target_names=class_display_names)
print(test_report)


Classification Report (Test Set):
              precision    recall  f1-score   support

Not Survived       1.00      1.00      1.00        53
    Survived       1.00      1.00      1.00        31

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [None]:
# Confusion Matrix
print("\nConfusion Matrix (Test Set):")
# Pin the label order to [0, 1] so the result is always a 2x2 matrix laid out as
# [[tn, fp], [fn, tp]], even if one class happens to be absent from both the
# true and the predicted labels. Without this the matrix can collapse to 1x1.
cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
print(cm)


Confusion Matrix (Test Set):
[[53  0]
 [ 0 31]]


In [None]:
# Unpack the 2x2 matrix row by row: first row is the actual-negative class,
# second row is the actual-positive class.
(tn, fp), (fn, tp) = cm

print("\nConfusion Matrix Breakdown:")
breakdown_rows = (
    ("True Negatives (Correctly predicted not survived)", tn),
    ("False Positives (Incorrectly predicted survived)", fp),
    ("False Negatives (Incorrectly predicted not survived)", fn),
    ("True Positives (Correctly predicted survived)", tp),
)
for description, count in breakdown_rows:
    print(f"{description}: {count}")


Confusion Matrix Breakdown:
True Negatives (Correctly predicted not survived): 53
False Positives (Incorrectly predicted survived): 0
False Negatives (Incorrectly predicted not survived): 0
True Positives (Correctly predicted survived): 31


In [None]:
# Derive precision / recall / F1 for the positive ("Survived") class from the
# confusion-matrix counts, falling back to 0 whenever a denominator is empty.
predicted_positives = tp + fp
actual_positives = tp + fn

if predicted_positives > 0:
    precision = tp / predicted_positives
else:
    precision = 0

if actual_positives > 0:
    recall = tp / actual_positives
else:
    recall = 0

precision_plus_recall = precision + recall
if precision_plus_recall > 0:
    f1_score = 2 * (precision * recall) / precision_plus_recall
else:
    f1_score = 0

print("\nAdditional Metrics:")
print(f"Precision (Survived): {precision:.4f}")
print(f"Recall (Survived): {recall:.4f}")
print(f"F1-Score (Survived): {f1_score:.4f}")


Additional Metrics:
Precision (Survived): 1.0000
Recall (Survived): 1.0000
F1-Score (Survived): 1.0000


In [None]:
# First row of the fitted weight matrix, paired below with the column labels
# of the (unscaled) training frame it was fitted on
coefficients = logistic_model.coef_[0, :]
feature_names = X_train.columns

In [None]:
# Rank features by the magnitude of their coefficient (largest first)
feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
)

print("\nTop 10 Most Important Features:")
top_features = feature_importance.head(10)
print(top_features.to_string(index=False))


Top 10 Most Important Features:
            Feature  Coefficient  Abs_Coefficient
                Sex     2.097029         2.097029
           Title_Mr    -1.510346         1.510346
         Title_Miss     1.175499         1.175499
          Title_Mrs     1.083645         1.083645
       Title_Master    -0.524150         0.524150
         Title_Rare    -0.185033         0.185033
FareGroup_Very_High     0.134079         0.134079
                Age    -0.084856         0.084856
             Deck_C     0.081104         0.081104
             Deck_G     0.076679         0.076679


In [None]:
# Compare the share of positives in the true test labels with the share of
# positives among the model's test predictions
actual_survival_rate, predicted_survival_rate = y_test.mean(), y_test_pred.mean()
survival_rate_gap = abs(actual_survival_rate - predicted_survival_rate)

print(f"Actual survival rate in test set: {actual_survival_rate:.3f} ({actual_survival_rate*100:.1f}%)")
print(f"Predicted survival rate: {predicted_survival_rate:.3f} ({predicted_survival_rate*100:.1f}%)")
print(f"Difference: {survival_rate_gap:.3f}")

Actual survival rate in test set: 0.369 (36.9%)
Predicted survival rate: 0.369 (36.9%)
Difference: 0.000


In [None]:
# Probability assigned to the positive (survival) class for every test row
test_class_proba = logistic_model.predict_proba(X_test_scaled)
y_test_proba = test_class_proba[:, 1]

# "Confidence" of a row = probability of whichever class the model favours,
# i.e. the larger of p and 1 - p; report its mean over the test split
favoured_class_proba = np.maximum(y_test_proba, 1 - y_test_proba)
avg_confidence = favoured_class_proba.mean()
print(f"\nAverage model confidence: {avg_confidence:.3f}")


Average model confidence: 0.993


In [None]:
# Show some example predictions
print("\nSample Predictions:")

# Seeded generator so the same rows are shown on every Restart & Run All
# (42 matches the random_state used for the model).
sample_rng = np.random.default_rng(42)

# Never ask for more rows than exist: choice(..., replace=False) raises when
# the requested size exceeds the population.
n_examples = min(5, len(y_test))
sample_indices = sample_rng.choice(len(y_test), size=n_examples, replace=False)

for i in sample_indices:
    actual = y_test.iloc[i]         # positional lookup; y_test keeps its original index
    predicted = y_test_pred[i]
    probability = y_test_proba[i]   # probability of the positive (survival) class
    status = "✅" if actual == predicted else "❌"
    print(f"{status} Actual: {actual}, Predicted: {predicted}, Survival Probability: {probability:.3f}")


Sample Predictions:
✅ Actual: 0, Predicted: 0, Survival Probability: 0.007
✅ Actual: 0, Predicted: 0, Survival Probability: 0.006
✅ Actual: 0, Predicted: 0, Survival Probability: 0.006
✅ Actual: 1, Predicted: 1, Survival Probability: 0.995
✅ Actual: 0, Predicted: 0, Survival Probability: 0.011


In [None]:
# Summary
print("\n" + "="*50)
print("SUMMARY")
print("="*50)

n_test = len(y_test)
# Recover the number of correct predictions from the accuracy ratio.
# round() is required here: accuracy * n can land a hair below the true integer
# (e.g. 0.58 * 50 == 28.999999999999996), and a bare int() would truncate that
# to one fewer than the real count.
n_correct = int(round(test_accuracy * n_test))

# Qualitative label for the headline accuracy
if test_accuracy > 0.8:
    strength = 'strong'
elif test_accuracy > 0.7:
    strength = 'moderate'
else:
    strength = 'weak'

print(f" Model Accuracy: {test_accuracy:.1%}")
print(f" Correctly classified {n_correct} out of {n_test} passengers")
print(f" Model shows {strength} predictive performance")

if test_accuracy > 0.75:
    print(" Good performance! The model captures survival patterns well.")
elif test_accuracy > 0.65:
    print(" Decent performance. Consider feature engineering or different algorithms.")
else:
    print(" Room for improvement. Try ensemble methods or more features.")

print("\n Logistic Regression training and evaluation complete!")


SUMMARY
 Model Accuracy: 100.0%
 Correctly classified 84 out of 84 passengers
 Model shows strong predictive performance
 Good performance! The model captures survival patterns well.

 Logistic Regression training and evaluation complete!
