In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import joblib


In [2]:
# Load dataset: read the passenger CSV from the working directory.
# Edit the filename below if your copy of the data is stored under another name.
df = pd.read_csv('tested.csv')
print(f'Data shape: {df.shape}')
df.head()

Data shape: (418, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# Column overview: list every column name, then count the missing values in each,
# so identifier-like or leakage-prone fields are easy to spot.
print(f'Columns: {list(df.columns)}')
print('\nSample missing counts:\n', df.isna().sum())

Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Sample missing counts:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [4]:
# Basic cleaning & feature engineering (safe)
# Works on a copy of the raw frame and adds four derived columns:
#   Title      - honorific parsed from Name, e.g. "Kelly, Mr. James" -> "Mr"
#   FamilySize - SibSp + Parch + 1 (the +1 counts the passenger)
#   IsAlone    - 1 when FamilySize == 1, otherwise 0
#   Deck       - first character of Cabin, 'U' when Cabin is missing
df2 = df.copy()

# Title extraction
if 'Name' in df2.columns:
    # Raw string: '\s' and '\.' are regex escapes, not Python string escapes.
    # A non-raw literal here triggers an "invalid escape sequence" SyntaxWarning.
    df2['Title'] = df2['Name'].str.extract(r',\s*([^\.]+)\.')[0].str.strip()
    # Fold alternate spellings into the common form.
    df2['Title'] = df2['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
    rare_titles = ['Lady','Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona']
    # Bucket the listed honorifics into a single 'Rare' category (vectorised;
    # names with no parsable title stay NaN, exactly as before).
    df2['Title'] = df2['Title'].mask(df2['Title'].isin(rare_titles), 'Rare')
else:
    df2['Title'] = 'Unknown'

# Family features
if all(c in df2.columns for c in ['SibSp','Parch']):
    df2['FamilySize'] = df2['SibSp'] + df2['Parch'] + 1
    df2['IsAlone'] = (df2['FamilySize'] == 1).astype(int)
else:
    df2['FamilySize'] = 1
    df2['IsAlone'] = 1

# Deck from Cabin (useful but treat missing as 'U')
if 'Cabin' in df2.columns:
    # Fill missing cabins BEFORE slicing so NaN is never stringified to 'nan'
    # (and no genuine value starting with 'n' gets rewritten to 'U').
    # The trailing fillna covers empty-string cabins, whose first character is NaN.
    df2['Deck'] = df2['Cabin'].fillna('U').astype(str).str[0].fillna('U')
else:
    df2['Deck'] = 'U'

# Ensure target column exists
if 'Survived' not in df2.columns:
    raise ValueError("Dataset must contain 'Survived' column as target.")

df2.shape

  df2['Title'] = df2['Name'].str.extract(',\s*([^\.]+)\.')[0].str.strip()


(418, 16)

In [5]:
# Choose the model features: every column except identifier-like fields and the target.
# IMPORTANT: 'Survived' must never end up in X.
leakage_columns = ['PassengerId','Ticket','Name','Cabin']  # identifiers / columns that may leak
features = [
    name for name in df2.columns
    if name != 'Survived' and name not in leakage_columns
]
print(f'Features used ({len(features)}):', features)

Features used (11): ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title', 'FamilySize', 'IsAlone', 'Deck']


In [6]:
# Prepare X and y
X = df2[features].copy()
y = df2['Survived'].copy()

# Quick check: ensure Survived not in X
assert 'Survived' not in X.columns, "Target column should not be present in features (leakage)."

# Check for any column that is identical to the target (defensive).
# The raise must NOT sit inside a `try ... except Exception: pass`: that would
# swallow the very error this check exists to report. Series.equals returns
# False rather than raising when the two series differ, so no guard is needed.
for col in X.columns:
    if X[col].equals(y):
        raise ValueError(f"Column '{col}' is identical to the target — this is leakage.")

X.shape, y.shape

((418, 11), (418,))

In [7]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print(f'Train: {X_train.shape} Test: {X_test.shape}')

Train: (334, 11) Test: (84, 11)


In [8]:
# Identify numeric and categorical columns
# 'number' matches every numeric dtype (e.g. int32 or float32), not only
# int64/float64. A numeric column missing from both lists would be silently
# discarded by a ColumnTransformer that uses the default remainder='drop'.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()

print('Numeric columns:', numeric_cols)
print('Categorical columns:', cat_cols)

Numeric columns: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone']
Categorical columns: ['Sex', 'Embarked', 'Title', 'Deck']


In [10]:
# Preprocessing: one sub-pipeline per column family, combined by a ColumnTransformer.

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode to a dense array; categories unseen during fit are ignored.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column list to its matching sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols),
])

In [11]:
# Model: the preprocessing step followed by a seeded 200-tree random forest.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=200)),
])

# Shuffled, stratified 5-fold cross-validation over the full X / y.
# Suspiciously high scores here are a hint of overfitting or leakage.
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)
cv_scores = cross_val_score(clf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print('5-fold CV accuracy scores:', cv_scores)
print('CV mean accuracy:', cv_scores.mean())

5-fold CV accuracy scores: [1. 1. 1. 1. 1.]
CV mean accuracy: 1.0


In [12]:
# Train on the training split only, then predict the held-out test split
# (Pipeline.fit returns the fitted pipeline, so the two calls can be chained).
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Report accuracy, the confusion matrix and per-class precision/recall.
print('Test accuracy:', accuracy_score(y_test, y_pred))
print('\nConfusion matrix:\n', confusion_matrix(y_test, y_pred))
print('\nClassification report:\n', classification_report(y_test, y_pred))

Test accuracy: 1.0

Confusion matrix:
 [[53  0]
 [ 0 31]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        53
           1       1.00      1.00      1.00        31

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84



In [15]:
# Corrected, leakage-safe Titanic modeling pipeline
# Paste into a notebook cell or run as a script.
import pandas as pd
import numpy as np
import inspect
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# ---- Config ----
# Every tunable value used by this cell, gathered in one place.
TARGET = 'Survived'                                       # label column
DATA_PATH = 'tested.csv'                                  # input CSV; change to your real file if needed
IDENTIFIERS = ['PassengerId', 'Name', 'Ticket', 'Cabin']  # identifier columns removed from the features
CV_SPLITS = 5                                             # number of cross-validation folds
RANDOM_STATE = 42                                         # seed shared by the split, the CV and the model

# ---- Helper: instantiate OneHotEncoder compatibly with sklearn versions ----
def make_onehot(**kwargs):
    """Build a OneHotEncoder using whichever sparse-output keyword this sklearn accepts.

    The dense/sparse switch was renamed from ``sparse`` to ``sparse_output``
    (the new name was added in scikit-learn 1.2; the old one was later removed).
    Callers may pass either spelling; it is translated to the one the installed
    ``OneHotEncoder.__init__`` actually declares.

    Parameters
    ----------
    **kwargs
        Keyword arguments for ``OneHotEncoder``. ``sparse_output`` (preferred)
        or ``sparse`` selects sparse vs. dense output; when neither is given,
        dense output (``False``) is requested. All other keywords, such as
        ``handle_unknown``, are forwarded unchanged.

    Returns
    -------
    OneHotEncoder
        A new, unfitted encoder.
    """
    # Remove BOTH spellings up front, so the one this sklearn version does not
    # know can never reach the constructor (that would raise TypeError).
    legacy_value = kwargs.pop('sparse', False)
    want_sparse = kwargs.pop('sparse_output', legacy_value)

    accepted = inspect.signature(OneHotEncoder.__init__).parameters
    if 'sparse_output' in accepted:
        kwargs['sparse_output'] = want_sparse
    elif 'sparse' in accepted:
        kwargs['sparse'] = want_sparse
    # pass the remaining kwargs (like handle_unknown) straight through
    return OneHotEncoder(**kwargs)

# ---- Load data and basic checks ----
# Read the CSV, report its shape and columns, and stop early when the label
# column is absent, since nothing below can run without it.
df = pd.read_csv(DATA_PATH)
print(f"DATA SHAPE: {df.shape}")
print(f"COLUMNS: {list(df.columns)}")
if not (TARGET in df.columns):
    raise SystemExit(f"Target column '{TARGET}' not found in {DATA_PATH} — aborting.")

# Per-column count of missing values.
print("\nMissing per column:\n", df.isna().sum())

# ---- Detect perfect predictors (columns that deterministically map to the target) ----
# A column is reported when each of its distinct values (a missing value counts
# as a value of its own) co-occurs with at most one distinct target value.
perfect_predictors = []
for col in df.columns:
    if col == TARGET:
        continue
    nunique = df[col].nunique(dropna=False)
    # only check columns with reasonable cardinality: a near-unique column
    # (ids, free text) has one row per value and would trivially "map" to the target
    if nunique > 200:
        continue
    # dropna=False keeps rows whose `col` value is missing as their own group.
    # groupby's default discards them, so a mostly-empty column could be flagged
    # as "perfect" on the strength of its few non-null rows alone.
    grouped = df.groupby(col, dropna=False)[TARGET].nunique(dropna=False)
    # if for every value of col the target has <=1 unique value => perfect mapping
    if (grouped <= 1).all():
        perfect_predictors.append((col, nunique))

print("\nPerfect predictors (column, unique_values):", perfect_predictors)

# ---- Prepare features and drop identifiers & perfect predictors (to avoid leakage) ----
# Start from the identifier columns that are actually present in the frame.
drop_columns = [name for name in IDENTIFIERS if name in df.columns]
# Columns flagged as perfect predictors leak the label, so they are dropped too.
perfect_cols = [pair[0] for pair in perfect_predictors]
if perfect_cols:
    print("\nDropping perfect predictor columns (these leak target):", perfect_cols)
    drop_columns.extend(perfect_cols)

# The target is always removed from X; errors='ignore' tolerates names that are absent.
X = df.drop(columns=[TARGET, *drop_columns], errors='ignore').copy()
y = df[TARGET].copy()

print(f"\nFinal feature set ({X.shape[1]} columns):\n", X.columns.tolist())

# defensive check: ensure no feature equals target
# The raise is deliberately NOT wrapped in `try ... except Exception: pass`:
# that would swallow the RuntimeError and turn this check into a no-op.
# Series.equals returns False (it does not raise) when the series differ.
for col in X.columns:
    if X[col].reset_index(drop=True).equals(y.reset_index(drop=True)):
        raise RuntimeError(f"Column '{col}' is identical to target — leakage detected. Remove it.")

# ---- Train/test split (stratified) ----
# 20% of the rows are held out; stratifying on y preserves the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
print(f"\nTrain shape: {X_train.shape} Test shape: {X_test.shape}")

# ---- Detect numeric vs categorical ----
# Column lists are derived from dtypes and later routed to separate transformers.
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
print("\nNumeric columns:", numeric_cols)
print("Categorical columns:", cat_cols)

# ---- Build preprocessing pipelines ----
# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then dense one-hot encoding
# built through make_onehot so the right sparse keyword is used.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', make_onehot(sparse_output=False, handle_unknown='ignore')),
])

# Any column that appears in neither list is dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder='drop',
)

# ---- Full pipeline with classifier ----
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=200)),
])

# ---- Cross-validation (to detect overfitting / estimate performance) ----
# Shuffled, stratified folds over the full X / y.
cv = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True, n_splits=CV_SPLITS)
cv_scores = cross_val_score(clf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(f"\n{CV_SPLITS}-fold stratified CV accuracy scores:", cv_scores)
print("CV mean accuracy:", cv_scores.mean())

# ---- Fit on train and evaluate on test ----
# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print("\nTEST SET RESULTS")
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

# ---- Save pipeline to disk ----
# Persists the fitted pipeline (preprocessing + model) as a single artifact.
joblib.dump(clf, "titanic_pipeline_fixed.joblib")
print("\nSaved pipeline to 'titanic_pipeline_fixed.joblib'")

# ---- Extra: If perfect predictors were found, print a small summary for debugging ----
# For each flagged column, list the distinct (column value, target value) pairs.
if perfect_predictors:
    print("\nDetails for perfect predictors (value -> unique Survived values):")
    for col, _ in perfect_predictors:
        # Named `value_map`, not `display`: this code runs at notebook top level,
        # where assigning to `display` would shadow IPython's display() helper
        # for every cell executed afterwards.
        value_map = df[[col, TARGET]].drop_duplicates()
        try:
            value_map = value_map.sort_values(by=col)
        except TypeError:
            # Mixed-type values cannot be ordered; show the pairs unsorted
            # instead of silently skipping the column.
            pass
        print(f"\n--- {col} ---")
        print(value_map.to_string(index=False))

# ---- If needed: quick experiment that explicitly EXCLUDES the leaking columns ----
# NOTE: do NOT keep Sex as a feature in the final model if it is a perfect predictor in your file.
# To build a feature set that drops every leaking column (and the identifiers) explicitly:
# X_noleak = df.drop(columns=[TARGET] + perfect_cols + IDENTIFIERS, errors='ignore')
# (then repeat preprocessing and training)


DATA SHAPE: (418, 12)
COLUMNS: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Missing per column:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Perfect predictors (column, unique_values): [('Sex', 2)]

Dropping perfect predictor columns (these leak target): ['Sex']

Final feature set (6 columns):
 ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

Train shape: (334, 6) Test shape: (84, 6)

Numeric columns: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical columns: ['Embarked']

5-fold stratified CV accuracy scores: [0.64285714 0.72619048 0.63095238 0.61445783 0.6746988 ]
CV mean accuracy: 0.6578313253012048

TEST SET RESULTS
Test accuracy: 0.6190476190476191

Confusion matrix:
 [[39 14]
 [18 13]]

Classificatio