In [1]:
import pandas as pd

In [2]:
# Read the Titanic passenger records from the working directory into `df`.
csv_name = "titanic.csv"
df = pd.read_csv(csv_name)

# Show the first five rows as plain text.
first_rows = df.head(n=5)
print(first_rows)


   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  


In [3]:
import pandas as pd
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# Ask for the CSV through a native file dialog.
# NOTE: the dataset now depends on an interactive choice, so a plain
# "Run All" cannot reproduce this cell without someone picking the file.

# The Tk root exists only to parent the dialog: hide it straight away and
# destroy it once the dialog closes so no invisible window lingers in the kernel.
root = Tk()
root.withdraw()
try:
    file_path = askopenfilename(title="Select CSV File")
finally:
    root.destroy()

# A cancelled dialog yields an empty value; fail with a clear message instead
# of handing that to read_csv.
if not file_path:
    raise FileNotFoundError("No CSV file was selected.")

df = pd.read_csv(file_path)

# Report the path only after the file has actually been read.
print("Loaded file:", file_path)
print("Dataset Shape:", df.shape)
df.head()


If file is local → Upload now


NameError: name 'files' is not defined

In [None]:
# Columns treated as not useful for survival prediction.
cols_to_drop = ["Name", "Ticket", "Cabin"]

# Build a new frame without whichever of those columns are actually present;
# the previously loaded object itself is not modified.
present = [col for col in cols_to_drop if col in df.columns]
df = df.drop(columns=present)

df.head()


In [None]:
# Fill missing ages with the column median.
# The filled column is assigned back to df: calling fillna(..., inplace=True)
# on df["Age"] is chained assignment, which pandas warns about and which does
# not update df once Copy-on-Write is enabled.
if "Age" in df.columns:
    df["Age"] = df["Age"].fillna(df["Age"].median())

# Fill missing Embarked values with the mode (most common value).
if "Embarked" in df.columns:
    embarked_mode = df["Embarked"].mode()
    # mode() is empty when every value is missing; nothing to fill with then.
    if not embarked_mode.empty:
        df["Embarked"] = df["Embarked"].fillna(embarked_mode.iloc[0])

# Remaining missing-value count per column.
print(df.isna().sum())


In [None]:
# Target: the "Survived" column.
y = df["Survived"]

# Features: every other column of df.
X = df.drop(columns=["Survived"])


In [None]:
# Candidate columns for each preprocessing branch, keeping only the ones that
# X actually contains (candidate order is preserved).
numeric_features = [
    col for col in ("Age", "SibSp", "Parch", "Fare") if col in X.columns
]
categorical_features = [
    col for col in ("Pclass", "Sex", "Embarked") if col in X.columns
]

print("Numeric:", numeric_features)
print("Categorical:", categorical_features)


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric branch: standard scaling only.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: dense one-hot encoding with handle_unknown="ignore".
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each column group to its branch.
# NOTE: a later cell in this notebook reassigns numeric_transformer,
# categorical_transformer and preprocessor with imputation steps added.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


In [None]:
# Inspect missing values in the full frame, the feature matrix and, when they
# exist, the X_train / X_test splits.
import pandas as pd

print("Missing in entire dataset:\n", df.isna().sum())
print("\nMissing in X (features):\n", X.isna().sum())

# X_train / X_test are only created by the train/test split cell further down,
# so on a fresh kernel they are not defined yet when this cell runs. Report
# them when available instead of failing with a NameError.
for split_name in ("X_train", "X_test"):
    split_frame = globals().get(split_name)
    if split_frame is None:
        print(f"\n{split_name} is not defined yet; run the train/test split cell first.")
    else:
        print(f"\nMissing in {split_name}:\n", split_frame.isna().sum())


In [None]:
# Rebuild the preprocessing objects with an imputation step at the front of
# each branch (this reassigns the names defined by the earlier pipeline cell).
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: median imputation, then standard scaling.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: most-frequent imputation, then dense one-hot encoding.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(categorical_steps)

# Columns not listed in either feature group are dropped.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

print("Preprocessor with imputation built.")


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate log_reg_model on the held-out split.
# This cell sits above the cell that creates X_test / y_test and fits
# log_reg_model, so on a fresh kernel those names do not exist yet when it
# runs. Skip with a message in that case instead of raising NameError.
required_names = ("log_reg_model", "X_test", "y_test")
missing_names = [name for name in required_names if name not in globals()]

if missing_names:
    print(
        "Skipping evaluation; not defined yet:",
        ", ".join(missing_names),
        "- run the training cell first.",
    )
else:
    y_pred = log_reg_model.predict(X_test)

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # Heatmap of the confusion matrix computed from (y_test, y_pred).
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5,4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# (Re)create the split so X_train / X_test / y_train / y_test are in memory:
# 20% of the rows are held out, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Chain the preprocessor and a logistic-regression classifier, then fit the
# whole pipeline on the training split (fit returns the pipeline itself).
log_reg_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)

print("Model trained successfully!")


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predict on the held-out split with the fitted pipeline.
y_pred = log_reg_model.predict(X_test)

# Headline accuracy, then the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Confusion matrix drawn as an annotated heatmap on an explicit Axes.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Titanic Survival Prediction — Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()


In [None]:
# Sanity checks on the prepared data and on the train/test split.
print(df.head(10))
print("\nColumns:", df.columns.tolist())

# Check if Survived is mistakenly included in X
print("\nDoes X contain Survived?", "Survived" in X.columns)

# Check for rows that appear in both X_train and X_test.
# With no `on=` argument, merge joins on every column the two frames share.
# The row labels must stay out of that join: reset_index() would turn them
# into an "index" column, and labels never coincide between the two splits,
# so the inner join would always be empty and the check could never fire.
# The train side is de-duplicated so each test row is counted at most once.
# Every column of X takes part in the comparison, including any identifier
# column it still contains.
dup = pd.merge(X_train.drop_duplicates(), X_test, how="inner")
print("\nOverlapping rows between train and test:", dup.shape[0])

# Check class distribution
print("\nTarget distribution:")
print(df['Survived'].value_counts())
