In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns


## 1. Read and Visualize Data

In [None]:
# Load the raw training table from the working directory. `df` is kept as the
# untouched source frame; later cells derive new frames from it.
df = pd.read_csv(filepath_or_buffer="train.csv")

# Preview the top of the table to sanity-check what was parsed.
print("First 5 rows of data:")
display(df.head(n=5))

## 2. Handling Missing Data

In [None]:
# Report how many null entries each column of the raw frame contains.
# The heading and the per-column counts are emitted on separate lines.
print("Missing values count per column:", df.isna().sum(axis=0), sep="\n")

In [None]:
# Replace every missing entry, in every column, with the constant 0.
# The result is a new frame; `df` itself is left unchanged.
# NOTE(review): the fill is not restricted to numeric columns, so any text
# column that has gaps will end up holding the integer 0 next to its strings.
df_filled = df.fillna(value=0)

# Confirm that no nulls remain after the fill.
print("Missing values after filling with 0:", df_filled.isna().sum(axis=0), sep="\n")

## 3. Handling Numerical and Categorical Values

In [None]:
# Partition the column names by dtype: NumPy numeric dtypes on one side,
# generic object dtype on the other.
# NOTE(review): the label column is not excluded here, so it will be listed
# among the numerical columns if it is stored as a number — confirm if that
# matters for how `numerical_cols` is used.
numerical_cols = list(df_filled.select_dtypes(include=np.number).columns)
categorical_cols = list(df_filled.select_dtypes(include="object").columns)

print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

In [None]:
# One-hot encode every column listed in `categorical_cols`; all other columns
# pass through unchanged.
# NOTE(review): each distinct value becomes its own indicator column, so any
# near-unique text column (names, ticket ids, ...) would add roughly one
# feature per row — check the width of the preview below and consider
# dropping such columns before encoding.
df_encoded = pd.get_dummies(data=df_filled, columns=categorical_cols)

print("First 5 rows after one-hot encoding:")
display(df_encoded.head(n=5))

## 4. Dataset Splitting

In [None]:
# Separate the label column ("Survived") from the feature matrix.
X = df_encoded.drop(columns="Survived")
y = df_encoded["Survived"]

# Hold out 20% of the rows for evaluation, with a fixed seed for
# reproducibility. The split is stratified on the label so that the train and
# test sets keep the same class ratio as the full data; a plain random split
# can shift that ratio, which matters because the labels are treated as
# imbalanced in the next section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the resulting shapes as a quick sanity check on the split sizes.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

## 5. Addressing Imbalanced Labels

In [None]:
# Show how many training rows fall into each label value, to see how
# unbalanced the classes are before any resampling.
print("Survival counts in training set:", y_train.value_counts(), sep="\n")

In [None]:
# Re-attach the labels so each class can be sliced together with its features.
train_data = pd.concat([X_train, y_train], axis=1)

# Determine the majority class from the data instead of assuming it is label 0.
# Sorting by label before idxmax() makes ties resolve to the smallest label, so
# a 0/1 label where 0 is the larger (or tied) class behaves exactly as the
# previous hard-coded version did.
class_counts = train_data["Survived"].value_counts().sort_index()
majority_label = class_counts.idxmax()

majority = train_data[train_data["Survived"] == majority_label]
minority = train_data[train_data["Survived"] != majority_label]

# Upsample each non-majority class separately (sampling with replacement, fixed
# seed) until it matches the majority class size. With a binary label this is a
# single resample call on the one minority class.
upsampled_groups = [
    resample(group, replace=True, n_samples=len(majority), random_state=42)
    for _, group in minority.groupby("Survived")
]
# If only one class is present there is nothing to upsample; keep the (empty)
# minority frame rather than failing on an empty concat.
minority_upsampled = pd.concat(upsampled_groups) if upsampled_groups else minority

# Combine the untouched majority rows with the upsampled rows and split the
# result back into features and labels.
upsampled = pd.concat([majority, minority_upsampled])
X_train_balanced = upsampled.drop(columns="Survived")
y_train_balanced = upsampled["Survived"]

print("Balanced training label counts:")
print(y_train_balanced.value_counts())

## 6. Model Training and Validation

In [None]:
# Fit a 5-nearest-neighbours classifier on the balanced training data.
# NOTE(review): no scaling step appears before this cell, and KNN is
# distance-based, so large-magnitude columns may dominate the neighbour
# search — consider standardising the features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_balanced, y_train_balanced)

# Accuracy on the (balanced) data the model was fitted on.
train_acc = accuracy_score(y_train_balanced, knn.predict(X_train_balanced))
# Accuracy on the held-out split, which was not resampled.
test_acc = accuracy_score(y_test, knn.predict(X_test))

print("Train accuracy:", train_acc)
print("Test accuracy:", test_acc)

In [None]:
# Sweep k from 1 up to 100, capped at the number of training rows because
# scikit-learn rejects n_neighbors larger than the number of fitted samples.
max_k = min(100, len(X_train_balanced))
k_vals = range(1, max_k + 1)
train_scores = []
test_scores = []
for k in k_vals:
    # Use a separate name for the per-k model so the k=5 classifier bound to
    # `knn` in the previous cell is not silently replaced by the last model
    # of this loop.
    knn_k = KNeighborsClassifier(n_neighbors=k).fit(X_train_balanced, y_train_balanced)
    train_scores.append(knn_k.score(X_train_balanced, y_train_balanced))
    test_scores.append(knn_k.score(X_test, y_test))

# NOTE(review): this curve is fine for illustration, but choosing k from the
# test-accuracy line would tune the model on the test set — use a validation
# split or cross-validation if k is selected from it.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_vals, train_scores, label="Train Accuracy")
ax.plot(k_vals, test_scores, label="Test Accuracy")
ax.set_xlabel("k")
ax.set_ylabel("Accuracy")
ax.set_title("Accuracy vs k in KNN")
ax.legend()
ax.grid(True)
plt.show()

## 7. Analytical Questions

**1. Which feature is most important?**

Judging by the data distribution and how well each feature separates the two classes, `Sex`, `Pclass`, and `Age` tend to be the most informative features. Of these, `Sex` shows the strongest correlation with survival rate, so it is the most important single feature.

**2. Alternative missing value strategies for 'Age':**

- Fill with the median or mean age (far less extreme than 0)
- Bin ages into groups (child, adult, senior) and give missing values their own "unknown" bin
- Predict the missing ages with a regression model trained on the other features

These approaches preserve the age distribution better than filling with 0, which helps the model generalize.