In [None]:
# Day 22 - ML learning: K-NN (K-Nearest Neighbors)
# K-Nearest Neighbors (K-NN) is a simple, easy-to-implement supervised machine learning algorithm that can be used for both classification and regression tasks.
# It works by finding the 'k' closest data points (neighbors) to a given input and predicting the majority class (for classification) or the average value (for regression) of those neighbors.


In [None]:
"""Small K → noisy, overfitting

Large K → smooth, underfitting"""

In [None]:
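# KNN uses distance to measure similarity. Common distance metrics include:
# Euclidean distance: the straight-line distance between two points. Formula (two features): d = √((x2 - x1)² + (y2 - y1)²); with more features, the sum under the root gains one squared difference per feature.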


In [None]:
"""Use KNN when:

Dataset is small to medium

Decision boundary is complex

You want an intuitive baseline"""

In [1]:
import pandas as pd #data manipulation and analysis library
import numpy as np #numerical computing library

from sklearn.model_selection import train_test_split #sklearn (scikit-learn) is a machine learning library; train_test_split splits data into training and testing sets.
from sklearn.preprocessing import StandardScaler #for feature scaling
from sklearn.neighbors import KNeighborsClassifier #K-NN model from scikit-learn
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report #metrics to evaluate model performance


In [2]:
# Load the Titanic dataset from a local CSV file.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
titanic_csv_path = r"C:\Users\USER\Downloads\Titanic-Dataset.csv"
df = pd.read_csv(titanic_csv_path)

In [3]:
# Handle missing values.
# Assign the filled column back rather than calling fillna(inplace=True) on the
# df["Age"] selection: that chained in-place form raises a FutureWarning in
# pandas 2.x and does not modify df under Copy-on-Write (the pandas 3.0
# default), which would let NaN ages flow into scaling and model fitting.
df["Age"] = df["Age"].fillna(df["Age"].median())

# Encode categorical variables.
# Sex becomes a single numeric column (male -> 0, female -> 1); any other value maps to NaN.
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
# One-hot encode Embarked; drop_first removes the first level's indicator column.
df = pd.get_dummies(df, columns=["Embarked"], drop_first=True)

# Drop unnecessary free-text columns; errors="ignore" skips any that are absent.
# NOTE(review): if the CSV has a PassengerId column it is still kept as a
# feature here; a row identifier adds noise to a distance-based model, so
# confirm whether it should be dropped as well.
df = df.drop(columns=["Name", "Ticket", "Cabin"], errors="ignore")


In [4]:
# Separate the target column from the predictor columns.
y = df["Survived"]
X = df.drop(columns=["Survived"])

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test rows do not influence scaling.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)  # scaled training features
X_test_scaled = scaler.transform(X_test)  # test features scaled with training statistics


In [6]:
# Create a K-NN classifier that votes among the 5 nearest neighbours,
# then fit it on the scaled training features and their labels.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train_scaled, y=y_train)


In [7]:
# Predict labels for the held-out test rows with the fitted classifier.
y_pred = knn.predict(X=X_test_scaled)


In [8]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)


0.8116591928251121

In [9]:
# Counts of true vs. predicted labels (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)


array([[116,  18],
       [ 24,  65]], dtype=int64)

In [10]:
# classification_report returns a multi-line string, so print it to render the table layout.
report_text = classification_report(y_test, y_pred)
print(report_text)


              precision    recall  f1-score   support

           0       0.83      0.87      0.85       134
           1       0.78      0.73      0.76        89

    accuracy                           0.81       223
   macro avg       0.81      0.80      0.80       223
weighted avg       0.81      0.81      0.81       223



In [11]:
# Compare test-set accuracy across several neighbourhood sizes.
# Loop-local names (knn_k / y_pred_k) keep the sweep from overwriting the
# notebook-level `knn` model and `y_pred` produced by the earlier cells;
# otherwise re-running the metric cells after this one would silently report
# the results of the last K in the list instead of the k=5 model.
# NOTE(review): K is being compared on the test split; if this sweep is used
# to choose K, prefer cross-validation on the training data.
for k in [1, 3, 5, 7, 9]:
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train_scaled, y_train)
    y_pred_k = knn_k.predict(X_test_scaled)
    print(f"K={k}, Accuracy={accuracy_score(y_test, y_pred_k):.3f}")


K=1, Accuracy=0.803
K=3, Accuracy=0.798
K=5, Accuracy=0.812
K=7, Accuracy=0.821
K=9, Accuracy=0.821


In [None]:
"""## KNN Interpretation

KNN classifies a data point based on the majority class of its nearest neighbors.
The algorithm relies heavily on distance, making feature scaling essential.
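Different values of K shift the bias–variance tradeoff: a small K gives low bias but high variance, while a large K gives higher bias but lower variance.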
"""