1. Iris Flower Classification using KNN       
Dataset: Iris Dataset (from scikit-learn)
Goal: Classify iris species using petal and sepal measurements

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train KNN model
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_train, y_train)

# Predict & evaluate
y_pred = knn.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')


Accuracy: 1.00


2. Handwritten Digit Recognition (MNIST Dataset) using KNN      
Dataset: MNIST (from scikit-learn)
Goal: Classify handwritten digits using KNN




In [2]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load MNIST dataset
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist.data, mnist.target

# Convert labels to integers
y = y.astype(int)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train KNN model
knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn.fit(X_train, y_train)

# Predict & evaluate
y_pred = knn.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')


Accuracy: 0.95


3. Customer Segmentation Using KNN        
Dataset: Synthetic data (generated using Pandas & NumPy)
Goal: Classify customers based on their purchasing behavior

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Generate synthetic customer data
np.random.seed(42)
data = {
    'Age': np.random.randint(18, 70, 200),
    'Annual_Income': np.random.randint(15000, 100000, 200),
    'Spending_Score': np.random.randint(1, 100, 200),
    'Category': np.random.choice([0, 1, 2], 200)  # 0: Low, 1: Medium, 2: High spenders
}

df = pd.DataFrame(data)

# Prepare dataset
X = df[['Age', 'Annual_Income', 'Spending_Score']]
y = df['Category']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train KNN model
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
knn.fit(X_train, y_train)

# Predict & evaluate
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.48      0.58      0.52        19
           1       0.25      0.18      0.21        11
           2       0.33      0.30      0.32        10

    accuracy                           0.40        40
   macro avg       0.35      0.35      0.35        40
weighted avg       0.38      0.40      0.39        40



Project: KNN for Email Spam Classification      
Dataset: SMS Spam Collection Dataset
Goal: Classify emails/SMS messages as spam or not spam using KNN

In [6]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Sample dataset (Manually created)
data = {
    "message": [
        "Congratulations! You've won a free iPhone! Click the link to claim now.",
        "Hey, are we still meeting for coffee tomorrow?",
        "URGENT! Your account has been compromised. Reset your password immediately!",
        "Hello John, this is a reminder for your doctor's appointment at 10 AM.",
        "Win a lottery of $1,000,000 now! Just send your details to claim.",
        "Are you available for a quick call later?",
        "Limited-time offer: Buy 1 Get 1 Free on all items. Shop now!",
        "Let's catch up soon! It's been a long time since we talked.",
        "Congratulations! You have been selected for a free holiday trip!",
        "Meet me at the park at 5 PM. Let's go for a walk."
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Spam, 0 = Not Spam
}

df = pd.DataFrame(data)

# Text cleaning function
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove extra spaces
    return text

df['message'] = df['message'].apply(clean_text)

# Split data
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

# Convert text into numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train KNN model
knn = KNeighborsClassifier(n_neighbors=3, metric='cosine')
knn.fit(X_train, y_train)

# Predict & evaluate
y_pred = knn.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
print(classification_report(y_test, y_pred))


Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



5. Movie Recommendation System Using KNN        
Dataset: Movie Ratings Dataset (Synthetic Data)
Goal: Recommend movies based on user preferences

In [7]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Create a sample movie ratings dataset
data = {
    'User': ['A', 'B', 'C', 'D', 'E'],
    'Movie1': [5, 3, 4, 2, 1],
    'Movie2': [4, 2, 5, 3, 2],
    'Movie3': [3, 5, 2, 4, 5],
    'Movie4': [1, 4, 3, 5, 4],
    'Movie5': [2, 3, 5, 1, 3]
}

df = pd.DataFrame(data).set_index('User')

# Train KNN model for recommendations
knn = NearestNeighbors(n_neighbors=2, metric='cosine')
knn.fit(df)

# Find nearest neighbors for User 'A'
distances, indices = knn.kneighbors([df.loc['A']])
print(f"Similar users to 'A': {df.index[indices[0][1]]}")


Similar users to 'A': C


