<a href="https://colab.research.google.com/github/Krish6115/MLLab/blob/main/Lab3/Lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Lab03 – k-NN on Crop_recommendation.csv (Two-class subset)
# Dataset columns: N,P,K,temperature,humidity,ph,rainfall,label
# All functions are defined here (single code block) per rules.

import json
import numpy as np
import pandas as pd
from typing import Tuple, Dict, List
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    roc_auc_score,
    roc_curve
)
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Utility: data loading and two-class selection
def load_dataset(csv_path: str) -> pd.DataFrame:
    """
    A0. Utility
    Read the Crop Recommendation CSV found at `csv_path`.

    The file is parsed with pandas' default settings; no filtering or
    cleaning is applied. Returns the parsed table as a pandas DataFrame.
    """
    return pd.read_csv(csv_path)


def select_two_classes(df: pd.DataFrame, class_a: str, class_b: str) -> pd.DataFrame:
    """
    A0. Utility
    Reduce the dataset to a binary problem, as required by Lab03.

    Keeps only the rows whose 'label' equals `class_a` or `class_b`,
    preserving their original order, and renumbers the index from 0.
    The input DataFrame is left untouched; a new DataFrame is returned.
    """
    wanted_labels = [class_a, class_b]
    keep_mask = df['label'].isin(wanted_labels)
    # reset_index(drop=True) builds a fresh frame and discards the old index.
    return df[keep_mask].reset_index(drop=True)


def split_features_labels(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    A0. Utility
    Separate the two-class DataFrame into its feature matrix and target.

    Returns a tuple (X, y): X holds every column except 'label', and y is
    an independent copy of the 'label' column.
    """
    labels = df['label'].copy()
    features = df.drop(columns=['label'])
    return features, labels

# A1. Intraclass spread and interclass centroid distance
def compute_class_stats(X: pd.DataFrame, y: pd.Series) -> Dict[str, Dict[str, np.ndarray]]:
    """
    A1.
    Summarise each class by its per-feature mean and standard deviation.

    For every distinct label in `y` (in order of first appearance), the
    rows of `X` belonging to that label are reduced to:
      - 'centroid': the column-wise mean vector
      - 'spread':   the column-wise population std (ddof=0)
    Returns a dict: {class_label: {'centroid': np.ndarray, 'spread': np.ndarray}}
    """
    def _summarise(rows: pd.DataFrame) -> Dict[str, np.ndarray]:
        # Column-wise statistics for the rows of a single class.
        return {
            'centroid': rows.mean(axis=0).to_numpy(),
            'spread': rows.std(axis=0, ddof=0).to_numpy(),
        }

    return {label: _summarise(X[y == label]) for label in y.unique()}


def compute_centroid_distance(stats: Dict[str, Dict[str, np.ndarray]], class_a: str, class_b: str) -> float:
    """
    A1.
    Return the Euclidean (L2) distance between the centroids of two classes.

    `stats` is a mapping in the shape produced by compute_class_stats; only
    the 'centroid' entries of `class_a` and `class_b` are read.
    """
    offset = stats[class_a]['centroid'] - stats[class_b]['centroid']
    return float(np.linalg.norm(offset))

# A2. Histogram, mean and variance for a chosen feature
def feature_histogram_stats(
    X: pd.DataFrame,
    feature_name: str,
    bins: int = 10
) -> Dict[str, object]:
    """
    A2.
    Describe the distribution of one feature column.

    The column `feature_name` of `X` is binned with numpy.histogram into
    `bins` bins, and its mean and (population) variance are computed.
    Returns a JSON-friendly dict with keys 'feature', 'bins', 'counts',
    'bin_edges', 'mean' and 'variance' (in that order).
    """
    column = X[feature_name].to_numpy()
    hist_counts, edges = np.histogram(column, bins=bins)

    # Plain Python types are used so the result can be passed to json.dumps.
    summary: Dict[str, object] = {'feature': feature_name, 'bins': int(bins)}
    summary['counts'] = hist_counts.tolist()
    summary['bin_edges'] = edges.tolist()
    summary['mean'] = float(np.mean(column))
    summary['variance'] = float(np.var(column))
    return summary

# A3. Minkowski distance between two feature vectors for r=1..10
def minkowski_curve(
    vec1: np.ndarray,
    vec2: np.ndarray,
    r_min: int = 1,
    r_max: int = 10
) -> Dict[str, List[float]]:
    """
    A3.
    Compute Minkowski distances between two vectors for orders r = r_min..r_max.

    For each integer order r the distance is (sum_i |vec1_i - vec2_i|**r) ** (1/r).

    Parameters:
        vec1, vec2: equal-length numeric vectors (NumPy arrays or plain
            sequences). Both are converted to float arrays before any
            arithmetic.
        r_min, r_max: inclusive range of orders to evaluate. If
            r_min > r_max the returned lists are empty.

    Returns a dict {'r': [orders], 'distance': [distance for each order]}.
    """
    # Work in floating point: with integer inputs, |diff| ** r wraps around
    # silently once it exceeds the integer dtype's range (a difference of
    # 100 at r=10 already overflows int64), producing wrong distances.
    gap = np.abs(np.asarray(vec1, dtype=float) - np.asarray(vec2, dtype=float))

    rs = list(range(r_min, r_max + 1))
    dists = [float(np.power(np.sum(gap ** r), 1.0 / r)) for r in rs]
    return {'r': rs, 'distance': dists}

# A4. Train-test split (optional standardization, fitted on the training split only)
def train_test_split_binary(
    X: pd.DataFrame,
    y: pd.Series,
    test_size: float = 0.3,
    random_state: int = 42,
    scale: bool = True
) -> Dict[str, object]:
    """
    A4.
    Encode the labels, make a stratified train/test split, and optionally
    standardize the features.

    Labels are mapped to integer codes with a LabelEncoder (0/1 for the
    two-class problem) so that they can feed ROC-style metrics if needed.
    When `scale` is True a StandardScaler is fitted on the training
    features only and then applied to both splits.

    Returns a dict with keys 'X_train', 'X_test', 'y_train', 'y_test',
    'label_encoder', and 'scaler' (None when `scale` is False).
    """
    encoder = LabelEncoder()
    y_codes = encoder.fit_transform(y)

    parts = train_test_split(
        X.values,
        y_codes,
        test_size=test_size,
        random_state=random_state,
        stratify=y_codes,
    )
    X_tr, X_te, y_tr, y_te = parts

    scaler = StandardScaler() if scale else None
    if scaler is not None:
        # Fit on the training split only; the test split is just transformed.
        X_tr = scaler.fit_transform(X_tr)
        X_te = scaler.transform(X_te)

    return {
        'X_train': X_tr, 'X_test': X_te,
        'y_train': y_tr, 'y_test': y_te,
        'label_encoder': encoder, 'scaler': scaler
    }

# A5–A7. Train kNN, score, and predictions
def train_knn_classifier(
    X_train: np.ndarray,
    y_train: np.ndarray,
    k_neighbors: int = 3,
    metric: str = 'minkowski',
    p: int = 2
) -> KNeighborsClassifier:
    """
    A5.
    Build a KNeighborsClassifier with `k_neighbors` neighbours, the given
    distance `metric` and Minkowski order `p`, fit it on the training data,
    and return the fitted estimator.
    """
    model = KNeighborsClassifier(n_neighbors=k_neighbors, metric=metric, p=p)
    # fit() returns the estimator itself, so it can be returned directly.
    return model.fit(X_train, y_train)


def evaluate_accuracy(
    clf: KNeighborsClassifier,
    X_test: np.ndarray,
    y_test: np.ndarray
) -> float:
    """
    A6.
    Score the fitted classifier on the held-out test set.

    Returns the value of `clf.score(X_test, y_test)` as a plain Python float.
    """
    test_score = clf.score(X_test, y_test)
    return float(test_score)


def predict_labels(
    clf: KNeighborsClassifier,
    X_test: np.ndarray
) -> np.ndarray:
    """
    A7.
    Run the fitted classifier on the test vectors and return its predicted
    labels, in whatever encoding the classifier was trained with.
    """
    predicted = clf.predict(X_test)
    return predicted

# A8. Vary k from 1 to 11 and collect accuracy vs k (curve)
def k_sweep_accuracy(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    k_min: int = 1,
    k_max: int = 11,
    metric: str = 'minkowski',
    p: int = 2
) -> Dict[str, List[float]]:
    """
    A8.
    Measure test accuracy for every neighbour count k in k_min..k_max
    (inclusive).

    A fresh k-NN classifier is trained on the training split for each k
    and evaluated on the test split.
    Returns {'k': [k values], 'accuracy': [accuracy for each k]}.
    """
    def _accuracy_for(k: int) -> float:
        # Train a new model for this k and score it on the test split.
        model = KNeighborsClassifier(n_neighbors=k, metric=metric, p=p)
        model.fit(X_train, y_train)
        return float(accuracy_score(y_test, model.predict(X_test)))

    ks = list(range(k_min, k_max + 1))
    return {'k': ks, 'accuracy': [_accuracy_for(k) for k in ks]}

# A9. Confusion matrix and metrics (precision/recall/F1)
def compute_confusion_and_metrics(
    clf: KNeighborsClassifier,
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray
) -> Dict[str, Dict[str, float]]:
    """
    A9.
    Evaluate the fitted classifier on both the training and the test split.

    For each split the result contains:
      - 'confusion_matrix': the confusion matrix as a nested list
      - 'precision', 'recall', 'f1_score': computed with zero_division=0
      - 'accuracy'
    Returns {'train': {...}, 'test': {...}}; every value is JSON-serialisable.
    """
    def _report(features: np.ndarray, truth: np.ndarray) -> Dict[str, object]:
        # Predict once per split and derive every metric from that prediction.
        predicted = clf.predict(features)
        matrix = confusion_matrix(truth, predicted)
        return {
            'confusion_matrix': matrix.tolist(),
            'precision': float(precision_score(truth, predicted, zero_division=0)),
            'recall': float(recall_score(truth, predicted, zero_division=0)),
            'f1_score': float(f1_score(truth, predicted, zero_division=0)),
            'accuracy': float(accuracy_score(truth, predicted)),
        }

    return {
        'train': _report(X_train, y_train),
        'test': _report(X_test, y_test),
    }

# Main Program
if __name__ == "__main__":
    # Driver for Lab03: runs exercises A1-A9 in order on a two-class subset
    # of the crop dataset and prints each result. The CSV is read from the
    # current working directory.

    # 1) Load the full CSV and keep only the two crops being compared
    df_full = load_dataset("Crop_recommendation.csv")
    class_a = "rice"       # choose any class present in the CSV
    class_b = "maize"      # choose another class
    df_bin = select_two_classes(df_full, class_a, class_b)
    X, y = split_features_labels(df_bin)

    # 2) A1: Per-class centroid/spread and the distance between the two
    #    centroids. These are computed on the raw (unscaled) feature values.
    stats = compute_class_stats(X, y)
    centroid_distance = compute_centroid_distance(stats, class_a, class_b)

    print("A1 — Class centroids/spreads and interclass distance")
    print(f"Classes: {class_a} vs {class_b}")
    print(f"Centroid distance: {centroid_distance:.4f}")
    for cls in [class_a, class_b]:
        # Only the first four feature dimensions are printed to keep the output short.
        print(f"  {cls} centroid (first 4 dims): {np.round(stats[cls]['centroid'][:4], 4)}")
        print(f"  {cls} spread   (first 4 dims): {np.round(stats[cls]['spread'][:4], 4)}")

    # 3) A2: Histogram (12 bins), mean and variance of one chosen feature
    feature_name = "temperature"  # choose any single feature
    hist_info = feature_histogram_stats(X, feature_name, bins=12)
    print("\nA2 — Histogram, mean, variance for feature:", feature_name)
    print(json.dumps(hist_info, indent=2))

    # 4) A3: Minkowski distance for r=1..10 between the first two rows of the
    #    two-class subset (cast to float before the computation)
    v1 = X.iloc[0].to_numpy(dtype=float)
    v2 = X.iloc[1].to_numpy(dtype=float)
    mink_curve = minkowski_curve(v1, v2, r_min=1, r_max=10)
    print("\nA3 — Minkowski distance r=1..10 between first two samples")
    print(json.dumps(mink_curve, indent=2))

    # 5) A4: Stratified 70/30 train-test split with standardization enabled;
    #    the label encoder is kept to map predictions back to crop names
    split = train_test_split_binary(X, y, test_size=0.3, random_state=42, scale=True)
    X_train, X_test = split['X_train'], split['X_test']
    y_train, y_test = split['y_train'], split['y_test']
    label_encoder = split['label_encoder']

    # 6) A5: Train kNN (k=3, Minkowski metric with p=2)
    knn_k3 = train_knn_classifier(X_train, y_train, k_neighbors=3, metric='minkowski', p=2)

    # 7) A6: Accuracy of the k=3 model on the test split
    test_acc_k3 = evaluate_accuracy(knn_k3, X_test, y_test)
    print("\nA6 — Test accuracy (k=3):", round(test_acc_k3, 4))

    # 8) A7: Predict the test split, then decode the integer predictions back
    #    to the original string labels for display
    y_pred_test = predict_labels(knn_k3, X_test)
    y_pred_labels = label_encoder.inverse_transform(y_pred_test)
    print("\nA7 — First 10 predictions (original labels):", y_pred_labels[:10].tolist())

    # 9) A8: Accuracy vs k (1..11), retraining a classifier for each k
    sweep = k_sweep_accuracy(X_train, y_train, X_test, y_test, k_min=1, k_max=11, metric='minkowski', p=2)
    print("\nA8 — Accuracy vs k (1..11):")
    print(json.dumps(sweep, indent=2))

    # 10) A9: Confusion matrices and precision/recall/F1/accuracy on both the
    #     train and test splits, reusing the k=3 model trained above
    perf = compute_confusion_and_metrics(knn_k3, X_train, y_train, X_test, y_test)
    print("\nA9 — Confusion matrices and metrics (k=3):")
    print(json.dumps(perf, indent=2))

A1 — Class centroids/spreads and interclass distance
Classes: rice vs maize
Centroid distance: 153.7258
  rice centroid (first 4 dims): [79.89   47.58   39.87   23.6893]
  rice spread   (first 4 dims): [11.8582  7.8653  2.9314  2.0211]
  maize centroid (first 4 dims): [77.76   48.44   19.79   22.3892]
  maize spread   (first 4 dims): [11.8896  7.9703  2.9268  2.6659]

A2 — Histogram, mean, variance for feature: temperature
{
  "feature": "temperature",
  "bins": 12,
  "counts": [
    14,
    10,
    7,
    15,
    16,
    15,
    17,
    26,
    14,
    27,
    22,
    17
  ],
  "bin_edges": [
    18.04185513,
    18.782529766666666,
    19.52320440333333,
    20.26387904,
    21.004553676666667,
    21.74522831333333,
    22.48590295,
    23.226577586666664,
    23.967252223333332,
    24.70792686,
    25.44860149666667,
    26.189276133333333,
    26.92995077
  ],
  "mean": 23.039268060350004,
  "variance": 6.018575201679887
}

A3 — Minkowski distance r=1..10 between first two sample

In [2]:
# Colab-only cell: prompts for a file upload through the browser. The cell
# output shows the uploaded dataset being saved as Crop_recommendation.csv,
# the filename the main cell reads.
from google.colab import files
uploaded = files.upload()

Saving Crop_recommendation.csv to Crop_recommendation.csv
