In [1]:
# Colab-only cell: upload the dataset through google.colab's file helper.
# The recorded cell output shows the file being saved as
# "Crop_recommendation.csv", which is the path the main program reads.
from google.colab import files
uploaded = files.upload()

Saving Crop_recommendation.csv to Crop_recommendation.csv


In [2]:
# Lab 05 – Classification project dataset used for regression + k-means tasks
# Dataset: Crop_recommendation.csv (features: N,P,K,temperature,humidity,ph,rainfall; target: label)

import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# -----------------------
# Data loading and splits
# -----------------------

def load_crop_data(csv_path: str) -> pd.DataFrame:
    """Read the crop CSV found at ``csv_path`` and return it as a DataFrame.

    The file is parsed with pandas' default CSV settings; no columns are
    renamed, dropped or converted here.
    """
    return pd.read_csv(csv_path)

def make_train_test(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42,
                    target: str = 'rainfall', label_col: str = 'label'):
    """Build the train/test split shared by the A1 and A3 regression tasks.

    The dataset is a classification dataset, so instead of the crop label one
    numeric attribute (``target``, rainfall by default) is used as the
    regression target. The feature matrix is every remaining column except
    ``target`` and the original class label ``label_col``. The label is
    dropped, not encoded.

    Args:
        df: Full dataset containing both ``target`` and ``label_col``.
        test_size: Fraction of rows held out for testing.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        target: Name of the numeric column to predict.
        label_col: Name of the class-label column to exclude from features.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split``; the feature parts are DataFrames and the target
        parts are NumPy arrays.

    Raises:
        KeyError: If ``target`` or ``label_col`` is not a column of ``df``.
    """
    y = df[target].to_numpy()
    # The same feature frame serves A1 (one column is picked later) and A3
    # (all columns), so both models are evaluated on identical rows.
    X = df.drop(columns=[target, label_col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

# -----------------------------------
# Metrics for regression (A2 helpers)
# -----------------------------------

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Each absolute error is divided by ``|y_true|``; that denominator is
    floored at 1e-8 so a true value of zero does not cause a division by zero
    (it yields a very large term instead).
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    # Floor the denominator rather than skipping zero targets.
    denominator = np.maximum(np.abs(actual), 1e-8)
    relative_error = np.abs((actual - predicted) / denominator)
    return np.mean(relative_error) * 100.0

def evaluate_regression(y_true, y_pred):
    """Summarise regression quality for one set of predictions.

    Returns a dict with the keys "MSE", "RMSE", "MAPE%" and "R2". RMSE is the
    square root of the MSE value, and MAPE comes from the local ``mape``
    helper (reported in percent).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),
        "MAPE%": mape(y_true, y_pred),
        "R2": r2_score(y_true, y_pred),
    }

# ---------------------------------------------------
# Pipelines for A1 (one feature) and A3 (multi-feat.)
# ---------------------------------------------------

def train_lr_one_attribute(X_train, y_train, single_feature: str = 'N'):
    """A1: fit a linear regression on a single column of ``X_train``.

    The model is a two-step pipeline: ``StandardScaler`` followed by
    ``LinearRegression``.

    Returns:
        ``(fitted_pipeline, single_feature)``; the feature name is returned
        so the caller can select the same column at prediction time.
    """
    scaler_step = ("scaler", StandardScaler())
    regressor_step = ("lr", LinearRegression())
    model = Pipeline(steps=[scaler_step, regressor_step])

    # Double brackets keep the selection as a one-column frame.
    one_column = X_train[[single_feature]].copy()
    model.fit(one_column, y_train)
    return model, single_feature

def predict_lr_one_attribute(model: Pipeline, X, feature: str):
    """Return ``model``'s predictions computed from the ``feature`` column of ``X`` only."""
    # Select with a list so the model receives a one-column frame.
    selected = X[[feature]]
    return model.predict(selected)

def train_lr_multi_attribute(X_train, y_train):
    """A3: fit a linear regression on every column of ``X_train``.

    All columns are routed through a ``StandardScaler`` inside a
    ``ColumnTransformer`` (anything not listed would be dropped), and the
    result feeds a ``LinearRegression``. Returns the fitted pipeline.
    """
    all_columns = X_train.columns.tolist()
    standardise_all = ColumnTransformer(
        transformers=[("num", StandardScaler(), all_columns)],
        remainder='drop',
    )
    model = Pipeline(steps=[("pre", standardise_all), ("lr", LinearRegression())])
    model.fit(X_train, y_train)
    return model

def predict_lr_multi_attribute(model: Pipeline, X):
    """Return ``model``'s predictions for ``X``, passing every column through unchanged."""
    predictions = model.predict(X)
    return predictions

# ---------------------------------
# K-means clustering tasks (A4–A7)
# ---------------------------------

def kmeans_fit(X_train, k: int, random_state: int = 42, n_init: int = 10):
    """Standardise ``X_train`` and fit a ``k``-cluster KMeans model on it.

    Returns:
        ``(kmeans, scaler)``: the fitted model and the fitted
        ``StandardScaler``, so callers can transform data the same way.
    """
    # Standardise first so every feature has the same scale for the distances.
    scaler = StandardScaler()
    standardised = scaler.fit_transform(X_train)

    model = KMeans(n_clusters=k, random_state=random_state, n_init=n_init)
    return model.fit(standardised), scaler

def kmeans_internal_scores(X, labels):
    """Compute three internal clustering indices for ``labels`` on ``X``.

    Returns a dict with keys "silhouette", "calinski_harabasz" and
    "davies_bouldin", in that order.
    """
    indices = (
        ("silhouette", silhouette_score),
        ("calinski_harabasz", calinski_harabasz_score),
        ("davies_bouldin", davies_bouldin_score),
    )
    return {name: index(X, labels) for name, index in indices}

def k_sweep_scores(X_train, k_values=range(2, 11), random_state: int = 42, n_init: int = 10):
    """A6: fit KMeans for each k in ``k_values`` and tabulate its scores.

    The data is standardised once and reused for every k. The returned
    DataFrame has one row per k, sorted by "k", with the three internal
    indices followed by "k" and "inertia".
    """
    standardised = StandardScaler().fit_transform(X_train)

    def _score_one(k):
        model = KMeans(n_clusters=k, random_state=random_state, n_init=n_init)
        model.fit(standardised)
        row = kmeans_internal_scores(standardised, model.labels_)
        # Appended after the indices so the column order stays the same.
        row.update(k=k, inertia=model.inertia_)
        return row

    rows = [_score_one(k) for k in k_values]
    return pd.DataFrame(rows).sort_values("k")

def elbow_inertia(X_train, k_values=range(2, 21), random_state: int = 42, n_init: int = 10):
    """A7: collect the KMeans inertia for each k in ``k_values``.

    The data is standardised once. The result is a DataFrame with columns
    "k" and "inertia", in the order the k values were supplied, ready for an
    elbow plot.
    """
    standardised = StandardScaler().fit_transform(X_train)

    # Single pass over k_values, so a one-shot iterable also works.
    pairs = [
        (k, KMeans(n_clusters=k, random_state=random_state, n_init=n_init).fit(standardised).inertia_)
        for k in k_values
    ]
    return pd.DataFrame({
        "k": [k for k, _ in pairs],
        "inertia": [inertia for _, inertia in pairs],
    })

# -------------
# Main program
# -------------
# Driver: runs the regression tasks (A1-A3) and the clustering tasks (A4-A7)
# in order and prints their results. The variables assigned here are
# module-level, so in a notebook they stay available to later cells.
if __name__ == "__main__":
    # Path of the dataset, relative to the current working directory
    data_path = Path("Crop_recommendation.csv")

    # Load the full dataset (features plus the 'label' column)
    df = load_crop_data(str(data_path))

    # Supervised splits using rainfall as surrogate target (A1/A3);
    # defaults give a 20% test split with random_state=42
    X_train, X_test, y_train, y_test = make_train_test(df)

    # A1: Linear Regression with one attribute (choose 'N' by default; change if desired)
    lr_one, feat = train_lr_one_attribute(X_train, y_train, single_feature='N')
    ytr_pred_one = predict_lr_one_attribute(lr_one, X_train, feat)
    yte_pred_one = predict_lr_one_attribute(lr_one, X_test, feat)
    # A2: same metrics on train and test so the two can be compared
    metrics_train_one = evaluate_regression(y_train, ytr_pred_one)
    metrics_test_one = evaluate_regression(y_test, yte_pred_one)
    print(f"A1/A2 – One-attribute LR using '{feat}' -> Train metrics: {metrics_train_one}")
    print(f"A1/A2 – One-attribute LR using '{feat}' -> Test  metrics: {metrics_test_one}")

    # A3: Linear Regression with all attributes (same split as A1)
    lr_all = train_lr_multi_attribute(X_train, y_train)
    ytr_pred_all = predict_lr_multi_attribute(lr_all, X_train)
    yte_pred_all = predict_lr_multi_attribute(lr_all, X_test)
    metrics_train_all = evaluate_regression(y_train, ytr_pred_all)
    metrics_test_all = evaluate_regression(y_test, yte_pred_all)
    print(f"A3 – All-attribute LR -> Train metrics: {metrics_train_all}")
    print(f"A3 – All-attribute LR -> Test  metrics: {metrics_test_all}")

    # A4: K-means clustering on features (drop target label); use k=2 as example.
    # Only 'label' is dropped, so rainfall is one of the clustered features
    # and the whole dataset (not just the training split) is clustered.
    X_for_clustering = df.drop(columns=['label'])  # target removed as instructed
    km2, scaler2 = kmeans_fit(X_for_clustering, k=2, random_state=42)
    # Re-apply the fitted scaler so the scores use the same standardized
    # space the model was fitted in
    Xs2 = scaler2.transform(X_for_clustering)
    scores_k2 = kmeans_internal_scores(Xs2, km2.labels_)
    print(f"A4/A5 – KMeans k=2 -> centers shape: {km2.cluster_centers_.shape}")
    print(f"A5 – Scores (k=2): {scores_k2}")

    # A6: Sweep k and compute internal validation indices (k = 2..10)
    sweep_df = k_sweep_scores(X_for_clustering, k_values=range(2, 11), random_state=42)
    print("A6 – k sweep scores (head):")
    print(sweep_df.head(10).to_string(index=False))

    # A7: Elbow – inertia vs k (k = 2..20; only the first 10 rows are printed)
    elbow_df = elbow_inertia(X_for_clustering, k_values=range(2, 21), random_state=42)
    print("A7 – Elbow inertia data (first 10 rows):")
    print(elbow_df.head(10).to_string(index=False))

    # Optional: If running in notebooks, you may plot from these dataframes:
    #   sweep_df.plot(x='k', y=['silhouette','calinski_harabasz','davies_bouldin'])
    #   elbow_df.plot(x='k', y='inertia', marker='o')


A1/A2 – One-attribute LR using 'N' -> Train metrics: {'MSE': 3004.6487559367247, 'RMSE': np.float64(54.81467646476375), 'MAPE%': np.float64(59.52083717707944), 'R2': 0.004425830282330234}
A1/A2 – One-attribute LR using 'N' -> Test  metrics: {'MSE': 3028.1081727566134, 'RMSE': np.float64(55.02824886144037), 'MAPE%': np.float64(55.886134849538585), 'R2': -0.005896678372762976}
A3 – All-attribute LR -> Train metrics: {'MSE': 2898.610446934675, 'RMSE': np.float64(53.838744849175995), 'MAPE%': np.float64(58.069570510393085), 'R2': 0.03956105240584573}
A3 – All-attribute LR -> Test  metrics: {'MSE': 2973.049110541983, 'RMSE': np.float64(54.525673866005384), 'MAPE%': np.float64(54.93777359209605), 'R2': 0.012393199212950234}
A4/A5 – KMeans k=2 -> centers shape: (2, 7)
A5 – Scores (k=2): {'silhouette': np.float64(0.4168458131187345), 'calinski_harabasz': np.float64(685.7252777990983), 'davies_bouldin': np.float64(0.7967758017826654)}
A6 – k sweep scores (head):
 silhouette  calinski_harabasz  