In [15]:
pip install -U scikit-learn pandas numpy




In [16]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# --------------------- Helpers ---------------------

# Column names that commonly hold the class label. guess_target_column compares
# them case-insensitively, in this order (earlier entries take priority).
POSSIBLE_TARGET_NAMES = [
    "target","label","class","diagnosis","lung_cancer","lung-cancer","LUNG_CANCER",
    "Outcome","outcome","Result","result"
]

def guess_target_column(df: pd.DataFrame) -> str:
    """Pick the column of ``df`` most likely to hold the class label.

    Heuristics, tried in order:
      1. A column whose name, ignoring case and surrounding whitespace,
         appears in ``POSSIBLE_TARGET_NAMES`` (earlier list entries win).
      2. The right-most column with between 2 and 10 distinct non-null values.
      3. The last column of the frame.

    Column labels are normalised through ``str()`` so frames with non-string
    labels (e.g. integer headers) do not crash the name lookup.

    Returns the matching label exactly as it appears in ``df.columns``.
    """
    # 1) by known names (case-insensitive, tolerant of stray whitespace in headers)
    by_normalized = {str(col).strip().lower(): col for col in df.columns}
    for name in POSSIBLE_TARGET_NAMES:
        key = name.lower()
        if key in by_normalized:
            return by_normalized[key]
    # 2) choose a column with few unique values (2-10) as a likely label;
    #    nunique() is evaluated once per column
    candidates = [col for col in df.columns if 2 <= df[col].nunique() <= 10]
    if candidates:
        return candidates[-1]
    # 3) fallback: last column
    return df.columns[-1]

def _to_dense(x):
    """Return ``x`` densified if it is sparse; pass dense input through unchanged."""
    # ColumnTransformer only emits a sparse matrix when the stacked output is
    # sparse enough; otherwise the input here is already dense and has no
    # ``toarray`` method.
    return x.toarray() if hasattr(x, "toarray") else x


def build_preprocessor(num_cols, cat_cols, for_pca=False):
    """Create the preprocessing step shared by both models.

    Numeric columns are median-imputed and standardised; categorical columns
    are imputed with the most frequent value and one-hot encoded (unknown
    categories seen at predict time are ignored).

    Parameters
    ----------
    num_cols : list
        Names of the numeric feature columns.
    cat_cols : list
        Names of the categorical feature columns.
    for_pca : bool, default False
        If True, the ColumnTransformer is wrapped in a Pipeline
        (steps ``"ct"`` and ``"to_dense"``) whose output is always dense,
        because PCA cannot consume sparse input.

    Returns
    -------
    ColumnTransformer, or a Pipeline wrapping it when ``for_pca`` is True.
    """
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ])
    pre = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ])
    if for_pca:
        # A named function (rather than a lambda) keeps the pipeline picklable,
        # and _to_dense tolerates output that is already dense.
        pre = Pipeline([
            ("ct", pre),
            ("to_dense", FunctionTransformer(_to_dense, accept_sparse=True))
        ])
    return pre

def evaluate(model, X_test, y_test, prefix=""):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test, y_test : held-out features and true labels.
    prefix : str
        Accepted for call-site labelling; not used by this function.

    Returns
    -------
    dict with keys ``accuracy``, ``precision_macro``, ``recall_macro``,
    ``f1_macro`` (macro-averaged, undefined scores reported as 0),
    ``confusion_matrix`` and ``report_text``.
    """
    predicted = model.predict(X_test)
    macro = {"average": "macro", "zero_division": 0}

    scores = {"accuracy": accuracy_score(y_test, predicted)}
    for key, metric in (
        ("precision_macro", precision_score),
        ("recall_macro", recall_score),
        ("f1_macro", f1_score),
    ):
        scores[key] = metric(y_test, predicted, **macro)

    scores["confusion_matrix"] = confusion_matrix(y_test, predicted)
    scores["report_text"] = classification_report(y_test, predicted, zero_division=0)
    return scores

def get_feature_names_after_preprocess(pipeline: Pipeline, num_cols, cat_cols):
    """Return the feature names produced by the pipeline's preprocessing step.

    Parameters
    ----------
    pipeline : Pipeline
        Fitted pipeline with a ``"preprocess"`` step that is either a
        ColumnTransformer or a Pipeline containing one under the name ``"ct"``.
    num_cols : list
        Numeric column names; they keep their names and come first.
    cat_cols : list
        Categorical column names; each expands to its one-hot feature names.

    Returns
    -------
    list
        Numeric names followed by the one-hot names of the fitted encoder.
    """
    pre = pipeline.named_steps["preprocess"]
    # If wrapped into Pipeline, unwrap
    if isinstance(pre, Pipeline):
        pre = pre.named_steps["ct"] if "ct" in pre.named_steps else pre

    # numeric names
    names = list(num_cols)
    # categorical one-hot names; with no categorical columns there is no fitted
    # encoder to query, so asking it for names would raise instead of
    # contributing zero features.
    if len(cat_cols) > 0:
        ohe = pre.named_transformers_["cat"].named_steps["ohe"]
        names.extend(ohe.get_feature_names_out(cat_cols))
    return names

def save_confusion_matrix(cm: np.ndarray, path: Path):
    """Write the confusion matrix ``cm`` to ``path`` as CSV.

    The file has a header row of positional column labels (0, 1, ...) and no
    row-index column.
    """
    pd.DataFrame(cm).to_csv(path, index=False)

# --------------------- Main ---------------------

def main(data_file="/content/Lung_Cancer_dataset.csv", target_column=None,
         test_size_ratio=0.2, random_state_value=42):
    """Train and compare a baseline decision tree with a PCA + decision tree.

    Reports, confusion matrices, feature importances and a metric comparison
    table are written to the ``outputs`` directory, and a summary is printed.

    Parameters
    ----------
    data_file : str or Path
        CSV file to load.
    target_column : str or None
        Label column; when None it is guessed with ``guess_target_column``.
    test_size_ratio : float
        Fraction of rows held out for testing.
    random_state_value : int
        Seed used for the split, PCA and both trees.

    Raises
    ------
    FileNotFoundError
        If ``data_file`` does not exist.
    ValueError
        If the target column is not present in the CSV.
    """
    data_path = Path(data_file)
    # Explicit exceptions instead of assert: asserts are removed under `python -O`.
    if not data_path.exists():
        raise FileNotFoundError(f"Data file not found: {data_path.resolve()}")

    out_dir = Path("outputs")
    out_dir.mkdir(exist_ok=True, parents=True)

    # Load
    df = pd.read_csv(data_path)
    print(f"Loaded: {data_path} -> shape {df.shape}")
    # Pick target
    target = target_column or guess_target_column(df)
    if target not in df.columns:
        raise ValueError(
            f"Target column '{target}' not found in CSV. "
            "Pass target_column=<name> to main()."
        )
    print(f"Using target column: {target}")
    print("Class distribution:")
    print(df[target].value_counts(dropna=False))

    # Rows without a label cannot be used for training or scoring.
    labeled = df.dropna(subset=[target])
    n_unlabeled = len(df) - len(labeled)
    if n_unlabeled:
        print(f"Dropping {n_unlabeled} row(s) with a missing target value.")

    # Features/label split
    X = labeled.drop(columns=[target])
    y = labeled[target]

    # Identify column types
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
    print(f"Numeric cols: {len(num_cols)} | Categorical cols: {len(cat_cols)}")
    # NOTE(review): the recorded run kept 33 PCA components from 2 categorical
    # columns, which suggests near-unique string columns (identifiers?) are being
    # one-hot encoded — confirm they are meaningful features.

    # Stratify only when it is feasible: a stratified split needs at least two
    # samples in every class.
    class_counts = y.value_counts()
    can_stratify = len(class_counts) <= 20 and class_counts.min() >= 2
    if not can_stratify:
        print("Splitting without stratification (too many classes or a class with < 2 samples).")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size_ratio,
        random_state=random_state_value,
        stratify=y if can_stratify else None,
    )

    # ---------- Baseline Decision Tree ----------
    baseline_pre = build_preprocessor(num_cols, cat_cols, for_pca=False)
    baseline_clf = Pipeline([
        ("preprocess", baseline_pre),
        ("dt", DecisionTreeClassifier(random_state=random_state_value))
    ])
    baseline_clf.fit(X_train, y_train)
    base_eval = evaluate(baseline_clf, X_test, y_test, prefix="baseline")

    # Save baseline artifacts
    (out_dir/"baseline_report.txt").write_text(base_eval["report_text"])
    save_confusion_matrix(base_eval["confusion_matrix"], out_dir/"confusion_matrix_baseline.csv")

    # Feature importance (after OHE) — best effort; a failure here must not
    # abort the comparison, but it is reported and reflected in the file list.
    importance_saved = False
    try:
        feat_names = get_feature_names_after_preprocess(baseline_clf, num_cols, cat_cols)
        importances = baseline_clf.named_steps["dt"].feature_importances_
        feat_imp_df = pd.DataFrame({"feature": feat_names, "importance": importances})
        feat_imp_df = feat_imp_df.sort_values("importance", ascending=False)
        # Only the 50 most important features are exported.
        feat_imp_df.head(50).to_csv(out_dir/"feature_importance_baseline.csv", index=False)
        importance_saved = True
    except Exception as e:
        print("Could not extract feature importances (skipping):", e)

    # ---------- PCA + Decision Tree ----------
    pca_pre = build_preprocessor(num_cols, cat_cols, for_pca=True)
    pca_clf = Pipeline([
        ("preprocess", pca_pre),
        # A float n_components keeps enough components to explain 95% of the variance.
        ("pca", PCA(n_components=0.95, random_state=random_state_value)),
        ("dt", DecisionTreeClassifier(random_state=random_state_value))
    ])
    pca_clf.fit(X_train, y_train)
    pca_eval = evaluate(pca_clf, X_test, y_test, prefix="pca")

    # Save PCA artifacts
    (out_dir/"pca_report.txt").write_text(pca_eval["report_text"])
    save_confusion_matrix(pca_eval["confusion_matrix"], out_dir/"confusion_matrix_pca.csv")

    # PCA details
    fitted_pca = pca_clf.named_steps["pca"]
    n_components = fitted_pca.n_components_
    explained = fitted_pca.explained_variance_ratio_.sum()
    print(f"PCA kept components: {n_components} | Total variance explained: {explained:.3f}")

    # ---------- Comparison table ----------
    metric_keys = ("accuracy", "precision_macro", "recall_macro", "f1_macro")
    compare = pd.DataFrame([
        {"model": label, **{key: scores[key] for key in metric_keys}}
        for label, scores in (("Baseline DT", base_eval), ("PCA + DT", pca_eval))
    ])
    compare.to_csv(out_dir/"metrics_compare.csv", index=False)

    # Print summary
    print("\n=== Summary ===")
    print(compare.to_string(index=False))
    print(f"\nConfusion matrices and reports saved to: {out_dir.resolve()}")
    saved_files = ["baseline_report.txt", "pca_report.txt"]
    if importance_saved:
        saved_files.append("feature_importance_baseline.csv")
    saved_files += ["metrics_compare.csv", "confusion_matrix_*.csv"]
    print("Files: " + ", ".join(saved_files))

# Run the experiment when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()

Loaded: /content/Lung_Cancer_dataset.csv -> shape (59, 7)
Using target column: Result
Class distribution:
Result
0    31
1    28
Name: count, dtype: int64
Numeric cols: 4 | Categorical cols: 2
PCA kept components: 33 | Total variance explained: 0.953

=== Summary ===
      model  accuracy  precision_macro  recall_macro  f1_macro
Baseline DT  0.833333         0.875000      0.833333  0.828571
   PCA + DT  0.916667         0.928571      0.916667  0.916084

Confusion matrices and reports saved to: /content/outputs
Files: baseline_report.txt, pca_report.txt, feature_importance_baseline.csv, metrics_compare.csv, confusion_matrix_*.csv


In [17]:
# NOTE(review): in the recorded run this URL answered "404 Not Found", so no
# dataset was downloaded. The file name used here (lung_cancer_examples.csv) also
# differs from the path main() reads (/content/Lung_Cancer_dataset.csv), and this
# cell runs after the training cell — TODO confirm the intended source, file name
# and cell order.
!wget https://raw.githubusercontent.com/plotly/datasets/master/lung_cancer_examples.csv -O lung_cancer_examples.csv

--2025-08-20 07:01:14--  https://raw.githubusercontent.com/plotly/datasets/master/lung_cancer_examples.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-08-20 07:01:14 ERROR 404: Not Found.

