# IT001 — PCA

Run this notebook to perform your assigned preprocessing step. Visuals are displayed inline, and outputs are saved under `results/outputs/`.

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_auc_score, confusion_matrix, classification_report,
                             RocCurveDisplay, PrecisionRecallDisplay)

# Absolute location of the raw CSV that this notebook reads.
DATA_PATH = "/content/Group_07/data/raw/Lung Cancer.csv"

# Read the file, then discard the "id" column; errors="ignore" makes the
# drop a no-op when the file has no such column.
df = pd.read_csv(DATA_PATH)
df = df.drop(columns=["id"], errors="ignore")
print("Loaded:", df.shape)

In [None]:
import matplotlib.pyplot as plt
def safe_hist(series, title, bins=30):
    """Show a histogram of the numeric values found in *series*.

    Values are coerced with ``pd.to_numeric(errors="coerce")`` and the
    resulting NaNs are dropped. When nothing numeric remains, an empty
    figure titled ``"<title> (no numeric)"`` is shown instead.

    Args:
        series: Values to plot; its ``name`` attribute labels the x axis.
        title: Figure title.
        bins: Bin count forwarded to ``plt.hist``.
    """
    plt.figure()
    numeric = pd.to_numeric(series, errors="coerce").dropna()

    if len(numeric) != 0:
        plt.hist(numeric, bins=bins)
        plt.title(title)
        plt.xlabel(series.name)
        plt.ylabel("Count")
    else:
        # Nothing plottable: keep the (empty) figure but say why.
        plt.title(f"{title} (no numeric)")

    plt.show()

def bar_top_counts(series, title, topn=10, rotate=45):
    """Show a bar chart of the most frequent values in *series*.

    Values are converted to strings before counting, and only the first
    ``topn`` entries of ``value_counts()`` are drawn.

    Args:
        series: Values to count.
        title: Figure title.
        topn: Number of leading ``value_counts()`` entries to plot.
        rotate: Rotation (degrees) applied to the x tick labels.
    """
    plt.figure()

    as_text = series.astype(str)
    top_counts = as_text.value_counts().head(topn)

    plt.bar(top_counts.index, top_counts.values)
    plt.title(title)
    # Right-align the rotated labels so they end under their bars.
    plt.xticks(rotation=rotate, ha="right")
    plt.tight_layout()
    plt.show()

def boxplot_two(before, after, title_before, title_after):
    """Show side-by-side box plots of two samples (outliers hidden).

    Both inputs are coerced with ``pd.to_numeric(errors="coerce")`` and
    NaNs are dropped before plotting.

    Args:
        before: Values for the first (left) box.
        after: Values for the second (right) box.
        title_before: Tick label for the first box.
        title_after: Tick label for the second box.
    """
    plt.figure()
    samples = [
        pd.to_numeric(values, errors="coerce").dropna()
        for values in (before, after)
    ]
    plt.boxplot(samples, showfliers=False)
    # The ``labels=`` keyword of boxplot is deprecated since matplotlib 3.9
    # (renamed ``tick_labels``). Labelling the ticks explicitly works on both
    # old and new versions; boxes sit at the default positions 1 and 2.
    plt.xticks([1, 2], [title_before, title_after])
    plt.title("Before vs After")
    plt.show()

In [None]:
import os

# Candidate numeric features; only those actually present in df are used.
num_all = [c for c in ["age","bmi","cholesterol_level","diagnosis_year","diagnosis_month",
                       "treatment_length_days","comorbidity_count","ever_smoked","passive_exposure"] if c in df.columns]

# Keep at most 5 components, and never more than the number of features.
k = min(5, len(num_all))

if k == 0:
    print("No numeric columns for PCA.")
    X_pca = pd.DataFrame(index=df.index)
else:
    # PCA centers its input but does not rescale it, so features measured on
    # larger numeric scales would dominate the components. Standardize each
    # column to zero mean / unit variance first.
    # NOTE(review): assumes the selected columns are numeric and free of
    # missing values; both StandardScaler-then-PCA steps need that - confirm.
    X_scaled = StandardScaler().fit_transform(df[num_all])
    pca = PCA(n_components=k).fit(X_scaled)
    X_pca = pd.DataFrame(
        pca.transform(X_scaled),
        columns=[f"pca_{i+1}" for i in range(k)],
        index=df.index,
    )

    # Cumulative explained-variance curve over the retained components.
    cum = pca.explained_variance_ratio_.cumsum()
    plt.figure()
    plt.plot(range(1, len(cum) + 1), cum, marker="o")
    plt.grid()
    plt.title("PCA variance")
    plt.xlabel("k")
    plt.ylabel("cum var")
    plt.show()

# Make sure the output folder exists; to_csv does not create directories.
pca_out_path = "/content/Group_07/results/outputs/pca_features.csv"
os.makedirs(os.path.dirname(pca_out_path), exist_ok=True)
X_pca.to_csv(pca_out_path, index=False)
print("Saved -> results/outputs/pca_features.csv")