NEW APPROACH

DATA PREPROCESSING 

In [21]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

np.random.seed(42)

CSV_PATH = "/kaggle/input/behavioral-risk-factor-surveillance-system/2015.csv"

USECOLS = [
    'DIABETE3','_RFHYPE5','TOLDHI2','_CHOLCHK','_BMI5','SMOKE100',
    'CVDSTRK3','_MICHD','_TOTINDA','_FRTLT1','_VEGLT1','_RFDRHV5',
    'HLTHPLN1','MEDCOST','GENHLTH','MENTHLTH','PHYSHLTH','DIFFWALK',
    'SEX','_AGEG5YR','EDUCA','INCOME2'
]

# ---------------------------
# 0) Load + baseline metrics
# ---------------------------
raw = pd.read_csv(CSV_PATH, usecols=USECOLS, low_memory=False)
print("=== LOAD ===")
print(f"Raw shape: {raw.shape}")
print("Raw DIABETE3 (codes) value_counts:\n", raw['DIABETE3'].value_counts(dropna=False).sort_index())
print("-" * 60)

df = raw.copy()

def drop_report(df, mask_keep, desc):
    """Keep rows where mask_keep is True; report rows dropped."""
    before = len(df)
    df = df[mask_keep].copy()
    after = len(df)
    print(f"{desc}: kept {after} / dropped {before - after}")
    return df

# Keep a copy of BMI before rounding for comparison
_BMI5_before = df['_BMI5'].copy()

# ------------------------------------------
# 1) BRFSS recodes / drops (with logging)
# ------------------------------------------
print("=== CLEANING & RECODING ===")

df = drop_report(df, ~df['DIABETE3'].isin([7,9]), "DIABETE3 exclude {7,9}")
df['DIABETE3'] = df['DIABETE3'].replace({2:0, 3:0, 1:2, 4:1})
print("DIABETE3 (mapped to 0/1/2) value_counts:\n", df['DIABETE3'].value_counts().sort_index())

df = drop_report(df, df['_RFHYPE5'] != 9, "_RFHYPE5 != 9")
df['_RFHYPE5'] = df['_RFHYPE5'].replace({1:0, 2:1})

df = drop_report(df, ~df['TOLDHI2'].isin([7,9]), "TOLDHI2 exclude {7,9}")
df['TOLDHI2'] = df['TOLDHI2'].replace({2:0})

df = drop_report(df, df['_CHOLCHK'] != 9, "_CHOLCHK != 9")
df['_CHOLCHK'] = df['_CHOLCHK'].replace({2:0, 3:0})

print(f"_BMI5 before rounding (describe):\n{_BMI5_before.describe()}")
df['_BMI5'] = (df['_BMI5'] / 100).round(0)
print(f"BMI after rounding to nearest integer (describe):\n{df['_BMI5'].describe()}")

df = drop_report(df, ~df['SMOKE100'].isin([7,9]), "SMOKE100 exclude {7,9}")
df['SMOKE100'] = df['SMOKE100'].replace({2:0})

df = drop_report(df, ~df['CVDSTRK3'].isin([7,9]), "CVDSTRK3 exclude {7,9}")
df['CVDSTRK3'] = df['CVDSTRK3'].replace({2:0})

df['_MICHD'] = df['_MICHD'].replace({2:0})

df = drop_report(df, df['_TOTINDA'] != 9, "_TOTINDA != 9")
df['_TOTINDA'] = df['_TOTINDA'].replace({2:0})

df = drop_report(df, df['_FRTLT1'] != 9, "_FRTLT1 != 9")
df['_FRTLT1'] = df['_FRTLT1'].replace({2:0})

df = drop_report(df, df['_VEGLT1'] != 9, "_VEGLT1 != 9")
df['_VEGLT1'] = df['_VEGLT1'].replace({2:0})

df = drop_report(df, df['_RFDRHV5'] != 9, "_RFDRHV5 != 9")
df['_RFDRHV5'] = df['_RFDRHV5'].replace({1:0, 2:1})

df = drop_report(df, ~df['HLTHPLN1'].isin([7,9]), "HLTHPLN1 exclude {7,9}")
df['HLTHPLN1'] = df['HLTHPLN1'].replace({2:0})

df = drop_report(df, ~df['MEDCOST'].isin([7,9]), "MEDCOST exclude {7,9}")
df['MEDCOST'] = df['MEDCOST'].replace({2:0})

df = drop_report(df, ~df['GENHLTH'].isin([7,9]), "GENHLTH exclude {7,9}")

df['MENTHLTH'] = df['MENTHLTH'].replace({88:0})
df = drop_report(df, ~df['MENTHLTH'].isin([77,99]), "MENTHLTH exclude {77,99}")

df['PHYSHLTH'] = df['PHYSHLTH'].replace({88:0})
df = drop_report(df, ~df['PHYSHLTH'].isin([77,99]), "PHYSHLTH exclude {77,99}")

df = drop_report(df, ~df['DIFFWALK'].isin([7,9]), "DIFFWALK exclude {7,9}")
df['DIFFWALK'] = df['DIFFWALK'].replace({2:0})

df['SEX'] = df['SEX'].replace({2:0})
df = drop_report(df, df['_AGEG5YR'] != 14, "_AGEG5YR != 14")
df = drop_report(df, df['EDUCA'] != 9, "EDUCA != 9")
df = drop_report(df, ~df['INCOME2'].isin([77,99]), "INCOME2 exclude {77,99}")

before_na = len(df)
df = df.dropna()
print(f"dropna(): kept {len(df)} / dropped {before_na - len(df)}")

print("-" * 60)
print("Post-cleaning shape:", df.shape)

# ------------------------------------------
# 2) Rename columns 
# ------------------------------------------
df = df.rename(columns={
    'DIABETE3':'Diabetes_012','_RFHYPE5':'HighBP','TOLDHI2':'HighChol','_CHOLCHK':'CholCheck',
    '_BMI5':'BMI','SMOKE100':'Smoker','CVDSTRK3':'Stroke','_MICHD':'HeartDiseaseorAttack',
    '_TOTINDA':'PhysActivity','_FRTLT1':'Fruits','_VEGLT1':'Veggies','_RFDRHV5':'HvyAlcoholConsump',
    'HLTHPLN1':'AnyHealthcare','MEDCOST':'NoDocbcCost','GENHLTH':'GenHlth','MENTHLTH':'MentHlth',
    'PHYSHLTH':'PhysHlth','DIFFWALK':'DiffWalk','SEX':'Sex','_AGEG5YR':'Age',
    'EDUCA':'Education','INCOME2':'Income'
})
print("Renamed columns. Sample columns:\n", list(df.columns))

# ------------------------------------------
# 3) Save cleaned MULTICLASS
# ------------------------------------------
multi_out = "/kaggle/working/diabetes_012_health_indicators_BRFSS2015.csv"
df.to_csv(multi_out, index=False)
print(f"Saved multiclass cleaned CSV → {multi_out}")
print("Diabetes_012 value_counts:\n", df['Diabetes_012'].value_counts().sort_index())
print("-" * 60)

# ------------------------------------------
# 4) Make BINARY (1 = prediabetes/diabetes)
# ------------------------------------------
bin_df = df.copy()
bin_df['Diabetes_binary'] = bin_df['Diabetes_012'].replace({0:0, 1:1, 2:1})
bin_df = bin_df.drop(columns=['Diabetes_012'])

bin_out = "/kaggle/working/diabetes_binary_health_indicators_BRFSS2015.csv"
bin_df.to_csv(bin_out, index=False)
print(f"Saved binary cleaned CSV → {bin_out}")
print("Binary target value_counts (real distribution):\n", bin_df['Diabetes_binary'].value_counts())
print("Binary target proportion:\n", (bin_df['Diabetes_binary'].value_counts(normalize=True).sort_index()))
print("-" * 60)

# ----------------------------------------------------------------
# 5) Stratified split (NO manual 70:30 dataset construction)
# ----------------------------------------------------------------
X = bin_df.drop(columns=['Diabetes_binary'])
y = bin_df['Diabetes_binary']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

print("=== SPLIT ===")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("Train target counts:", Counter(y_train))
print("Test  target counts:", Counter(y_test))
print("Train target ratio:", {k: round(v/len(y_train), 4) for k, v in Counter(y_train).items()})
print("Test  target ratio:", {k: round(v/len(y_test), 4) for k, v in Counter(y_test).items()})
print("-" * 60)

# ----------------------------------------------------------------
# 6) Balance ONLY the training set (no external libs)
#    A) Random Over-Sampling  (duplicate minority)
#    B) Random Under-Sampling (trim majority)
# ----------------------------------------------------------------
def make_over_sampled_train(X_tr, y_tr):
    train = X_tr.copy()
    train['Diabetes_binary'] = y_tr.values
    counts = train['Diabetes_binary'].value_counts()
    maj_class = counts.idxmax()
    min_class = counts.idxmin()
    n_major = counts.max()
    n_minor = counts.min()

    # sample minority with replacement to match majority size
    minority_df = train[train['Diabetes_binary'] == min_class]
    majority_df = train[train['Diabetes_binary'] == maj_class]
    needed = n_major - n_minor
    boot = minority_df.sample(n=needed, replace=True, random_state=42)
    over_df = pd.concat([majority_df, minority_df, boot], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

    X_over = over_df.drop(columns=['Diabetes_binary'])
    y_over = over_df['Diabetes_binary']
    return X_over, y_over

def make_under_sampled_train(X_tr, y_tr):
    train = X_tr.copy()
    train['Diabetes_binary'] = y_tr.values
    counts = train['Diabetes_binary'].value_counts()
    maj_class = counts.idxmax()
    min_class = counts.idxmin()
    n_minor = counts.min()

    # sample majority without replacement down to minority size
    minority_df = train[train['Diabetes_binary'] == min_class]
    majority_df = train[train['Diabetes_binary'] == maj_class].sample(n=n_minor, replace=False, random_state=42)
    under_df = pd.concat([majority_df, minority_df], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

    X_under = under_df.drop(columns=['Diabetes_binary'])
    y_under = under_df['Diabetes_binary']
    return X_under, y_under

X_train_over, y_train_over = make_over_sampled_train(X_train, y_train)
X_train_under, y_train_under = make_under_sampled_train(X_train, y_train)

print("=== BALANCING (TRAIN ONLY) ===")
print("Original train counts:", Counter(y_train))
print("Over-sampled train counts:", Counter(y_train_over))
print("Under-sampled train counts:", Counter(y_train_under))

# Optional: scale after resampling if your models benefit from scaling
scaler = StandardScaler()
X_train_over_scaled  = scaler.fit_transform(X_train_over)
X_train_under_scaled = scaler.fit_transform(X_train_under)
X_test_scaled        = scaler.fit_transform(X_test)  # for fairness, fit on train in real pipelines

# Save balanced training sets (unscaled) for reproducible experiments
over_X_out = "/kaggle/working/X_train_over.csv"
over_y_out = "/kaggle/working/y_train_over.csv"
under_X_out = "/kaggle/working/X_train_under.csv"
under_y_out = "/kaggle/working/y_train_under.csv"

pd.DataFrame(X_train_over,  columns=X.columns).to_csv(over_X_out, index=False)
pd.DataFrame({"Diabetes_binary": y_train_over}).to_csv(over_y_out, index=False)

pd.DataFrame(X_train_under, columns=X.columns).to_csv(under_X_out, index=False)
pd.DataFrame({"Diabetes_binary": y_train_under}).to_csv(under_y_out, index=False)

print(f"Saved OVER-sampled train → {over_X_out} / {over_y_out}")
print(f"Saved UNDER-sampled train → {under_X_out} / {under_y_out}")
print("Saved all files in /kaggle/working/")


=== LOAD ===
Raw shape: (441456, 22)
Raw DIABETE3 (codes) value_counts:
 DIABETE3
1.0     57256
2.0      3608
3.0    372104
4.0      7690
7.0       598
9.0       193
NaN         7
Name: count, dtype: int64
------------------------------------------------------------
=== CLEANING & RECODING ===


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


DIABETE3 exclude {7,9}: kept 440665 / dropped 791
DIABETE3 (mapped to 0/1/2) value_counts:
 DIABETE3
0.0    375712
1.0      7690
2.0     57256
Name: count, dtype: int64
_RFHYPE5 != 9: kept 439407 / dropped 1258
TOLDHI2 exclude {7,9}: kept 436136 / dropped 3271
_CHOLCHK != 9: kept 421422 / dropped 14714
_BMI5 before rounding (describe):
count    405058.000000
mean       2804.242400
std         665.463433
min        1202.000000
25%        2373.000000
50%        2695.000000
75%        3090.000000
max        9995.000000
Name: _BMI5, dtype: float64
BMI after rounding to nearest integer (describe):
count    387818.000000
mean         28.066119
std           6.647624
min          12.000000
25%          24.000000
50%          27.000000
75%          31.000000
max          98.000000
Name: _BMI5, dtype: float64
SMOKE100 exclude {7,9}: kept 418475 / dropped 2947
CVDSTRK3 exclude {7,9}: kept 417464 / dropped 1011
_TOTINDA != 9: kept 383139 / dropped 34325
_FRTLT1 != 9: kept 372733 / dropped 10406
_