In [38]:
# ============================================================================
# MEMORY-EFFICIENT PREPROCESSING FOR LARGE DATASETS
# Optimized for 200K rows × 10K columns
# ============================================================================

import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split, KFold
import warnings
warnings.filterwarnings('ignore')


In [39]:
# ============================================================================
# 1. DEFINE NON-MEDICAL COLUMNS (Allowed Features)
# ============================================================================

NON_MEDICAL_COLUMNS = [
    "NACCID", "BIRTHMO", "BIRTHYR", "SEX", "HISPANIC", "RACE", "EDUC", 
    "MARISTAT", "HANDED", "INBIRMO", "INBIRYR", "INSEX", "INRELTO", "INKNOWN",
    "HEIGHT", "WEIGHT", "BPSYS", "BPDIAS", "HRATE", "VISION", "VISCORR", "VISWCORR", "HEARING", 
    "HEARAID", "HEARWAID", "PACKET", "FORMVER", "VISITMO", "VISITDAY", 
    "VISITYR", "NACCVNUM", "NACCAVST", "NACCNVST", "NACCDIED", "NACCMOD", 
    "NACCYOD", "TELCOV", "TELMOD"
]

TARGET_COLUMN = 'DEMENTED'

In [40]:
# ============================================================================
# 2. MEMORY-EFFICIENT DATA LOADING
# ============================================================================

print("\n" + "="*80)
print("LOADING DATA (MEMORY-EFFICIENT)")
print("="*80)

# Load in chunks to reduce memory
columns_to_load = NON_MEDICAL_COLUMNS + [TARGET_COLUMN]
chunk_size = 50000

chunks = []
for chunk in pd.read_csv('Dementia Prediction Dataset.csv', 
                          usecols=columns_to_load,
                          chunksize=chunk_size,
                          low_memory=False):
    # Optimize dtypes immediately
    for col in chunk.columns:
        if chunk[col].dtype == 'int64':
            chunk[col] = chunk[col].astype('int32')
        elif chunk[col].dtype == 'float64':
            chunk[col] = chunk[col].astype('float32')
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
del chunks
gc.collect()

print(f"Dataset loaded: {df.shape}")
print(f"Memory: {df.memory_usage().sum() / 1024**2:.2f} MB")


LOADING DATA (MEMORY-EFFICIENT)
Dataset loaded: (195196, 39)
Memory: 33.51 MB


In [41]:
# ============================================================================
# 3. INITIAL EXPLORATION & CLEANING
# ============================================================================

print("\n" + "="*80)
print("DATA CLEANING")
print("="*80)

print(f"Target Distribution:\n{df[TARGET_COLUMN].value_counts()}")

# Separate features and target
X = df.drop([TARGET_COLUMN, 'NACCID'], axis=1)
y = df[TARGET_COLUMN]
patient_ids = df['NACCID']
del df
gc.collect()


DATA CLEANING
Target Distribution:
DEMENTED
0    137606
1     57590
Name: count, dtype: int64


0

In [42]:
# ============================================================================
# 4. HANDLE SPECIAL CODES & MISSING VALUES
# ============================================================================

print("\n" + "="*80)
print("HANDLING MISSING VALUES")
print("="*80)

# Replace special codes
special_codes = [-4, 88, 99, 888, 9999]
X = X.replace(special_codes, np.nan)

# Drop columns with >70% missing
threshold = 0.7
missing_ratio = X.isnull().sum() / len(X)
cols_to_drop = missing_ratio[missing_ratio > threshold].index.tolist()

if cols_to_drop:
    print(f"Dropping {len(cols_to_drop)} columns with >{threshold*100}% missing")
    X = X.drop(columns=cols_to_drop)
    gc.collect()

print(f"Remaining features: {X.shape[1]}")


HANDLING MISSING VALUES
Dropping 4 columns with >70.0% missing
Remaining features: 33


In [43]:
# ============================================================================
# 5. FEATURE ENGINEERING
# ============================================================================

print("\n" + "="*80)
print("FEATURE ENGINEERING")
print("="*80)

# Age
if 'BIRTHYR' in X.columns and 'VISITYR' in X.columns:
    X['AGE_AT_VISIT'] = (X['VISITYR'] - X['BIRTHYR']).astype('float32')

# BMI
if 'HEIGHT' in X.columns and 'WEIGHT' in X.columns:
    X['BMI'] = ((X['WEIGHT'] * 0.453592) / ((X['HEIGHT'] * 0.0254) ** 2)).clip(10, 60).astype('float32')

# Education level
if 'EDUC' in X.columns:
    X['EDUC_LEVEL'] = pd.cut(X['EDUC'], bins=[0, 12, 16, 20, 99], labels=[0, 1, 2, 3]).astype('float32')

# Smoking intensity
if 'SMOKYRS' in X.columns and 'PACKSPER' in X.columns:
    X['PACK_YEARS'] = (X['SMOKYRS'] * X['PACKSPER']).astype('float32')

# Hypertension
if 'BPSYS' in X.columns and 'BPDIAS' in X.columns:
    X['HIGH_BP'] = ((X['BPSYS'] > 140) | (X['BPDIAS'] > 90)).astype('int8')

# ADL impairment
adl_cols = ['BILLS', 'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 
            'MEALPREP', 'EVENTS', 'PAYATTN', 'REMDATES', 'TRAVEL']
adl_present = [col for col in adl_cols if col in X.columns]
if adl_present:
    X['ADL_IMPAIRMENT'] = X[adl_present].sum(axis=1).astype('float32')

# Depression score
gds_cols = ['SATIS', 'DROPACT', 'EMPTY', 'BORED', 'SPIRITS',
            'AFRAID', 'HAPPY', 'HELPLESS', 'STAYHOME', 'MEMPROB',
            'WONDRFUL', 'WRTHLESS', 'ENERGY', 'HOPELESS', 'BETTER']
gds_present = [col for col in gds_cols if col in X.columns]
if gds_present:
    X['DEPRESSION_SCORE'] = X[gds_present].sum(axis=1).astype('float32')

# NPI total
npi_sev_cols = [col for col in X.columns if 'SEV' in col]
if npi_sev_cols:
    X['NPI_TOTAL'] = X[npi_sev_cols].sum(axis=1).astype('float32')

print(f"Features after engineering: {X.shape[1]}")
gc.collect()


FEATURE ENGINEERING
Features after engineering: 37


0

In [44]:
# ============================================================================
# 6. IMPUTATION (MEMORY-EFFICIENT)
# ============================================================================

print("\n" + "="*80)
print("IMPUTATION")
print("="*80)

# Batch imputation to save memory
batch_size = 20
for i in range(0, len(X.columns), batch_size):
    batch_cols = X.columns[i:i+batch_size]
    for col in batch_cols:
        if X[col].isnull().sum() > 0:
            X[col].fillna(X[col].median(), inplace=True)

print(f"Missing values after imputation: {X.isnull().sum().sum()}")


IMPUTATION
Missing values after imputation: 0


In [45]:
# ============================================================================
# 7. OUTLIER HANDLING
# ============================================================================

print("\n" + "="*80)
print("OUTLIER CAPPING")
print("="*80)

for col in X.columns:
    if X[col].dtype in ['float32', 'int32', 'int8']:
        lower = X[col].quantile(0.01)
        upper = X[col].quantile(0.99)
        X[col] = X[col].clip(lower, upper)

gc.collect()


OUTLIER CAPPING


0

In [46]:
# ============================================================================
# 8. TRAIN-TEST SPLIT (EARLY TO PREVENT LEAKAGE)
# ============================================================================

print("\n" + "="*80)
print("TRAIN-TEST SPLIT")
print("="*80)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train: {X_train.shape}, Test: {X_test.shape}")
del X, y
gc.collect()



TRAIN-TEST SPLIT
Train: (156156, 37), Test: (39040, 37)


0

In [47]:
# ============================================================================
# 9. MEAN ENCODING (K-FOLD TO PREVENT LEAKAGE)
# ============================================================================

print("\n" + "="*80)
print("MEAN ENCODING")
print("="*80)

def mean_encode_feature(X_tr, X_te, y_tr, feature):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_encoded = np.zeros(len(X_tr), dtype='float32')
    
    for train_idx, val_idx in kf.split(X_tr):
        means = pd.concat([X_tr.iloc[train_idx], y_tr.iloc[train_idx]], axis=1).groupby(feature)['DEMENTED'].mean()
        train_encoded[val_idx] = X_tr.iloc[val_idx][feature].map(means).fillna(y_tr.mean()).values
    
    global_means = pd.concat([X_tr, y_tr], axis=1).groupby(feature)['DEMENTED'].mean()
    test_encoded = X_te[feature].map(global_means).fillna(y_tr.mean()).astype('float32')
    
    return train_encoded, test_encoded

if 'SEX' in X_train.columns:
    X_train['SEX_mean'], X_test['SEX_mean'] = mean_encode_feature(X_train, X_test, y_train, 'SEX')
    print("✓ Mean encoded SEX")

gc.collect()


MEAN ENCODING
✓ Mean encoded SEX


0

In [48]:
# ============================================================================
# 10. FEATURE ENGINEERING & SCALING
# ============================================================================

print("\n" + "="*80)
print("CATEGORICAL ENCODING AND FEATURE SCALING")
print("="*80)

# Identify categorical columns (non-numeric)
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Encoding categorical columns: {cat_cols}")

# One-hot encode categorical features
X_train = pd.get_dummies(X_train, columns=cat_cols, drop_first=True)
X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)

# Align columns to ensure test set matches training set structure
common_cols = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_cols]
X_test = X_test.reindex(columns=common_cols, fill_value=0)

# Now perform scaling on fully numeric data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train).astype('float32'),
    columns=X_train.columns,
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test).astype('float32'),
    columns=X_test.columns,
    index=X_test.index
)

del X_train, X_test
gc.collect()


CATEGORICAL ENCODING AND FEATURE SCALING
Encoding categorical columns: ['PACKET']


0

In [49]:
# ============================================================================
# 11. VARIANCE THRESHOLD
# ============================================================================

print("\n" + "="*80)
print("VARIANCE-BASED FEATURE SELECTION")
print("="*80)

var_selector = VarianceThreshold(threshold=0.01)
X_train_var = var_selector.fit_transform(X_train_scaled)
X_test_var = var_selector.transform(X_test_scaled)

selected_features = X_train_scaled.columns[var_selector.get_support()].tolist()
X_train_final = pd.DataFrame(X_train_var, columns=selected_features, index=X_train_scaled.index)
X_test_final = pd.DataFrame(X_test_var, columns=selected_features, index=X_test_scaled.index)

del X_train_scaled, X_test_scaled
gc.collect()

print(f"Features after variance selection: {len(selected_features)}")


VARIANCE-BASED FEATURE SELECTION
Features after variance selection: 40


In [50]:
# ============================================================================
# 12. CORRELATION-BASED FEATURE SELECTION (MEMORY-EFFICIENT)
# ============================================================================

print("\n" + "="*80)
print("CORRELATION-BASED SELECTION")
print("="*80)

# Process correlation in batches to save memory
corr_threshold = 0.95
cols_to_drop = set()

# Calculate correlation matrix in chunks
chunk_size = 50
for i in range(0, len(X_train_final.columns), chunk_size):
    end_idx = min(i + chunk_size, len(X_train_final.columns))
    chunk_cols = X_train_final.columns[i:end_idx]
    
    corr_chunk = X_train_final[chunk_cols].corr().abs()
    
    for col in chunk_cols:
        if col in cols_to_drop:
            continue
        correlated = corr_chunk.index[(corr_chunk[col] > corr_threshold) & (corr_chunk.index != col)]
        cols_to_drop.update(correlated)
    
    gc.collect()

cols_to_drop = list(cols_to_drop)
if cols_to_drop:
    X_train_final = X_train_final.drop(columns=cols_to_drop)
    X_test_final = X_test_final.drop(columns=cols_to_drop)
    print(f"Removed {len(cols_to_drop)} highly correlated features")

gc.collect()


CORRELATION-BASED SELECTION
Removed 3 highly correlated features


0

In [51]:
# ============================================================================
# 13. POLYNOMIAL FEATURES (TOP FEATURES ONLY)
# ============================================================================

print("\n" + "="*80)
print("POLYNOMIAL FEATURES (TOP 5)")
print("="*80)

from sklearn.preprocessing import PolynomialFeatures

# Select top 5 most important features for polynomial expansion
top_features = ['AGE_AT_VISIT', 'EDUC', 'BMI', 'CDRSUM', 'CDRGLOB']
top_present = [f for f in top_features if f in X_train_final.columns]

if len(top_present) >= 2:
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
    
    X_train_poly = poly.fit_transform(X_train_final[top_present])
    X_test_poly = poly.transform(X_test_final[top_present])
    
    poly_features = [f for f in poly.get_feature_names_out(top_present) if ' ' in f]
    
    X_train_poly_df = pd.DataFrame(
        X_train_poly[:, len(top_present):],
        columns=poly_features,
        index=X_train_final.index
    ).astype('float32')
    
    X_test_poly_df = pd.DataFrame(
        X_test_poly[:, len(top_present):],
        columns=poly_features,
        index=X_test_final.index
    ).astype('float32')
    
    X_train_final = pd.concat([X_train_final, X_train_poly_df], axis=1)
    X_test_final = pd.concat([X_test_final, X_test_poly_df], axis=1)
    
    print(f"✓ Created {len(poly_features)} interaction features")
    del X_train_poly_df, X_test_poly_df
    gc.collect()



POLYNOMIAL FEATURES (TOP 5)
✓ Created 3 interaction features


In [52]:
# ============================================================================
# 14. SAVE PREPROCESSED DATA
# ============================================================================

print("\n" + "="*80)
print("SAVING PREPROCESSED DATA")
print("="*80)

# Save train and test separately to manage memory
X_train_final.to_csv('Preprocessed Data/preprocessed_train.csv', index=False)
X_test_final.to_csv('Preprocessed Data/preprocessed_test.csv', index=False)
y_train.to_csv('Preprocessed Data/y_train.csv', index=False, header=True)
y_test.to_csv('Preprocessed Data/y_test.csv', index=False, header=True)

print(f"✓ Train saved: {X_train_final.shape}")
print(f"✓ Test saved: {X_test_final.shape}")
print(f"✓ Final feature count: {X_train_final.shape[1]}")

# Save feature names
with open('Preprocessed Data/feature_names.txt', 'w') as f:
    for i, feat in enumerate(X_train_final.columns, 1):
        f.write(f"{i}. {feat}\n")

print("\n" + "="*80)
print("PREPROCESSING COMPLETE!")
print("="*80)
print(f"""
Final Statistics:
- Train samples: {len(X_train_final):,}
- Test samples: {len(X_test_final):,}
- Features: {X_train_final.shape[1]}
- Memory usage: {(X_train_final.memory_usage().sum() + X_test_final.memory_usage().sum()) / 1024**2:.2f} MB
- Target balance: {y_train.value_counts(normalize=True)[1]:.1%} dementia
""")


SAVING PREPROCESSED DATA
✓ Train saved: (156156, 40)
✓ Test saved: (39040, 40)
✓ Final feature count: 40

PREPROCESSING COMPLETE!

Final Statistics:
- Train samples: 156,156
- Test samples: 39,040
- Features: 40
- Memory usage: 31.27 MB
- Target balance: 29.5% dementia

