In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load Data
household_data = pd.read_parquet('Cleaned_Household_Data_filled.parquet')
person_data = pd.read_parquet('Updated_Clean_Person_Data_train.parquet')

# Feature Engineering - Household Level
household_data['Total_Assets'] = household_data.filter(like='Is_HH_Have').sum(axis=1)
household_data['Total_Online_Purchases'] = household_data.filter(like='Is_online').sum(axis=1)

# New Features
household_data['Assets_per_person'] = household_data['Total_Assets'] / household_data['HH Size (For FDQ)']
household_data['Online_Purchases_per_person'] = household_data['Total_Online_Purchases'] / household_data['HH Size (For FDQ)']
household_data['Asset_HHSize_Interaction'] = household_data['Total_Assets'] * household_data['HH Size (For FDQ)']
household_data['HH_Size_Squared'] = household_data['HH Size (For FDQ)'] ** 2

# Feature Engineering - Person Level Aggregation
person_agg = person_data.groupby('HH_ID').agg({
    'Age(in years)': ['mean', 'std'],
    'Gender': lambda x: (x == 1).sum(),  # Male count
    'Total year of education completed': 'mean',
    'Whether used internet from any location during last 30 days': 'sum'
}).reset_index()

# Rename columns
person_agg.columns = ['HH_ID', 'Avg_Age', 'Age_StdDev', 'Male_Count', 'Avg_Education', 'Internet_Users']

# Merge Household & Person Data
final_data = pd.merge(household_data, person_agg, on='HH_ID', how='left')

# Fix NaN Values in Numeric Columns
num_cols = ['HH Size (For FDQ)', 'Total_Assets', 'Total_Online_Purchases',
            'Assets_per_person', 'Online_Purchases_per_person', 'Asset_HHSize_Interaction',
            'HH_Size_Squared', 'Avg_Age', 'Age_StdDev', 'Avg_Education', 'Internet_Users'] # 11
final_data[num_cols] = final_data[num_cols].fillna(final_data[num_cols].mean())

# Apply Target Encoding on Categorical Columns
cat_cols = ['Sector', 'State', 'Household Type', 'Religion of the head of the household']
target_encoder = TargetEncoder()
final_data[cat_cols] = target_encoder.fit_transform(final_data[cat_cols], final_data['TotalExpense'])

# Apply Log Transformation to TotalExpense
final_data['Log_TotalExpense'] = np.log1p(final_data['TotalExpense'])  # log(1 + x) to handle zero values

# Scale Numerical Features
scaler = StandardScaler()
final_data[num_cols] = scaler.fit_transform(final_data[num_cols])

# PCA for Dimensionality Reduction (Captures Important Variance)
pca = PCA(n_components=5)
pca_features = pca.fit_transform(final_data[num_cols])
pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i+1}' for i in range(5)])

# K-Means Clustering (Creates Household Groups)
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
final_data['Household_Cluster'] = kmeans.fit_predict(final_data[num_cols])

# Combine PCA + Cluster Features
final_data = pd.concat([final_data, pca_df], axis=1)

# Drop Unnecessary Columns
final_data.drop(columns=['HH_ID', 'NCO_3D', 'NIC_5D', 'TotalExpense'], inplace=True)  # Drop original target

# Train/Test Split
X = final_data.drop(columns=['Log_TotalExpense'])
y = final_data['Log_TotalExpense']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure All Features are Numeric for XGBoost
non_numeric_cols = X_train.select_dtypes(exclude=[np.number]).columns
for col in non_numeric_cols:
    X_train[col] = X_train[col].astype('category').cat.codes
    X_test[col] = X_test[col].astype('category').cat.codes

# Train XGBoost Regressor with Hyperparameter Tuning
model = XGBRegressor(n_estimators=500, max_depth=10, learning_rate=0.05, colsample_bytree=0.8, random_state=42)
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Evaluate Model Performance
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")

R² Score: 0.7264
