In [4]:
import pandas as pd

# Read the three raw CSV tables (train features, train labels, test features).
X_train, y_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/train_features.csv",
        "../data/raw/train_labels.csv",
        "../data/raw/test_features.csv",
    )
)

# Report (rows, columns) for each table.
# NOTE(review): the recorded output shows X_train and y_train with different
# row counts — confirm the feature and label files really belong together.
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test :", X_test.shape)

X_train: (103023, 88)
y_train: (104234, 3)
X_test : (104234, 88)


In [7]:
# List the column names of the feature table, then of the label table.
print("X_train columns:")
print(list(X_train.columns))

print("\ny_train columns:")
print(list(y_train.columns))

X_train columns:
['hhid', 'com', 'weight', 'strata', 'utl_exp_ppp17', 'male', 'hsize', 'num_children5', 'num_children10', 'num_children18', 'age', 'owner', 'water', 'toilet', 'sewer', 'elect', 'water_source', 'sanitation_source', 'dweltyp', 'num_adult_female', 'num_adult_male', 'num_elderly', 'employed', 'sworkershh', 'share_secondary', 'educ_max', 'sfworkershh', 'any_nonagric', 'sector1d', 'region1', 'region2', 'region3', 'region4', 'region5', 'region6', 'region7', 'urban', 'consumed100', 'consumed200', 'consumed300', 'consumed400', 'consumed500', 'consumed600', 'consumed700', 'consumed800', 'consumed900', 'consumed1000', 'consumed1100', 'consumed1200', 'consumed1300', 'consumed1400', 'consumed1500', 'consumed1600', 'consumed1700', 'consumed1800', 'consumed1900', 'consumed2000', 'consumed2100', 'consumed2200', 'consumed2300', 'consumed2400', 'consumed2500', 'consumed2600', 'consumed2700', 'consumed2800', 'consumed2900', 'consumed3000', 'consumed3100', 'consumed3200', 'consumed3300', '

In [8]:
# Inner-join features and labels on the (survey_id, hhid) key pair.
# NOTE(review): the recorded output of this cell is an empty frame, i.e. no
# key pair is shared between the two tables as loaded.
train_df = pd.merge(X_train, y_train, how="inner", on=["survey_id", "hhid"])

print("train_df shape:", train_df.shape)
train_df.head()

train_df shape: (0, 89)


Unnamed: 0,hhid,com,weight,strata,utl_exp_ppp17,male,hsize,num_children5,num_children10,num_children18,...,consumed4300,consumed4400,consumed4500,consumed4600,consumed4700,consumed4800,consumed4900,consumed5000,survey_id,cons_ppp17


In [9]:
# Compare the dtypes of the join keys in both tables.
print(X_train.dtypes[["survey_id", "hhid"]])
print(y_train.dtypes[["survey_id", "hhid"]])

survey_id    int64
hhid         int64
dtype: object
survey_id    int64
hhid         int64
dtype: object


In [10]:
# Set of (survey_id, hhid) pairs that occur in both the features and the labels.
common = set(zip(X_train["survey_id"], X_train["hhid"])).intersection(
    zip(y_train["survey_id"], y_train["hhid"])
)

len(common)

0

In [14]:
# Print the pandas summary (dtypes, non-null counts) for train and test features.
for heading, frame in (("X_train info:", X_train), ("\nX_test info:", X_test)):
    print(heading)
    frame.info()

X_train info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103023 entries, 0 to 103022
Data columns (total 88 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   hhid               103023 non-null  int64  
 1   com                103023 non-null  int64  
 2   weight             103023 non-null  int64  
 3   strata             103023 non-null  int64  
 4   utl_exp_ppp17      102740 non-null  float64
 5   male               103023 non-null  object 
 6   hsize              103023 non-null  int64  
 7   num_children5      103023 non-null  int64  
 8   num_children10     103023 non-null  int64  
 9   num_children18     103023 non-null  int64  
 10  age                103023 non-null  int64  
 11  owner              103023 non-null  object 
 12  water              103023 non-null  object 
 13  toilet             103023 non-null  object 
 14  sewer              103023 non-null  object 
 15  elect              103023 non-null  o

In [16]:
# EDA only: stack train and test features row-wise, tagging every row with
# its origin in a `_set` column.
full_X = pd.concat(
    [
        frame.assign(_set=origin)
        for origin, frame in (("train", X_train), ("test", X_test))
    ]
)

In [17]:
# Percentage of missing values per column of the combined frame; only columns
# with at least one missing value are kept, largest share first.
missing_pct = (
    full_X.isna().sum().div(len(full_X)).mul(100)
    .loc[lambda pct: pct > 0]
    .sort_values(ascending=False)
)

missing_pct.head(15)

sector1d         14.338720
dweltyp           1.123243
utl_exp_ppp17     0.177557
consumed5000      0.082024
consumed4900      0.069479
consumed4400      0.066584
consumed4600      0.065137
consumed4800      0.064172
consumed4500      0.062724
consumed4700      0.062242
consumed4300      0.061759
consumed4200      0.051627
consumed4100      0.047767
employed          0.047767
consumed4000      0.046802
dtype: float64

In [19]:
# Missing-value handling. Every fill value is computed on X_train and then
# applied to both X_train and X_test.

# DROP very high-missing / useless column
# errors="ignore" keeps the cell re-runnable and avoids a KeyError when only
# one of the two frames still contains the column (the previous guard looked
# at X_train only but dropped from X_test as well).
X_train = X_train.drop(columns=["sector1d"], errors="ignore")
X_test  = X_test.drop(columns=["sector1d"], errors="ignore")


# CATEGORICAL → MODE
cat_cols = ["dweltyp", "employed"]

for col in cat_cols:
    mode_val = X_train[col].mode()[0]
    X_train[col] = X_train[col].fillna(mode_val)
    X_test[col]  = X_test[col].fillna(mode_val)


#  NUMERIC → MEDIAN
num_col = "utl_exp_ppp17"
median_val = X_train[num_col].median()

X_train[num_col] = X_train[num_col].fillna(median_val)
X_test[num_col]  = X_test[num_col].fillna(median_val)


# consumed* columns
# The dtype listing elsewhere in this notebook reports these columns as
# object (string) columns. Filling them with the integer 0 would mix ints
# into string columns, and a later `.str` normalisation turns non-string
# entries back into NaN. So: numeric consumed columns keep the 0 fill,
# non-numeric ones get the train mode like the other categoricals above.
consumed_cols = [c for c in X_train.columns if c.startswith("consumed")]

for col in consumed_cols:
    if pd.api.types.is_numeric_dtype(X_train[col]):
        fill_val = 0
    else:
        modes = X_train[col].mode()
        if modes.empty:
            # Column is entirely missing in train: there is no mode to learn.
            continue
        fill_val = modes[0]
    X_train[col] = X_train[col].fillna(fill_val)
    X_test[col]  = X_test[col].fillna(fill_val)


#  FINAL CHECK
# Columns not handled above may still contain missing values.
print("Train missing:", X_train.isna().sum().sum())
print("Test missing :", X_test.isna().sum().sum())

Train missing: 21
Test missing : 46


In [20]:
# Fill every object-dtype column with its most frequent training value,
# applying the same value to the test frame.
cat_cols_all = X_train.select_dtypes(include="object").columns

for col in cat_cols_all:
    mode_val = X_train[col].mode()[0]
    for frame in (X_train, X_test):
        frame[col] = frame[col].fillna(mode_val)


# FINAL FINAL CHECK
print("Train missing:", X_train.isna().to_numpy().sum())
print("Test missing :", X_test.isna().to_numpy().sum())

Train missing: 10
Test missing : 23


In [21]:
# Fill every int64/float64 column with its training median, applying the same
# value to the test frame.
num_cols_all = X_train.select_dtypes(include=["int64", "float64"]).columns

for col in num_cols_all:
    median_val = X_train[col].median()
    for frame in (X_train, X_test):
        frame[col] = frame[col].fillna(median_val)


# FINAL ABSOLUTE CHECK
print("Train missing:", X_train.isna().to_numpy().sum())
print("Test missing :", X_test.isna().to_numpy().sum())

Train missing: 0
Test missing : 0


In [24]:
# Object-dtype columns of the training features: how many, and which.
cat_cols = X_train.select_dtypes(include="object").columns
print(cat_cols.size)
print(list(cat_cols))

64
['male', 'owner', 'water', 'toilet', 'sewer', 'elect', 'water_source', 'sanitation_source', 'dweltyp', 'employed', 'educ_max', 'any_nonagric', 'sector1d', 'urban', 'consumed100', 'consumed200', 'consumed300', 'consumed400', 'consumed500', 'consumed600', 'consumed700', 'consumed800', 'consumed900', 'consumed1000', 'consumed1100', 'consumed1200', 'consumed1300', 'consumed1400', 'consumed1500', 'consumed1600', 'consumed1700', 'consumed1800', 'consumed1900', 'consumed2000', 'consumed2100', 'consumed2200', 'consumed2300', 'consumed2400', 'consumed2500', 'consumed2600', 'consumed2700', 'consumed2800', 'consumed2900', 'consumed3000', 'consumed3100', 'consumed3200', 'consumed3300', 'consumed3400', 'consumed3500', 'consumed3600', 'consumed3700', 'consumed3800', 'consumed3900', 'consumed4000', 'consumed4100', 'consumed4200', 'consumed4300', 'consumed4400', 'consumed4500', 'consumed4600', 'consumed4700', 'consumed4800', 'consumed4900', 'consumed5000']


In [25]:
# Mode-impute each column in cat_cols using the training mode for both frames,
# then print the remaining missing counts (train, test) for those columns.
for col in cat_cols:
    mode_val = X_train[col].mode()[0]
    for frame in (X_train, X_test):
        frame[col] = frame[col].fillna(mode_val)

train_cat_missing = X_train[cat_cols].isna().sum().sum()
test_cat_missing = X_test[cat_cols].isna().sum().sum()
print(train_cat_missing, test_cat_missing)

0 0


In [26]:
# Normalise the text of every column in cat_cols: lower-case, then strip
# surrounding whitespace, in both frames.
for col in cat_cols:
    for frame in (X_train, X_test):
        frame[col] = frame[col].str.lower().str.strip()

In [27]:
# Partition cat_cols by number of distinct training values (threshold: 10).
low_card, high_card = [], []
for c in cat_cols:
    bucket = low_card if X_train[c].nunique() <= 10 else high_card
    bucket.append(c)

print(len(low_card), len(high_card))

63 1


In [28]:
# Total number of missing cells in each feature frame.
print("Train missing:", X_train.isna().to_numpy().sum())
print("Test missing:", X_test.isna().to_numpy().sum())

Train missing: 293
Test missing: 108


In [31]:
# Training columns that still contain missing values, with their counts.
X_train.isna().sum().loc[lambda counts: counts > 0]

utl_exp_ppp17      283
share_secondary     10
dtype: int64

In [32]:
# Test columns that still contain missing values, with their counts.
X_test.isna().sum().loc[lambda counts: counts > 0]

utl_exp_ppp17      85
share_secondary    23
dtype: int64

In [33]:
# Training medians used to impute the two remaining numeric columns.
med_utl, med_share = (
    X_train[name].median() for name in ("utl_exp_ppp17", "share_secondary")
)

In [34]:
# Fill both frames with the training medians computed in the previous cell.
for col, fill_value in (("utl_exp_ppp17", med_utl), ("share_secondary", med_share)):
    X_train[col] = X_train[col].fillna(fill_value)
    X_test[col] = X_test[col].fillna(fill_value)

In [35]:
# Total number of missing cells in each feature frame after imputation.
print("Train missing:", X_train.isna().to_numpy().sum())
print("Test missing :", X_test.isna().to_numpy().sum())

Train missing: 0
Test missing : 0


In [43]:
# Take the target column straight from the label table, then show its shape
# next to the feature table's shape.
y = y_train.loc[:, "cons_ppp17"]

print(y.shape, X_train.shape, sep="\n")

(104234,)
(103023, 88)


In [44]:
# Align the target with the feature rows.
#
# Cutting the label vector to the first len(X_train) entries only pairs each
# feature row with its own label if both tables happen to be in the same
# order. Both tables carry (survey_id, hhid), so match on those keys first
# and use positional truncation only as an explicitly flagged fallback.
import warnings

label_lookup = X_train[["survey_id", "hhid"]].merge(
    y_train[["survey_id", "hhid", "cons_ppp17"]],
    on=["survey_id", "hhid"],
    how="left",        # keeps X_train's row order
    indicator=True,    # adds `_merge` so unmatched feature rows are visible
)
keys_align = (
    len(label_lookup) == len(X_train)              # no duplicated label keys
    and label_lookup["_merge"].eq("both").all()    # every feature row matched
)

if keys_align:
    y = pd.Series(
        label_lookup["cons_ppp17"].to_numpy(),
        index=X_train.index,
        name="cons_ppp17",
    )
else:
    warnings.warn(
        "Labels could not be matched to X_train on (survey_id, hhid); "
        "falling back to positional truncation, so features and labels "
        "may be misaligned."
    )
    y = y_train["cons_ppp17"].iloc[:len(X_train)]

assert len(y) == len(X_train), "features and labels must have the same number of rows"

print(X_train.shape)
print(y.shape)

(103023, 88)
(103023,)


In [45]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split of features and target with a fixed seed.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42)

# Shapes of the feature parts, then of the target parts.
for train_part, val_part in ((X_tr, X_val), (y_tr, y_val)):
    print(train_part.shape, val_part.shape)

(82418, 88) (20605, 88)
(82418,) (20605,)


In [47]:
# fix male column: turn the text labels into a 0/1 flag in both splits.
# NOTE(review): any value other than "male"/"female" becomes NaN under .map —
# confirm the column holds only these two labels.
if X_tr["male"].dtype == "object":
    sex_to_flag = {"male": 1, "female": 0}
    for split in (X_tr, X_val):
        split["male"] = split["male"].map(sex_to_flag)

In [49]:
# Columns of the training split that are still object dtype.
obj_cols = X_tr.select_dtypes(include=["object"]).columns
obj_cols

Index(['owner', 'water', 'toilet', 'sewer', 'elect', 'water_source',
       'sanitation_source', 'dweltyp', 'employed', 'educ_max', 'any_nonagric',
       'sector1d', 'urban', 'consumed100', 'consumed200', 'consumed300',
       'consumed400', 'consumed500', 'consumed600', 'consumed700',
       'consumed800', 'consumed900', 'consumed1000', 'consumed1100',
       'consumed1200', 'consumed1300', 'consumed1400', 'consumed1500',
       'consumed1600', 'consumed1700', 'consumed1800', 'consumed1900',
       'consumed2000', 'consumed2100', 'consumed2200', 'consumed2300',
       'consumed2400', 'consumed2500', 'consumed2600', 'consumed2700',
       'consumed2800', 'consumed2900', 'consumed3000', 'consumed3100',
       'consumed3200', 'consumed3300', 'consumed3400', 'consumed3500',
       'consumed3600', 'consumed3700', 'consumed3800', 'consumed3900',
       'consumed4000', 'consumed4100', 'consumed4200', 'consumed4300',
       'consumed4400', 'consumed4500', 'consumed4600', 'consumed4700',
    

In [50]:
# Integer-encode the object columns.
#
# The category → code mapping is learned on the training split and reused for
# the validation split. Encoding each split on its own assigns codes from
# that split's own sorted set of values, so the same string can end up with
# different integers in X_tr and X_val whenever the two sets differ.
for col in obj_cols:
    train_cat = X_tr[col].astype("category")
    shared_dtype = pd.CategoricalDtype(categories=train_cat.cat.categories)

    X_tr[col] = train_cat.cat.codes
    # Values that never occur in training (and missing values) get code -1.
    X_val[col] = X_val[col].astype(shared_dtype).cat.codes

In [51]:
# How many columns of the training split have each dtype.
dtype_counts = X_tr.dtypes.value_counts()
dtype_counts

int8       63
int64      21
float64     4
Name: count, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# The hold-out split already exists: it was created earlier from the same
# X_train / y with test_size=0.2 and random_state=42. Calling
# train_test_split again at this point would return fresh, un-encoded rows of
# X_train and discard the `male` mapping and the category codes that were
# written into X_tr / X_val, so the linear models fitted next would receive
# string columns. This cell therefore only re-checks the existing split.
assert len(X_tr) == len(y_tr) and len(X_val) == len(y_val)

print(X_tr.shape, X_val.shape)
print(y_tr.shape, y_val.shape)

(82418, 88) (20605, 88)
(82418,) (20605,)


In [52]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline: ordinary least squares on the training split, scored by RMSE on
# the validation split.
model = LinearRegression().fit(X_tr, y_tr)

val_pred = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=val_pred))

print("Baseline RMSE:", rmse)

Baseline RMSE: 10.386115096417404


In [53]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import numpy as np

# Ridge regression (alpha=1.0) on the same split; validation RMSE is the
# cell's displayed value.
ridge = Ridge(alpha=1.0).fit(X_tr, y_tr)

ridge_pred = ridge.predict(X_val)
ridge_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=ridge_pred))

ridge_rmse

np.float64(10.386112312142439)

In [54]:
from sklearn.linear_model import Lasso

# Lasso regression with a small penalty and a raised iteration cap;
# validation RMSE is the cell's displayed value.
lasso = Lasso(max_iter=10000, alpha=0.001).fit(X_tr, y_tr)

lasso_pred = lasso.predict(X_val)
lasso_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=lasso_pred))

lasso_rmse

np.float64(10.385786354848289)

In [56]:
from sklearn.model_selection import train_test_split

# Do not re-split here. The CatBoost model fitted in the next cell is given
# no cat_features, so it needs the numerically encoded X_tr / X_val produced
# earlier; calling train_test_split on X_train again would replace them with
# un-encoded rows containing string columns. (The recorded execution counts
# show the CatBoost cell was originally run before this one.) The split
# itself is unchanged — same inputs, test_size and seed — so only verify it.
assert len(X_tr) == len(y_tr) and len(X_val) == len(y_val)

print(X_tr.shape, X_val.shape)
print(y_tr.shape, y_val.shape)

(82418, 88) (20605, 88)
(82418,) (20605,)


In [55]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# CatBoost baseline without categorical-feature handling; the cell displays
# the validation RMSE.
cb_params = {
    "iterations": 500,
    "depth": 6,
    "learning_rate": 0.05,
    "loss_function": "RMSE",
    "verbose": 100,  # log every 100 iterations
}
cb = CatBoostRegressor(**cb_params)

cb.fit(X_tr, y_tr)
val_pred = cb.predict(X_val)
np.sqrt(mean_squared_error(y_true=y_val, y_pred=val_pred))

0:	learn: 9.9401680	total: 63.3ms	remaining: 31.6s
100:	learn: 9.3197445	total: 775ms	remaining: 3.06s
200:	learn: 9.1758134	total: 1.4s	remaining: 2.08s
300:	learn: 9.0352077	total: 2.01s	remaining: 1.33s
400:	learn: 8.9299424	total: 2.81s	remaining: 694ms
499:	learn: 8.8505073	total: 3.67s	remaining: 0us


np.float64(9.316823010643972)

In [None]:
from sklearn.model_selection import train_test_split

# Split again from X_train (same 80/20 ratio and seed), rebinding
# X_tr / X_val / y_tr / y_val to fresh subsets of X_train and y.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.2, random_state=42)

# Shapes of the feature parts, then of the target parts.
for train_part, val_part in ((X_tr, X_val), (y_tr, y_val)):
    print(train_part.shape, val_part.shape)

(82418, 88) (20605, 88)
(82418,) (20605,)


In [57]:
# log(1 + y) transform of the training and validation targets.
y_tr_log, y_val_log = (np.log1p(target) for target in (y_tr, y_val))

In [59]:
# keep only numeric columns (int64 / float64 as seen in the training split),
# selecting the same columns from both splits
num_cols = X_tr.select_dtypes(include=["int64", "float64"]).columns

X_tr_num = X_tr.loc[:, num_cols]
X_val_num = X_val.loc[:, num_cols]

In [60]:
from sklearn.linear_model import LinearRegression

# Linear regression on the log-transformed target; predictions are mapped
# back to the original scale with expm1.
model = LinearRegression().fit(X_tr_num, y_tr_log)

val_pred_log = model.predict(X_val_num)
val_pred = np.expm1(val_pred_log)

In [61]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Validation RMSE of the back-transformed predictions, on the original scale.
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=val_pred))
rmse

np.float64(10.68847505651696)

In [62]:
# Names of the object-dtype columns in the training split, as a plain list.
cat_cols = list(X_tr.select_dtypes(include="object").columns)
len(cat_cols)

64

In [63]:
from catboost import CatBoostRegressor

# CatBoost with native categorical handling. The validation split is passed
# as eval_set, and use_best_model=True keeps the iteration that scored best
# on it.
cb_cat_params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "depth": 8,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "random_seed": 42,
    "verbose": 100,
}
model = CatBoostRegressor(**cb_cat_params)

model.fit(
    X_tr,
    y_tr,
    cat_features=cat_cols,
    eval_set=(X_val, y_val),
    use_best_model=True,
)

0:	learn: 9.9280801	test: 10.3749201	best: 10.3749201 (0)	total: 101ms	remaining: 1m 41s
100:	learn: 9.2635146	test: 9.6622393	best: 9.6622393 (100)	total: 4.39s	remaining: 39.1s
200:	learn: 9.1453466	test: 9.5747128	best: 9.5746908 (199)	total: 7.57s	remaining: 30.1s
300:	learn: 8.9918707	test: 9.4539938	best: 9.4539938 (300)	total: 10.8s	remaining: 25.1s
400:	learn: 8.8685819	test: 9.3611824	best: 9.3611824 (400)	total: 14.3s	remaining: 21.4s
500:	learn: 8.8008808	test: 9.3193503	best: 9.3189634 (498)	total: 17.6s	remaining: 17.5s
600:	learn: 8.7487652	test: 9.2950294	best: 9.2950226 (599)	total: 20.8s	remaining: 13.8s
700:	learn: 8.6956516	test: 9.2754783	best: 9.2754262 (698)	total: 24.9s	remaining: 10.6s
800:	learn: 8.6521618	test: 9.2635020	best: 9.2635020 (800)	total: 28.4s	remaining: 7.04s
900:	learn: 8.5922872	test: 9.2492265	best: 9.2491331 (897)	total: 32s	remaining: 3.51s
999:	learn: 8.5374425	test: 9.2374176	best: 9.2373450 (997)	total: 35.2s	remaining: 0us

bestTest = 9.2

<catboost.core.CatBoostRegressor at 0x141fc65d0>

In [64]:
# log(1 + y) transform of the training and validation targets.
y_tr_log, y_val_log = (np.log1p(target) for target in (y_tr, y_val))

In [65]:
# Deeper, slower-learning CatBoost fitted on the log-transformed target.
# The eval metric is therefore RMSE on the log scale.
# NOTE(review): `weight` is passed as sample_weight while it is also still a
# column of X_tr (i.e. a model feature), and the eval_set carries no weights —
# confirm both are intended.
cb_log_params = {
    "iterations": 3000,
    "learning_rate": 0.03,
    "depth": 10,
    "l2_leaf_reg": 5,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "random_seed": 42,
    "verbose": 200,
}
model = CatBoostRegressor(**cb_log_params)

model.fit(
    X_tr,
    y_tr_log,
    cat_features=cat_cols,
    sample_weight=X_tr["weight"],
    eval_set=(X_val, y_val_log),
    use_best_model=True,
)

0:	learn: 0.6212410	test: 0.6294793	best: 0.6294793 (0)	total: 75.4ms	remaining: 3m 46s
200:	learn: 0.6124970	test: 0.6256353	best: 0.6256353 (200)	total: 7.27s	remaining: 1m 41s
400:	learn: 0.6068888	test: 0.6232199	best: 0.6232199 (400)	total: 13.2s	remaining: 1m 25s
600:	learn: 0.5504404	test: 0.5741543	best: 0.5741543 (600)	total: 24.9s	remaining: 1m 39s
800:	learn: 0.5169816	test: 0.5586688	best: 0.5586688 (800)	total: 37s	remaining: 1m 41s
1000:	learn: 0.4936390	test: 0.5522101	best: 0.5522101 (1000)	total: 48.6s	remaining: 1m 36s
1200:	learn: 0.4748275	test: 0.5471919	best: 0.5471919 (1200)	total: 1m 2s	remaining: 1m 33s
1400:	learn: 0.4598728	test: 0.5439954	best: 0.5439906 (1399)	total: 1m 14s	remaining: 1m 25s
1600:	learn: 0.4457386	test: 0.5414402	best: 0.5414402 (1600)	total: 1m 28s	remaining: 1m 17s
1800:	learn: 0.4324409	test: 0.5397817	best: 0.5397817 (1800)	total: 1m 41s	remaining: 1m 7s
2000:	learn: 0.4214954	test: 0.5384222	best: 0.5384222 (2000)	total: 1m 54s	remaini

<catboost.core.CatBoostRegressor at 0x141dfec10>