In [19]:
import pandas as pd

X_train = pd.read_csv("../data/raw/train_features.csv")
X_test  = pd.read_csv("../data/raw/test_features.csv")
y_train = pd.read_csv("../data/raw/train_labels.csv")

In [3]:
# identify categorical columns
cat_cols = X_train.select_dtypes(include="object").columns.tolist()

# cardinality (unique values count)
cardinality = X_train[cat_cols].nunique().sort_values(ascending=False)

cardinality.head(15)

sector1d             17
water_source          8
educ_max              7
sanitation_source     6
dweltyp               5
male                  2
consumed3400          2
consumed3300          2
consumed3200          2
consumed3100          2
consumed3000          2
consumed2800          2
consumed2900          2
consumed3600          2
consumed2700          2
dtype: int64

In [6]:
# safe copy
X_train = X_train.copy()
X_test = X_test.copy()

# drop high-card column
if "sector1d" in X_train.columns:
    X_train.drop(columns=["sector1d"], inplace=True)
    X_test.drop(columns=["sector1d"], inplace=True)

In [10]:
# male column (safe)
if "male" in X_train.columns:

    X_train["male"] = X_train["male"].map({"male": 1, "female": 0})
    X_test["male"]  = X_test["male"].map({"male": 1, "female": 0})

    X_train["male"] = X_train["male"].fillna(0).astype(int)
    X_test["male"]  = X_test["male"].fillna(0).astype(int)

In [11]:
cat_cols = X_train.select_dtypes(include="object").columns
cat_cols


Index(['owner', 'water', 'toilet', 'sewer', 'elect', 'water_source',
       'sanitation_source', 'dweltyp', 'employed', 'educ_max', 'any_nonagric',
       'urban', 'consumed100', 'consumed200', 'consumed300', 'consumed400',
       'consumed500', 'consumed600', 'consumed700', 'consumed800',
       'consumed900', 'consumed1000', 'consumed1100', 'consumed1200',
       'consumed1300', 'consumed1400', 'consumed1500', 'consumed1600',
       'consumed1700', 'consumed1800', 'consumed1900', 'consumed2000',
       'consumed2100', 'consumed2200', 'consumed2300', 'consumed2400',
       'consumed2500', 'consumed2600', 'consumed2700', 'consumed2800',
       'consumed2900', 'consumed3000', 'consumed3100', 'consumed3200',
       'consumed3300', 'consumed3400', 'consumed3500', 'consumed3600',
       'consumed3700', 'consumed3800', 'consumed3900', 'consumed4000',
       'consumed4100', 'consumed4200', 'consumed4300', 'consumed4400',
       'consumed4500', 'consumed4600', 'consumed4700', 'consumed4800',


In [12]:
X_train[cat_cols] = X_train[cat_cols].fillna("missing")
X_test[cat_cols]  = X_test[cat_cols].fillna("missing")

In [13]:
X_train["_set"] = "train"
X_test["_set"] = "test"

full = pd.concat([X_train, X_test], axis=0)

In [14]:
full = pd.get_dummies(full, columns=cat_cols, drop_first=True)

In [15]:
X_train = full[full["_set"] == "train"].drop(columns="_set")
X_test  = full[full["_set"] == "test"].drop(columns="_set")

print(X_train.shape)
print(X_test.shape)
print("Columns match:", list(X_train.columns) == list(X_test.columns))

(103023, 158)
(104234, 158)
Columns match: True


In [20]:
# target
y = y_train["cons_ppp17"]

# drop id columns from features
ID_COLS = ["hhid", "survey_id"]

X = X_train.drop(columns=ID_COLS)
X_test_final = X_test.drop(columns=ID_COLS)

print(X.shape)
print(X_test_final.shape)
print(y.shape)

(103023, 86)
(104234, 86)
(104234,)


In [22]:
# target
y = y_train["cons_ppp17"].values

# features (ids already dropped)
X = X

print(X.shape)
print(y.shape)

(0, 86)
(104234,)


In [23]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (103023, 88)
y_train shape: (104234, 3)


In [24]:
common_keys = X_train[["survey_id", "hhid"]]

y_train_aligned = y_train.merge(
    common_keys,
    on=["survey_id", "hhid"],
    how="inner"
)

In [25]:
print(X_train.shape)
print(y_train_aligned.shape)

(103023, 88)
(0, 3)
