In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Read the training table and discard the two columns that are excluded
# from this analysis, in a single chained expression.
df = pd.read_csv("adult_data_train.csv").drop(
    columns=["education", "marital-status"]
)

# Partition the column labels by dtype: numeric versus everything else.
non_num_cols = df.select_dtypes(exclude=[np.number]).columns
num_cols = df.select_dtypes(include=[np.number]).columns

# Count of non-numeric columns.
n_non_numeric = non_num_cols.size

# Normalised frequency of each label value; keep the entry for label 0.
label_shares = df["label"].value_counts(normalize=True)
share_class0 = label_shares[0]

# Target vector, and the numeric-only feature frame ("label" is removed
# from the numeric column list before selecting).
y = df["label"]
X_num = df[num_cols.drop("label")]

# Hold out 20% of the numeric-only data, stratified by the target, with a
# fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_num,
    y,
    test_size=0.2,
    stratify=y,
    random_state=109,
)

# Average of the raw "fnlwgt" column over the training rows.
mean_fnlwgt_train = X_train["fnlwgt"].mean()

# Baseline: k-NN with default settings on the unscaled features.
knn_base = KNeighborsClassifier().fit(X_train, y_train)
y_pred_base = knn_base.predict(X_test)
f1_base = f1_score(y_test, y_pred_base)

# Min-max scaling; the scaler is fitted on the training part only and then
# applied to both parts.
scaler = MinMaxScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Wrap the scaled training array in a frame so "fnlwgt" can be addressed by name.
X_train_s_df = pd.DataFrame(X_train_s, columns=X_train.columns)
mean_fnlwgt_train_scaled = X_train_s_df["fnlwgt"].mean()

# Same default k-NN, now trained and evaluated on the scaled features.
knn_scaled = KNeighborsClassifier().fit(X_train_s, y_train)
y_pred_scaled = knn_scaled.predict(X_test_s)
f1_scaled = f1_score(y_test, y_pred_scaled)

# Column name recorded verbatim for the results summary; nothing in this
# cell computes it.
cat_hist_feature = "native-country"

# A row is flagged when at least one of its cells equals the "?" placeholder.
mask_missing = df.eq("?").any(axis=1)
n_rows_with_missing = int(mask_missing.sum())

# Keep only unflagged rows, then one-hot encode the non-numeric columns
# (the first level of each is dropped).
df_nona = df.loc[~mask_missing].copy()
y_nona = df_nona["label"]
X_nona = pd.get_dummies(df_nona.drop(columns=["label"]), drop_first=True)
n_features_after_dummies = len(X_nona.columns)

# Same split settings as for the numeric-only experiment.
X_train_n, X_test_n, y_train_n, y_test_n = train_test_split(
    X_nona,
    y_nona,
    test_size=0.2,
    stratify=y_nona,
    random_state=109,
)

# Scale with statistics taken from the training part only.
scaler_n = MinMaxScaler().fit(X_train_n)
X_train_n_s = scaler_n.transform(X_train_n)
X_test_n_s = scaler_n.transform(X_test_n)

# Default k-NN on the encoded, scaled features.
knn_nona = KNeighborsClassifier().fit(X_train_n_s, y_train_n)
y_pred_nona = knn_nona.predict(X_test_n_s)
f1_nona = f1_score(y_test_n, y_pred_nona)

# Mode imputation experiment: replace the "?" placeholder by the most
# frequent observed value of the column, then encode, scale and fit k-NN.
df_imp = df.copy()
y_imp = df_imp["label"]

# Split row positions BEFORE computing any imputation statistic. The original
# code took the mode over the whole frame, so rows that later ended up in the
# test set influenced the fill value (data leakage). The seed, test size and
# stratification are unchanged, so the train/test partition is the same as
# splitting the feature matrix directly.
pos_train_i, pos_test_i = train_test_split(
    np.arange(len(df_imp)), test_size=0.2, random_state=109, stratify=y_imp
)

for col in ["workclass", "occupation", "native-country"]:
    # Most frequent non-placeholder value among the training rows only.
    train_values = df_imp[col].iloc[pos_train_i]
    mode = train_values[train_values != "?"].mode()[0]
    # The training-derived fill value is applied to every row, train and test.
    df_imp.loc[df_imp[col] == "?", col] = mode

# One-hot encoding is done on the full imputed frame so that train and test
# share an identical set of dummy columns.
X_imp = pd.get_dummies(df_imp.drop(columns=["label"]), drop_first=True)

X_train_i, X_test_i = X_imp.iloc[pos_train_i], X_imp.iloc[pos_test_i]
y_train_i, y_test_i = y_imp.iloc[pos_train_i], y_imp.iloc[pos_test_i]

# Scale with statistics taken from the training part only.
scaler_i = MinMaxScaler()
X_train_i_s = scaler_i.fit_transform(X_train_i)
X_test_i_s = scaler_i.transform(X_test_i)

# Default k-NN on the imputed, encoded, scaled features.
knn_imp = KNeighborsClassifier()
knn_imp.fit(X_train_i_s, y_train_i)
f1_imp = f1_score(y_test_i, knn_imp.predict(X_test_i_s))

# Summary of every quantity computed above, rounded to three decimals.
# `round()` on a NumPy scalar returns another NumPy scalar, which shows up in
# the displayed dict as `np.float64(...)`; casting to the built-in float keeps
# the same value but gives a clean, plainly serialisable result.
results = {
    "n_non_numeric": n_non_numeric,
    "share_class0": float(round(share_class0, 3)),
    "mean_fnlwgt_train": float(round(mean_fnlwgt_train, 3)),
    "f1_base": float(round(f1_base, 3)),
    "mean_fnlwgt_train_scaled": float(round(mean_fnlwgt_train_scaled, 3)),
    "f1_scaled": float(round(f1_scaled, 3)),
    "cat_hist_feature": cat_hist_feature,
    "n_rows_with_missing": n_rows_with_missing,
    "n_features_after_dummies": n_features_after_dummies,
    "f1_nona": float(round(f1_nona, 3)),
    "f1_imp": float(round(f1_imp, 3)),
}
# Bare last expression so the notebook renders the dict as the cell output.
results


{'n_non_numeric': 6,
 'share_class0': np.float64(0.759),
 'mean_fnlwgt_train': np.float64(189600.243),
 'f1_base': 0.373,
 'mean_fnlwgt_train_scaled': np.float64(0.12),
 'f1_scaled': 0.516,
 'cat_hist_feature': 'native-country',
 'n_rows_with_missing': 1914,
 'n_features_after_dummies': 75,
 'f1_nona': 0.606,
 'f1_imp': 0.626}