### Importing Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.metrics import f1_score, precision_recall_fscore_support, recall_score, accuracy_score

### Loading Data

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# Fetch the dataset registered under id 2 in the UCI ML repository
adult = fetch_ucirepo(id=2)

# Feature and target tables (pandas DataFrames), plus one frame holding both side by side
dataset_tables = adult.data
X, y = dataset_tables.features, dataset_tables.targets
data = pd.concat([X, y], axis="columns")

In [3]:
# Inspect the `variables` attribute of the fetched dataset object
# (presumably per-variable metadata -- confirm against the ucimlrepo docs)
adult.variables

In [4]:
# (rows, columns) of the combined features + target frame
data.shape

### EDA

In [5]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows have missing values
data.info()

In [6]:
# NOTE(review): hard-coded numbers, presumably the total row count minus one column's
# non-null count as printed by data.info() above -- confirm, and consider data.isna().sum() instead
48842  - 48568     

#### Exploring income encoding

In [7]:
# The classes are imbalanced and the income column needs preprocessing.
# Consider using the F1 score instead of accuracy.
income_sizes = y.groupby("income").size()
income_sizes.to_frame("count")

#### Exploring missing and unknown values

In [8]:
# Everyone with a missing workclass also has a missing occupation
workclass_missing = data["workclass"].isna()
data.loc[workclass_missing, "occupation"].unique()

In [9]:
# People with a missing occupation either had an unknown occupation or never worked
occupation_missing = data["occupation"].isna()
data.loc[occupation_missing, "workclass"].unique()

In [10]:
# List the distinct workclass values to see how the "?" value appears among them
data["workclass"].unique()

In [11]:
# Unknown workclass, occupation and native-country are marked with "?"
workclass_is_placeholder = data["workclass"].eq("?")
data.loc[workclass_is_placeholder]

In [12]:
# Which columns contain the "?" placeholder
for col in data.columns:
    # Compare through pandas rather than `"?" in np.array(...)`: on numeric columns the
    # NumPy membership test compares a string against numbers element-wise, which is
    # fragile and emits warnings. Series.eq simply yields False for non-matching values
    # (including NaN and numbers).
    if data[col].eq("?").any():
        print(col)

#### Exploring how many rows are affected if missing / ? values are dropped

In [13]:
# Rows whose workclass is either the "?" placeholder or NaN
workclass_unknown = data["workclass"].eq("?") | data["workclass"].isna()
data[workclass_unknown]

In [14]:
# Rows whose occupation is either the "?" placeholder or NaN
occupation_unknown = data["occupation"].eq("?") | data["occupation"].isna()
data[occupation_unknown]

In [15]:
# Rows whose native-country is either the "?" placeholder or NaN
country_unknown = data["native-country"].eq("?") | data["native-country"].isna()
data[country_unknown]

In [16]:
# Occupation values seen on rows where workclass is "?"
data.loc[data["workclass"] == "?", "occupation"].unique()

In [17]:
# Native-country values seen on rows where workclass is "?"
data.loc[data["workclass"] == "?", "native-country"].unique()

In [18]:
# Workclass values seen on rows where occupation is "?"
data.loc[data["occupation"] == "?", "workclass"].unique()

In [19]:
# Native-country values seen on rows where occupation is "?"
data.loc[data["occupation"] == "?", "native-country"].unique()

In [20]:
# Workclass values seen on rows where native-country is "?"
data.loc[data["native-country"] == "?", "workclass"].unique()

In [21]:
# Occupation values seen on rows where native-country is "?"
data.loc[data["native-country"] == "?", "occupation"].unique()

In [22]:
# Dropping rows with a "?" placeholder or a NaN doesn't impact much
placeholder_cols = ["workclass", "occupation", "native-country"]
has_placeholder = data[placeholder_cols].eq("?").any(axis=1)
no_missing = data.loc[~has_placeholder].dropna()

no_missing

#### Exploring possibly correlated features

In [23]:
# education-num can stand in for education: count rows for every (education, education-num) pair
pair_counts = data.groupby(["education", "education-num"])["age"].count()
pair_counts.to_frame()

#### Exploring the number of classes in each categorical variable and their distributions

In [24]:
# Number of classes in each categorical variable (NaN counts as its own class)
non_categorical = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss",
                   "hours-per-week", "income"]
categorical = data.drop(columns = non_categorical)
categorical_count = categorical.nunique(dropna = False).to_dict()

categorical_count

In [25]:
# Per-class row counts for one feature of the cleaned data
def get_distr(col):
    """Count rows per class of `col` in the notebook-level `no_missing` frame.

    Returns a DataFrame indexed by the classes of `col` with a single "age"
    column holding the number of non-null `age` entries per class, sorted in
    ascending order of that count.
    """
    class_counts = no_missing.groupby(col)["age"].count().to_frame()
    return class_counts.sort_values(by = "age")

In [26]:
# Row counts per workclass in the cleaned data, smallest class first
get_distr("workclass")

In [35]:
# Row counts per education level in the cleaned data, smallest class first
get_distr("education")

In [28]:
# Row counts per marital-status in the cleaned data, smallest class first
get_distr("marital-status")

In [29]:
# Row counts per occupation in the cleaned data, smallest class first
get_distr("occupation")

In [30]:
# Row counts per relationship in the cleaned data, smallest class first
get_distr("relationship")

In [31]:
# Row counts per race in the cleaned data, smallest class first
get_distr("race")

In [32]:
# Row counts per sex in the cleaned data, smallest class first
get_distr("sex")

In [33]:
# Row counts per native-country in the cleaned data, smallest class first
get_distr("native-country")

In [34]:
def plotting(data, col, N = None):
    """Draw side-by-side bars of row counts per class of `col`, split by income.

    Parameters
    ----------
    data : DataFrame
        Must contain `col`, an "age" column (its non-null entries are counted)
        and an "income" column encoded as 1 (over $50k) / 0 (under $50k).
    col : str
        Column whose classes are placed on the x-axis.
    N : int, optional
        Ignored. Kept only so existing calls that pass the number of classes
        keep working; the number of bar groups is taken from the data.
    """
    width = 0.3
    over_counts = data[data["income"] == 1].groupby(col)["age"].count()
    under_counts = data[data["income"] == 0].groupby(col)["age"].count()

    # A class can occur in only one income group. Put both groups on the same
    # class list (0 where a class is absent) so the two bar series have equal
    # length and every bar sits above its own tick label.
    classes = under_counts.index.union(over_counts.index)
    under_counts = under_counts.reindex(classes, fill_value = 0)
    over_counts = over_counts.reindex(classes, fill_value = 0)

    idx = np.arange(len(classes))
    plt.bar(idx, under_counts, width, label = "Under $50k")
    plt.bar(idx + width, over_counts, width, label = "Over $50k")
    plt.xticks(idx + width / 2, classes)
    plt.xlabel(col)
    plt.ylabel("count")
    plt.legend()
    plt.show()

# `preproc_data` is only created in the Preprocessing section further down, so using it
# here fails on a fresh kernel. Encode income locally instead (1 = over $50k, 0 = otherwise).
income_encoded = no_missing.assign(income = no_missing["income"].str.startswith(">").astype(int))
plotting(income_encoded, "income")

#### Exploring capital gain and capital loss

In [None]:
# Summary statistics of capital-gain in the cleaned data
no_missing["capital-gain"].describe()

In [None]:
# Summary statistics of capital-loss in the cleaned data
no_missing["capital-loss"].describe()

### Preprocessing

1. Either replace NaN with `?` so that `?` represents a single "unknown" category, or drop every row that contains a missing or `?` value
2. Preprocess income into a binary value
3. Drop the education column, since education-num already encodes it (fnlwgt is dropped as well)

In [None]:
# Encode the target: 1 if income is over 50k, 0 otherwise.
# The labels are taken from `no_missing` itself, so no index alignment against the
# unfiltered `data` frame is needed. A leading ">" marks the over-50k class whether or
# not the label carries a trailing ".".
preproc_data = no_missing.assign(income = no_missing["income"].str.startswith(">").astype(int))
# Drop `education` (education-num encodes it) and `fnlwgt`
preproc_data = preproc_data.drop(columns = ["education", "fnlwgt"])

In [None]:
# Non-null count of every column within each encoded income class
preproc_data.groupby("income").count()

### Class Imbalance

In [None]:
# Bar chart of the number of rows in each encoded income class, saved next to the notebook
income_ax = sns.countplot(x = preproc_data["income"])
income_ax.set_title("Income Distribution")
income_ax.figure.savefig('income_distr.png')

### Pair Plot

In [None]:
# Numeric features plus the target, which is used only to colour the points
pairplot_cols = ["age", "education-num", "capital-gain", "capital-loss", "hours-per-week", "income"]
numeric = preproc_data.loc[:, pairplot_cols]

sns.pairplot(data = numeric, hue = "income")
plt.savefig('features.png')

### Train Test Validation Split

In [None]:
# Separate the encoded target from the feature columns
y = preproc_data["income"]
X = preproc_data.drop("income", axis = "columns")

In [None]:
# Fractions of the data used for training, validation and testing
train = 0.6
val = 0.2
test = 0.2
assert np.isclose(train + val + test, 1.0), "split fractions must sum to 1"
# Once the test rows are removed, the validation set must take val / (1 - test)
# of what remains in order to end up with `val` of the full data.
new_val = val / (1 - test)
# The income classes are imbalanced, so stratify both splits to keep the class
# ratio the same in the train, validation and test sets.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size = test, random_state = 42, stratify = y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size = new_val, random_state = 42, stratify = y_trainval
)

### Baseline Model - KNN

In [None]:
# Feature preprocessing for the baseline model; columns not listed here are dropped.
preproc = ColumnTransformer(
    transformers = [
        # One-hot encode the categorical features; classes unseen during fit become all-zero rows
        ("categorical", OneHotEncoder(handle_unknown='ignore', sparse_output = False), ["race", "sex", "workclass"]),
        # KNN is distance-based: standardise the numeric features so that wide-ranged
        # columns do not dominate the 0/1 one-hot columns in the distance computation
        ("numerical", StandardScaler(), ["age", "hours-per-week", "education-num"])
    ],
    remainder = "drop"
)

In [None]:
# Baseline pipeline: feature preprocessing followed by a KNeighborsClassifier with default settings
baseline_steps = [("preproc", preproc), ("model", KNeighborsClassifier())]
pl = Pipeline(steps = baseline_steps)

pl.fit(X_train, y_train)

In [None]:
# Recall of the baseline on the validation set
recall_score(y_true = y_val, y_pred = pl.predict(X_val))

In [None]:
# F1 score of the baseline on the validation set
f1_score(y_true = y_val, y_pred = pl.predict(X_val))

In [None]:
# Accuracy of the baseline on the held-out test set.
# Arguments follow the (y_true, y_pred) order used by the other metric cells.
accuracy_score(y_test, pl.predict(X_test))