In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer

In [None]:
# Read the feature and label tables, using the first CSV column as the index
# in both so they can be aligned on it afterwards.
features = pd.read_csv("training_set_features.csv", index_col=0)
labels = pd.read_csv("training_set_labels.csv", index_col=0)

In [None]:
# Peek at the first rows of the label table.
labels.head(n=5)

In [None]:
# Index-aligned left join: every feature row is kept and gains the label columns.
df = features.join(labels, how="left")

In [None]:
# Show each column's dtype; the cells below split the columns by dtype.
df.dtypes

# Determine columns with missing values

In [None]:
# Number of missing entries per column.
df.isna().sum(axis=0)

# Split off numerical and categorical data to be processed separately

In [None]:
# Keep only the float64 columns as the numeric part of the data.
df_num = df.select_dtypes(include="float64")

In [None]:
# Keep only the object-dtype columns as the categorical part of the data.
df_cat = df.select_dtypes(include="object")

# Manually map ordinal data to ensure order is correct

In [None]:
# Explicit label -> rank tables, so the ordinal order is the intended one
# rather than whatever an automatic encoder would derive from the label text.
ordinal_maps = {
    'age_group': {
        '18 - 34 Years': 0,
        '35 - 44 Years': 1,
        '45 - 54 Years': 2,
        '55 - 64 Years': 3,
        '65+ Years': 4,
    },
    'education': {
        '< 12 Years': 0,
        '12 Years': 1,
        'Some College': 2,
        'College Graduate': 3,
    },
    'income_poverty': {
        'Below Poverty': 0,
        '<= $75,000, Above Poverty': 1,
        '> $75,000': 2,
    },
}

for ordinal_col, mapper in ordinal_maps.items():
    # A numeric column means this cell already ran; skip it so that a re-run
    # does not map the encoded ranks to NaN.
    if pd.api.types.is_numeric_dtype(df_cat[ordinal_col]):
        continue

    # Fail loudly on labels the table does not know about instead of silently
    # turning them into missing values.
    unexpected = set(df_cat[ordinal_col].dropna().unique()) - set(mapper)
    if unexpected:
        raise ValueError(
            f"Unexpected values in '{ordinal_col}': {sorted(map(str, unexpected))}"
        )

    # Series.map yields a numeric column directly (missing entries stay NaN).
    # The previous Series.replace call only became numeric through implicit
    # downcasting of the object column, which pandas has deprecated; without
    # it the column stays object and the dtype-based selection of ordinal
    # columns further down no longer picks it up.
    df_cat[ordinal_col] = df_cat[ordinal_col].map(mapper)

In [None]:
# Columns of df_cat that are numeric by now (float64 or int64) are the ordinal ones.
ordinal_dtypes = ["float64", "int64"]
df_ord = df_cat.select_dtypes(include=ordinal_dtypes)

# Merge ordinal and numeric data and impute missing values by KNN

In [None]:
# Attach the ordinal columns to the numeric frame. Any copies attached by an
# earlier run of this cell are dropped first: a plain join raises on
# overlapping column names, which made the cell fail when executed twice.
df_num = df_num.drop(columns=df_ord.columns, errors='ignore').join(df_ord)

In [None]:
# Fill the gaps in the combined numeric/ordinal frame with a KNN imputer
# (default settings) and write the result back into the existing frame so the
# index and column labels are kept.
imp_knn = KNNImputer()
knn_filled = imp_knn.fit_transform(df_num)
df_num[:] = knn_filled

# Process nominal data

In [None]:
# Whatever is still object-typed in df_cat is treated as nominal data.
df_nom = df_cat.select_dtypes(include="object")

### These columns are dropped as they have too many missing values. Revisit if needed

In [None]:
# Remove the two employment columns. Rebinding instead of inplace=True, and
# tolerating columns that are already gone, keeps the cell safe to re-run
# (the in-place drop raised KeyError on a second execution).
df_nom = df_nom.drop(
    columns=['employment_industry', 'employment_occupation'],
    errors='ignore',
)

In [None]:
# Missing entries per nominal column before imputation.
df_nom.isna().sum(axis=0)

# Impute missing nominal data by replacing with the most frequent value

In [None]:
# Imputer that replaces NaN entries with the most frequent value of each column.
imp_mode = SimpleImputer(
    strategy='most_frequent',
    missing_values=np.nan,
)

In [None]:
# Fit the mode imputer on the nominal frame and write the filled values back
# into it, keeping the index and column labels.
nom_filled = imp_mode.fit_transform(df_nom)
df_nom[:] = nom_filled

In [None]:
# Missing entries per nominal column after imputation.
df_nom.isna().sum(axis=0)

In [None]:
# One-hot encode the nominal columns.
df_nom = pd.get_dummies(data=df_nom)

# Rejoin all data and check that missing values have been handled

In [None]:
# Index-aligned left join of the imputed numeric/ordinal frame with the
# one-hot encoded nominal frame.
processed = df_num.join(df_nom, how="left")

In [None]:
# Missing entries per column of the final frame.
processed.isna().sum(axis=0)

In [None]:
# Write the processed frame (index included) to disk.
processed.to_csv(path_or_buf='processed_data.csv')