# Fitted imputation of missing values

When working with microdata, particularly at the consumer or firm level, instances of missing values are highly common. This phenomena can be particularly troublesome when trying to construct a balanced panel of entities and require a full-rank feature vector for modelling, since any missing values in any feature will exclude the entity from the panel. This results in unnecessarily decreased sample size.



In [1]:
import pandas as pd
import numpy as np
import seaborn as sns  # used for toy datasets

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")

## load main dataset
df = sns.load_dataset('diamonds')
df = df[['price','carat','color','cut','clarity']]

'''
TODO:
    - Contaminate y var and fill with linear model from carat alone  --> cross-validate
    - ^^ fit intercept --> cross-validate
    - random forest regressor --> cross-validate
    - compare best model to actual prices
'''

def contaminate(vector, share=0.1):
    """
    contaminate one vector at a time with X% NaNs
    """
    cont = np.full(len(vector), False)
    cont[:int(share * len(vector))] = True
    np.random.shuffle(cont)

    removed_values = vector.loc[cont] 
    vector.loc[cont] = np.nan

    return vector, removed_values


def cross_validate(data, model, exog, endog, kfold=None):
    """
    :param data: Pandas df, contains all data
    :param exog: Exog vars
    :param endog: Endog var
    :param kfold: k-Fold object from sklearn
    :param model: model
    :return: Mean absolute error (mae) and std of mae
    """
    if not kfold:
        kfold = KFold(n_splits=5, shuffle=True, random_state=1996)

    errors = []

    for tr, tt in kfold.split(data):
        xtr, ytr = data.iloc[tr][exog], df.iloc[tr][endog]
        xtt, ytt = data.iloc[tt][exog], df.iloc[tt][endog]
        model.fit(xtr, ytr)

        errors += [abs(model.predict(xtt) - ytt)]

    mae = round(float(np.mean(errors)), 2)
    std_err = round(float(np.std(errors)), 2)

    print(f'{mae = }, {std_err = }')

In [2]:
## Basic regression: ln(price) ~ b1*carat + e

y = ['price']
X = ['carat']

pipe = Pipeline([
    ('ols', LinearRegression(fit_intercept=True))
])

cross_validate(data=df, endog=y, exog=X, model=pipe)

ValueError: Unable to coerce to Series, length must be 1: given 0

In [None]:
## Allow prices to be quadratic in carat: ln(price) ~ b0 + b1*carat * b2*carat^2 + e

y = ['price']
X = ['carat']

pipe = Pipeline([
    ('pol', PolynomialFeatures(2)),
    ('ols', LinearRegression(fit_intercept=True))
])

cross_validate(data=df, endog=y, exog=X, model=pipe)

In [12]:
# Regression with dummies for categories: price ~ b0 + b1*carat + b2*carat^2 + b3*VVS1 + ... + e

df_dummies = df.drop(['cut','color','clarity'], axis='columns')\
    .join(pd.get_dummies(df.cut, drop_first=True))\
    .join(pd.get_dummies(df.color, drop_first=True))\
    .join(pd.get_dummies(df.clarity, drop_first=True))

y = ['price']
X = [col for col in df_dummies.columns if col != 'price']

pipe = Pipeline([
    ('pol', PolynomialFeatures(2)),
    ('ols', LinearRegression(fit_intercept=True))
])

cross_validate(data=df_dummies, endog=y, exog=X, model=pipe)

mae = 486.39, std_err = 604.95


In [6]:
# Random forest reg

# df_rf = df.copy()
#
# for col in ['cut', 'clarity', 'color']:
#     df_rf[col] = df_rf[col].cat.codes
#
# y = ['price']
# X = [col for col in df_dummies.columns if col != 'price']
#
# pipe = Pipeline([
#     ('pol', PolynomialFeatures(2)),
#     ('rfr', RandomForestRegressor(n_estimators = 10, max_depth = 10))
# ])
#
# cross_validate(data=df_dummies, endog=y, exog=X, model=pipe)

AttributeError: module 'sys' has no attribute 'which'