In [111]:
import pandas as pd
import sklearn.preprocessing as preprocessing
import sklearn.impute as impute

In [112]:
# Read the CSV at ../data/test.csv (path is relative to the notebook's
# working directory) into the DataFrame used by every cell below.
data = pd.read_csv('../data/test.csv')

In [113]:
# Remove the "Id" column so it is not treated as a feature by later steps.
data = data.drop(columns=["Id"])

In [114]:
# One-hot encode the categorical columns with pandas.get_dummies;
# columns that are already numeric pass through unchanged.
data = pd.get_dummies(data)

In [115]:
# Replace missing LotFrontage values with 0; no other column is filled here.
data = data.fillna({'LotFrontage': 0})

In [116]:
# Outlier treatment: clip each int64/float64 column to its own 0.1% and
# 99.9% quantiles. Columns of any other dtype are skipped.
for col in data.columns:
    if data[col].dtype not in ('int64', 'float64'):
        continue
    data[col] = data[col].clip(
        lower=data[col].quantile(0.001),
        upper=data[col].quantile(0.999),
    )
        

In [117]:
# Fill the remaining missing values with a 5-nearest-neighbours imputer.
# The imputer is fitted on the same frame it then transforms, and the
# resulting array is wrapped back into a DataFrame with the original labels.
imputer = impute.KNNImputer(n_neighbors=5).fit(data)
data = pd.DataFrame(
    imputer.transform(data),
    columns=data.columns,
)


In [118]:
# Sanity check: total number of missing cells left in the frame.
print(data.isna().sum().sum())

0


In [119]:
# Standardize every column with StandardScaler, fitted on this same frame,
# then wrap the resulting array back into a DataFrame with the same labels.
scaler = preprocessing.StandardScaler().fit(data)
data = pd.DataFrame(
    scaler.transform(data),
    columns=data.columns,
)

In [120]:
# Display the preprocessed frame as this cell's output.
data

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.901061,0.751741,0.611947,-0.817117,0.426849,-0.371690,-1.073523,-0.638472,0.103955,0.925508,...,-0.045392,-0.295268,-0.052432,0.399722,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
1,-0.901061,0.784529,1.379256,-0.075254,0.426849,-0.474450,-1.215993,0.113506,1.200569,-0.334809,...,-0.045392,-0.295268,-0.052432,0.399722,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
2,0.082497,0.555013,1.252484,-0.817117,-0.582507,0.861424,0.683610,-0.638472,0.882430,-0.334809,...,-0.045392,-0.295268,-0.052432,0.399722,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
3,0.082497,0.686165,0.135026,-0.075254,0.426849,0.895677,0.683610,-0.499217,0.426914,-0.334809,...,-0.045392,-0.295268,-0.052432,0.399722,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
4,1.557833,-0.461413,-1.307632,1.408472,-0.582507,0.690158,0.398669,-0.638472,-0.390123,-0.334809,...,-0.045392,-0.295268,-0.052432,0.399722,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.541391,-1.182748,-1.864475,-1.558981,1.436206,-0.063412,-0.646112,-0.638472,-1.023990,-0.334809,...,-0.045392,-0.295268,-0.052432,0.399722,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365
1455,2.541391,-1.182748,-1.864475,-1.558981,-0.582507,-0.063412,-0.646112,-0.638472,-0.416635,-0.334809,...,-0.045392,-0.295268,-0.052432,0.399722,3.923424,-0.074253,-0.091066,-0.134699,-2.172917,-0.299365
1456,-0.901061,1.571440,2.135252,-0.817117,1.436206,-0.405944,0.588630,-0.638472,1.926021,-0.334809,...,-0.045392,-0.295268,-0.052432,0.399722,3.923424,-0.074253,-0.091066,-0.134699,-2.172917,-0.299365
1457,0.697220,0.161558,0.269341,-0.817117,-0.582507,0.690158,0.398669,-0.638472,-0.211773,-0.334809,...,-0.045392,-0.295268,-0.052432,0.399722,-0.254879,-0.074253,-0.091066,-0.134699,0.460211,-0.299365


In [121]:
#define pipeline for preprocessing as function
def preprocess(data, n_neighbors=5, lower_quantile=0.001, upper_quantile=0.999):
    """Run the notebook's preprocessing steps on a raw feature frame.

    Steps, in order:

    1. One-hot encode with ``pd.get_dummies``.
    2. Drop the ``Id`` column.
    3. Fill missing ``LotFrontage`` values with 0.
    4. Drop columns that contain no observed value at all.
    5. Clip every int64/float64 column to its own
       ``[lower_quantile, upper_quantile]`` range.
    6. Fill the remaining gaps with ``KNNImputer``.
    7. Standardize every column with ``StandardScaler``.

    The caller's frame is not modified; a new frame is returned. The imputer
    and the scaler are created and fitted on ``data`` itself on every call.

    NOTE(review): because everything is fitted per call, running this
    separately on two frames (e.g. a train and a test split) uses different
    statistics for each and may produce different dummy columns -- confirm
    that this is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw features. Must contain the ``Id`` and ``LotFrontage`` columns.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer``.
    lower_quantile, upper_quantile : float, default 0.001 and 0.999
        Per-column quantiles used as clipping bounds.

    Returns
    -------
    pandas.DataFrame
        The transformed features, carrying the same row index as ``data``.

    Raises
    ------
    ValueError
        If the quantiles do not satisfy
        ``0 <= lower_quantile <= upper_quantile <= 1``.
    KeyError
        If ``Id`` or ``LotFrontage`` is missing from ``data``.
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= lower_quantile <= upper_quantile <= 1"
        )

    data = pd.get_dummies(data)
    # drop() returns a new frame, so the column assignments below never
    # touch the object the caller passed in.
    data = data.drop("Id", axis=1)
    data['LotFrontage'] = data['LotFrontage'].fillna(0)

    # KNNImputer discards features that have no observed value by default,
    # which would make its output narrower than data.columns and break the
    # DataFrame rebuild below. Drop such columns explicitly instead.
    data = data.dropna(axis=1, how="all")

    for col in data.columns:
        # Only plain int64/float64 columns are clipped; the dummy columns
        # produced by get_dummies have a different dtype and are skipped.
        if data[col].dtype == 'int64' or data[col].dtype == 'float64':
            data[col] = data[col].clip(
                lower=data[col].quantile(lower_quantile),
                upper=data[col].quantile(upper_quantile),
            )

    # Both transformers return bare arrays; pass index= as well as columns=
    # so the original row labels are kept instead of being reset to 0..n-1.
    imputer = impute.KNNImputer(n_neighbors=n_neighbors)
    data = pd.DataFrame(
        imputer.fit_transform(data), columns=data.columns, index=data.index
    )
    scaler = preprocessing.StandardScaler()
    data = pd.DataFrame(
        scaler.fit_transform(data), columns=data.columns, index=data.index
    )
    return data
