In [20]:
import pandas as pd
import sklearn.preprocessing as preprocessing
import sklearn.impute as impute

In [21]:
# Load the test split from CSV (path is relative to the notebook's working directory).
data = pd.read_csv('../data/test.csv')

In [22]:
# Convert categorical data to numerical: one-hot encode the non-numeric columns.
data = pd.get_dummies(data)

In [23]:
# Replace missing LotFrontage values with 0. Doing this before the KNN imputation
# step means this column has no gaps left for the imputer to fill.
data['LotFrontage'] = data['LotFrontage'].fillna(0)

In [24]:
# Fill the remaining missing values with a 5-nearest-neighbours imputer.
# fit_transform returns a plain array, so the frame is rebuilt with the original
# column names; the row index is reset to the default 0..n-1 range.
imputer = impute.KNNImputer(n_neighbors=5)
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)


In [25]:
# Sanity check: total number of missing values across all columns (0 means none are left).
print(data.isnull().sum().sum())

0


In [26]:
# Standardize every column with StandardScaler, fitted on this same frame.
# The scaler returns a plain array, so the column names are re-attached.
scaler = preprocessing.StandardScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

In [27]:
# Expand to all degree-2 polynomial terms without a bias column. Besides the
# pairwise interaction terms this also keeps the original features and adds each
# feature's square (interaction_only is not set).
# NOTE: no column names are passed to the new frame, so its columns are plain integers.
data = pd.DataFrame(preprocessing.PolynomialFeatures(degree=2, include_bias=False).fit_transform(data))

In [28]:
# Display the expanded feature matrix (the notebook truncates the shown rows and columns).
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36845,36846,36847,36848,36849,36850,36851,36852,36853,36854
0,-1.730864,-0.874711,0.684849,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.569632,0.063936,...,0.008293,0.012266,-0.041910,0.027262,0.018144,-0.061990,0.040324,0.211794,-0.137771,0.089619
1,-1.728490,-0.874711,0.715852,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.040544,1.063714,...,0.008293,0.012266,-0.041910,0.027262,0.018144,-0.061990,0.040324,0.211794,-0.137771,0.089619
2,-1.726115,0.061351,0.498831,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.569632,0.773668,...,0.008293,0.012266,-0.041910,0.027262,0.018144,-0.061990,0.040324,0.211794,-0.137771,0.089619
3,-1.723741,0.061351,0.622843,0.032064,-0.054877,0.400766,0.876976,0.678742,-0.456636,0.358376,...,0.008293,0.012266,-0.041910,0.027262,0.018144,-0.061990,0.040324,0.211794,-0.137771,0.089619
4,-1.721367,1.465443,-0.462261,-0.971808,1.337571,-0.497418,0.679475,0.394694,-0.569632,-0.386513,...,0.008293,0.012266,-0.041910,0.027262,0.018144,-0.061990,0.040324,0.211794,-0.137771,0.089619
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,1.721367,2.401505,-1.144325,-1.591330,-1.447325,1.298950,-0.044694,-0.646813,-0.569632,-0.964407,...,0.008293,0.012266,-0.041910,0.027262,0.018144,-0.061990,0.040324,0.211794,-0.137771,0.089619
1455,1.723741,2.401505,-1.144325,-1.599808,-1.447325,-0.497418,-0.044694,-0.646813,-0.569632,-0.410684,...,0.008293,0.012266,0.197879,0.027262,0.018144,0.292689,0.040324,4.721569,0.650494,0.089619
1456,1.726115,-0.874711,3.165085,2.055150,-0.751101,1.298950,-0.373861,0.584059,-0.569632,1.725105,...,0.008293,0.012266,0.197879,0.027262,0.018144,0.292689,0.040324,4.721569,0.650494,0.089619
1457,1.728490,0.646389,0.126795,0.125527,-0.751101,-0.497418,0.679475,0.394694,-0.569632,-0.223912,...,0.008293,0.012266,-0.041910,0.027262,0.018144,-0.061990,0.040324,0.211794,-0.137771,0.089619


In [30]:
#define pipeline for preprocessing as function
def preprocess(data, n_neighbors=5, degree=2):
    """Turn a raw feature frame into a standardized, polynomial-expanded matrix.

    Steps, in order:
      1. One-hot encode the non-numeric columns with ``pd.get_dummies``.
      2. Replace missing ``LotFrontage`` values with 0.
      3. Fill every remaining missing value with a KNN imputer.
      4. Standardize each column with ``StandardScaler``.
      5. Expand with ``PolynomialFeatures`` (no bias column).

    Every transformer is fitted on ``data`` itself. Calling this separately on
    two frames (e.g. train and test) therefore produces independently fitted
    encodings, imputations and scalings, and the one-hot columns may differ
    between the two results.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It must contain a ``LotFrontage`` column. The frame is
        not modified.
    n_neighbors : int, default 5
        Number of neighbours used by the KNN imputer.
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        The expanded features. Columns are labelled with plain integers
        (feature names are not carried through the expansion); rows keep the
        index of ``data``.

    Raises
    ------
    KeyError
        If ``data`` has no ``LotFrontage`` column.
    """
    encoded = pd.get_dummies(data)
    # assign() returns a new frame, so the caller's frame is never written to.
    encoded = encoded.assign(LotFrontage=encoded['LotFrontage'].fillna(0))

    # The sklearn transformers return bare arrays; keep the row labels so the
    # result can still be aligned with the input rows.
    row_index = encoded.index

    # NOTE: with default settings KNNImputer drops columns that are entirely
    # NaN, in which case re-attaching the column names below raises.
    imputer = impute.KNNImputer(n_neighbors=n_neighbors)
    imputed = pd.DataFrame(
        imputer.fit_transform(encoded),
        columns=encoded.columns,
        index=row_index,
    )

    scaler = preprocessing.StandardScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(imputed),
        columns=imputed.columns,
        index=row_index,
    )

    # Without interaction_only this keeps the original features and adds
    # squares as well as pairwise products.
    expander = preprocessing.PolynomialFeatures(degree=degree, include_bias=False)
    return pd.DataFrame(expander.fit_transform(scaled), index=row_index)
