In [47]:
import numpy as np
import pandas as pd
import pandas.api.types as ptypes
from sklearn.preprocessing import OneHotEncoder

In [48]:
# Helper functions that either remove entire features or replace the missing values of features.
def remove_feats(df, list_of_feats):
  '''
  Return a new pandas dataframe without the given features (columns).
  The dataframe passed in is not modified.
  df: pandas dataframe
  list_of_feats: list of features (column names) to be removed
  '''
  return df.drop(columns = list_of_feats)

def replace_values(df, list_of_feats):
  '''
  function that replaces NaN values of features from a pandas dataframe with the mean (numerical values) or the mode (categorical values)
  The replacement is done on a copy, so the dataframe passed in is left unchanged.
  df: pandas dataframe
  list_of_feats: list of features where the NaN values should be replaced
  returns: a new pandas dataframe with the NaN values of the listed features replaced
  raises: KeyError if a feature in list_of_feats is not a column of df
  '''
  # Work on a copy so the caller's dataframe is not modified as a side effect.
  df = df.copy()
  for feat in list_of_feats:
    column = df[feat]
    if ptypes.is_numeric_dtype(column):
      df[feat] = column.fillna(column.mean())
    elif (ptypes.is_object_dtype(column)
          or ptypes.is_string_dtype(column)
          or isinstance(column.dtype, pd.CategoricalDtype)):
      # mode() ignores NaN, so it is empty when the column has no valid value at all;
      # indexing its first element would raise in that case.
      most_frequent = column.mode()
      if most_frequent.empty:
        print(f"No non-NaN value available to fill feature: {feat}")
        continue
      df[feat] = column.fillna(most_frequent.iloc[0])
    else:
      print(f"Unhandled dtype for feature: {feat}")
  return df

In [50]:
# Import the train and test datasets, as well as the list of features to remove / replace
train_set_df = pd.read_csv('train.csv')
test_set_df = pd.read_csv('test.csv')

# Each text file holds one feature name per line. Blank lines are skipped so that no
# empty feature name ends up in the lists, duplicates are dropped, and the names are
# sorted so the lists have the same order on every run.
with open("feats_to_remove.txt", "r") as f:
    feats_to_remove = sorted({line.strip() for line in f if line.strip()})

with open("feats_to_replace.txt", "r") as f:
    feats_to_replace = sorted({line.strip() for line in f if line.strip()})
print(train_set_df.shape, test_set_df.shape)

(1460, 81) (1459, 80)


In [4]:
# Report how many features (columns) of each dataset contain at least one NaN value.
print('Number of features containing nan values:')
n_train_feats_with_nan = train_set_df.isna().any().sum()
print('Training set: ', n_train_feats_with_nan)
n_test_feats_with_nan = test_set_df.isna().any().sum()
print('Test set: ', n_test_feats_with_nan)

Number of features containing nan values:
Training set:  19
Test set:  33


In [5]:
# Process both datasets the same way: first drop the unwanted features,
# then replace the NaN values of the remaining listed features.
train_set_df_without_nan = (
    train_set_df
    .pipe(remove_feats, list_of_feats = feats_to_remove)
    .pipe(replace_values, list_of_feats = feats_to_replace)
)
test_set_df_without_nan = (
    test_set_df
    .pipe(remove_feats, list_of_feats = feats_to_remove)
    .pipe(replace_values, list_of_feats = feats_to_replace)
)

In [6]:
# Re-count the features (columns) that still contain NaN values after the processing step.
print('Number of features containing nan values after manipulation:')
for label, frame in (('Training set: ', train_set_df_without_nan),
                     ('Test set: ', test_set_df_without_nan)):
    print(label, frame.isna().any().sum())

Number of features containing nan values after manipulation:
Training set:  0
Test set:  0


The next step is to one-hot encode the categorical features so that the dataset can be used by machine learning algorithms. To account for categories that are present in only one of the training and test sets, the two sets are first combined. One-hot encoding is then applied to the combined data, and afterwards the train and test sets are separated again.

In [45]:
# Tag every row with the dataset it came from, so the rows can be told apart after combining.
for frame, tag in ((train_set_df_without_nan, 'train'), (test_set_df_without_nan, 'test')):
    frame['set'] = tag
print(train_set_df_without_nan.shape, test_set_df_without_nan.shape)

(1460, 75) (1459, 74)


In [51]:
# Stack the train rows on top of the test rows; columns missing from one frame are filled with NaN.
frames_to_stack = [train_set_df_without_nan, test_set_df_without_nan]
all_data = pd.concat(frames_to_stack, axis = 0, join = 'outer')
all_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,set
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,2,2008,WD,Normal,208500.0,train
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,5,2007,WD,Normal,181500.0,train
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,9,2008,WD,Normal,223500.0,train
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,0,2,2006,WD,Abnorml,140000.0,train
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,12,2008,WD,Normal,250000.0,train


In [56]:
# One-hot encode every non-numeric column of the combined data (including the 'set' marker,
# which becomes the 'set_train' / 'set_test' columns).
one_hot_encoded_all_data = pd.get_dummies(all_data, dtype = int)
# Keep only the first occurrence of each column name. The result has to be assigned back,
# otherwise the de-duplicated frame is only displayed and then thrown away.
one_hot_encoded_all_data = one_hot_encoded_all_data.loc[:, ~one_hot_encoded_all_data.columns.duplicated()]
one_hot_encoded_all_data

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,set_test,set_train
0,1,60,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,1,0,0,0,0,1,0,0,1
1,2,20,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,1,0,0,0,0,1,0,0,1
2,3,60,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,1,0,0,0,0,1,0,0,1
3,4,70,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,1,1,0,0,0,0,0,0,1
4,5,60,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,1936,4,7,1970,1970,0.0,0.0,0.0,...,0,1,0,0,0,0,1,0,1,0
1455,2916,160,1894,4,5,1970,1970,0.0,252.0,0.0,...,0,1,1,0,0,0,0,0,1,0
1456,2917,20,20000,5,7,1960,1996,0.0,1224.0,0.0,...,0,1,1,0,0,0,0,0,1,0
1457,2918,85,10441,5,5,1992,1992,0.0,337.0,0.0,...,0,1,0,0,0,0,1,0,1,0


In [81]:
# Split the encoded data back into train and test rows using the one-hot 'set' marker columns.
# The marker columns are dropped from both results; the test set additionally loses 'SalePrice'.
set_marker_cols = ['set_train', 'set_test']
is_train_row = one_hot_encoded_all_data['set_train'] == 1
is_test_row = one_hot_encoded_all_data['set_test'] == 1
train_final_df = one_hot_encoded_all_data.loc[is_train_row].drop(columns = set_marker_cols)
test_final_df = one_hot_encoded_all_data.loc[is_test_row].drop(columns = set_marker_cols + ['SalePrice'])


In [82]:
## Print every feature that takes a single value across all training samples
## (such a feature adds no information and could be removed before model training).
distinct_value_counts = train_final_df.nunique(dropna = False)
for constant_feat in train_final_df.columns[distinct_value_counts == 1]:
    print(constant_feat)

In [95]:
# Export of the preprocessed datasets is currently disabled; uncomment the two lines
# below to write both frames to CSV files without the index column.
# train_final_df.to_csv('train-preprocessed.csv', index = False)
# test_final_df.to_csv('test-preprocessed.csv', index = False)