In [8]:
import numpy as np
import pandas as pd
import pandas.api.types as ptypes
from sklearn.preprocessing import OneHotEncoder

In [2]:
# The idea is to create helper functions that either replace the missing values of features or remove entire features.
def remove_feats(df, list_of_feats):
  '''
  Drop a set of columns from a pandas dataframe.
  df: pandas dataframe (left untouched, a new dataframe is returned)
  list_of_feats: list of column names to be removed
  returns: pandas dataframe without the listed columns
  '''
  return df.drop(columns=list_of_feats)

def replace_values(df, list_of_feats):
  '''
  function that replaces NaN values of features from a pandas dataframe with the mean (numerical values) or the mode (categorical values)
  df: pandas dataframe; it is not modified, the replacement is done on a copy
  list_of_feats: list of features where the NaN values should be replaced
  returns: a new pandas dataframe in which the NaN values of the listed features are filled
  '''
  # Work on a copy so the caller's dataframe keeps its original NaN values.
  df = df.copy()
  for feat in list_of_feats:
    col = df[feat]
    if ptypes.is_numeric_dtype(col):
      df[feat] = col.fillna(col.mean())
    elif (ptypes.is_object_dtype(col)
          or ptypes.is_string_dtype(col)
          or isinstance(col.dtype, pd.CategoricalDtype)):
      # isinstance(..., pd.CategoricalDtype) replaces the deprecated ptypes.is_categorical_dtype.
      modes = col.mode()
      if modes.empty:
        # mode() drops NaN, so an all-NaN column has no mode to fill with.
        print(f"No non-NaN value available to compute the mode for feature: {feat}")
      else:
        df[feat] = col.fillna(modes.iloc[0])
    else:
      print(f"Unhandled dtype for feature: {feat}")
  return df

In [3]:
# Import the train and test datasets, as well as the lists of features to remove / replace.
# pd.read_csv already returns a fresh dataframe, so no extra copy is needed.
train_set_df = pd.read_csv(r'train.csv')
test_set_df = pd.read_csv(r'test.csv')


def _read_feature_list(path):
    '''
    Read feature names from a text file that holds one name per line.
    path: path of the text file
    returns: sorted list of unique, non-empty feature names
    '''
    with open(path, "r") as f:
        names = {line.strip() for line in f}
    # A blank line would otherwise yield '' and make df.drop / df[feat] raise a KeyError.
    names.discard("")
    # Sorting makes the order reproducible between runs (set order is not).
    return sorted(names)


feats_to_remove = _read_feature_list("feats_to_remove.txt")
feats_to_replace = _read_feature_list("feats_to_replace.txt")

In [4]:
# Report, for each raw dataset, how many columns contain at least one NaN.
print('Number of features containing nan values:')
for label, frame in (('Training set: ', train_set_df), ('Test set: ', test_set_df)):
    columns_with_nan = frame.isna().any(axis=0)
    print(label, columns_with_nan.sum())

Number of features containing nan values:
Training set:  19
Test set:  33


In [5]:
# Process the datasets: drop the unwanted features first, then fill the NaN values of the listed features.
# NOTE(review): each set is filled with its own mean / mode; consider reusing the
# training-set statistics for the test set — confirm the intent.
train_set_df_without_nan = (
    train_set_df
    .pipe(remove_feats, list_of_feats=feats_to_remove)
    .pipe(replace_values, list_of_feats=feats_to_replace)
)
test_set_df_without_nan = (
    test_set_df
    .pipe(remove_feats, list_of_feats=feats_to_remove)
    .pipe(replace_values, list_of_feats=feats_to_replace)
)

In [6]:
# Report, for each processed dataset, how many columns still contain at least one NaN.
print('Number of features containing nan values after manipulation:')
for label, frame in (('Training set: ', train_set_df_without_nan), ('Test set: ', test_set_df_without_nan)):
    columns_with_nan = frame.isna().any(axis=0)
    print(label, columns_with_nan.sum())

Number of features containing nan values after manipulation:
Training set:  0
Test set:  0


In [13]:
# Inspect the LotShape and MSZoning columns of the processed training set.
train_set_df_without_nan.loc[:, ['LotShape', 'MSZoning']]

Unnamed: 0,LotShape,MSZoning
0,Reg,RL
1,Reg,RL
2,IR1,RL
3,IR1,RL
4,IR1,RL
...,...,...
1455,Reg,RL
1456,Reg,RL
1457,Reg,RL
1458,Reg,RL


In [26]:
def OneHotEncoding(df):
  '''
  Print the unique values of every object-dtype column of a pandas dataframe.
  NOTE: despite its name, this function does not encode anything yet; it only prints.
  df: pandas dataframe
  '''
  for name in df.columns:
    series = df[name]
    # Skip everything that is not stored with the object dtype.
    if not (series.dtype == 'object'):
      continue
    print(series.unique())
# List the unique values of each object-dtype column of the processed training set.
OneHotEncoding(df=train_set_df_without_nan)

['RL' 'RM' 'C (all)' 'FV' 'RH']
['Pave' 'Grvl']
['Reg' 'IR1' 'IR2' 'IR3']
['Lvl' 'Bnk' 'Low' 'HLS']
['AllPub' 'NoSeWa']
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
['Gtl' 'Mod' 'Sev']
['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll'
 'ClyTile']
['VinylSd' 'MetalSd' 'Wd Sdng' 'HdBoard' 'BrkFace' 'WdShing' 'CemntBd'
 'Plywood' 'AsbShng' 'Stucco' 'BrkComm' 'AsphShn' 'Stone' 'ImStucc'
 'CBlock']
['VinylSd' 'MetalSd' 'Wd Shng' 'HdBoard' 'Plywood