In [95]:
import numpy as np
import pandas as pd
import pandas.api.types as ptypes

In [96]:
# Helper functions that either remove entire features or replace the missing (NaN) values of features.
def remove_feats(df, list_of_feats):
    """
    Drop the given feature columns from a pandas dataframe.

    df: pandas dataframe
    list_of_feats: list of column labels to be removed
    returns: a new dataframe without those columns (the dataframe passed in is left as it was)
    """
    trimmed = df.drop(columns=list_of_feats)
    return trimmed

def replace_values(df, list_of_feats):
    """
    Replace the NaN values of the given features of a pandas dataframe.

    Numeric features are filled with their mean; object, string and categorical
    features are filled with their mode (the first one when several values tie).
    Features of any other dtype are reported on stdout and left untouched, and so
    are non-numeric features without a single non-NaN value (they have no mode).

    df: pandas dataframe; the listed columns are overwritten on this object
    list_of_feats: list of features where the NaN values should be replaced
    returns: the same dataframe object that was passed in
    raises: KeyError if an entry of list_of_feats is not a column of df
    """
    for feat in list_of_feats:
        column = df[feat]
        if ptypes.is_numeric_dtype(column):
            fill_value = column.mean()
        elif (
            ptypes.is_object_dtype(column)
            or ptypes.is_string_dtype(column)
            # isinstance check instead of the deprecated ptypes.is_categorical_dtype
            or isinstance(column.dtype, ptypes.CategoricalDtype)
        ):
            modes = column.mode()
            if modes.empty:
                # mode() ignores NaN, so an all-NaN column yields nothing to fill with.
                print(f"No non-NaN value to fill with for feature: {feat}")
                continue
            fill_value = modes.iloc[0]
        else:
            print(f"Unhandled dtype for feature: {feat}")
            continue
        # Assign the filled column back explicitly: calling fillna(inplace=True) on
        # df[feat] is a chained assignment that warns on recent pandas versions and
        # does not update df at all when Copy-on-Write is enabled.
        df[feat] = column.fillna(fill_value)
    return df

In [114]:
# Import the train and test datasets, as well as the list of features to remove / replace
train_set_df = pd.read_csv(r'train.csv')
test_set_df = pd.read_csv(r'test.csv')


def _read_feature_list(path):
    """
    Read a text file that holds one feature name per line.

    Surrounding whitespace is stripped, blank lines are skipped (an empty name
    would not match any column) and duplicates are dropped while keeping the
    order of first appearance, so the resulting list is deterministic.

    path: path of the text file
    returns: list of unique feature names
    """
    with open(path, "r") as f:
        names = (line.strip() for line in f)
        return list(dict.fromkeys(name for name in names if name))


feats_to_remove = _read_feature_list("feats_to_remove.txt")
feats_to_replace = _read_feature_list("feats_to_replace.txt")

In [115]:
# Count, for each dataset, the features (columns) that hold at least one NaN value.
print('Number of features containing nan values:')
train_has_nan = train_set_df.isna().any(axis=0)
print('Training set: ', train_has_nan.sum())
test_has_nan = test_set_df.isna().any(axis=0)
print('Test set: ', test_has_nan.sum())

Number of features containing nan values:
Training set:  19
Test set:  33


In [116]:
# Process the datasets: first drop the unwanted features, then fill the NaN values
# of the features listed for replacement.
# NOTE(review): each dataset is filled with statistics computed on that same
# dataset, so the test set uses its own means/modes - confirm this is intended.
train_set_df_without_nan = (
    train_set_df
    .pipe(remove_feats, list_of_feats=feats_to_remove)
    .pipe(replace_values, list_of_feats=feats_to_replace)
)
test_set_df_without_nan = (
    test_set_df
    .pipe(remove_feats, list_of_feats=feats_to_remove)
    .pipe(replace_values, list_of_feats=feats_to_replace)
)

In [117]:
# Repeat the count on the processed datasets to confirm how many features still hold NaN values.
print('Number of features containing nan values after manipulation:')
train_still_nan = train_set_df_without_nan.isna().any(axis=0)
print('Training set: ', train_still_nan.sum())
test_still_nan = test_set_df_without_nan.isna().any(axis=0)
print('Test set: ', test_still_nan.sum())

Number of features containing nan values after manipulation:
Training set:  0
Test set:  0
