# Data Cleaning

Argumentation and steps are outlined in exploration. So we do them quickly here

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
df = pd.read_csv("../../data/raw/data-set.csv")

# Removes rows with 3,4 or 5 values missing from income, age, var1, gender, house_type
df = df[df.isna().sum(axis=1)<=2]

df['house_type_bin'] = df['house_type'].map({'OWNER': 1, 'RENTAL': 0})
df['gender_bin'] = df['gender'].map({'m': 1, 'f': 0})

format = "%m/%d/%Y %H:%M"
df['lastVisit_dt'] = pd.to_datetime(df['lastVisit'], format=format)
df['lastVisit_year'] = df['lastVisit_dt'].dt.year
df['lastVisit_month'] = df['lastVisit_dt'].dt.month

min_date = df['lastVisit_dt'].min()
df['lastVisit_days'] = (df['lastVisit_dt'] - min_date).dt.days + 1

df['product02_bin'] = df['product02'].map({'Ja': 1, 'Nee': 0})


To keep in mind:
- first feature experimation: house_type, last_Visit, var1, income
- Resampling needed?
- What would be the best format for lastVisit?
- Cap the lowest income?

In [13]:
df.isna().sum()

subscriber           0
income             226
age                242
var1               243
gender             246
house_type         234
lastVisit            0
product02            0
house_type_bin     234
gender_bin         246
lastVisit_dt         0
lastVisit_year       0
lastVisit_month      0
lastVisit_days       0
product02_bin        0
dtype: int64

## Nan handling


In [14]:
# income, age and var1 seem to be best handled by mean. It's 250/9000 rows so shouldn't impact too much. Deeper investigation later could try other imputations
# For house_type the mode seems better to keep it binary
# Let's add columns as well to see if there have been imputations. Easier to debug later on

In [23]:
for c in ['income', 'age', 'var1']:
    df[f'{c}_imputed'] = df[c].isna()
    df[c] = df[c].fillna(df[c].mean())

In [28]:
c = 'house_type_bin'
df[f'{c}_imputed'] = df[c].isna()
df[c] = df[c].fillna(df[c].mode())

In [29]:
df.isna().sum()

subscriber                  0
income                      0
age                         0
var1                        0
gender                    246
house_type                234
lastVisit                   0
product02                   0
house_type_bin            234
gender_bin                246
lastVisit_dt                0
lastVisit_year              0
lastVisit_month             0
lastVisit_days              0
product02_bin               0
income_imputed              0
age_imputed                 0
var1_imputed                0
house_type_imputed          0
house_type_bin_imputed      0
dtype: int64

In [30]:
df.to_feather("../../data/processed/data-set.ftr")