In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Cleaning and imputation

Missing data distorts distributions, gives a less representative view of the data, and can lead to incorrect conclusions.

### Numerical data

Strategies for addressing missing data:
* Drop missing values (when they make up <5% of the total)
* Impute the mean, median, or mode (the choice depends on distribution and context). The mode, like the mean and median, is a measure of central tendency
* Impute by sub-group

In [None]:
# Count the missing (NaN) values in every column
df.isnull().sum()

In [None]:
# step 1: For every column whose missing-value count is at or below the
# threshold (5% of all rows), drop the rows holding those missing values.
# Note: despite its name, cols_to_drop lists the columns whose incomplete
# ROWS are removed; the columns themselves stay in the frame.
threshold = 0.05 * len(df)
cols_to_drop = df.columns[df.isnull().sum().le(threshold)]
df.dropna(subset=cols_to_drop, inplace=True)

In [None]:
# step 2: Find the remaining columns with missing values and impute them
# with a summary statistic
cols_with_missing_values = df.columns[df.isna().sum() > 0]

# This example uses the mode (most frequent value) to fill missing values.
# The slice [:-1] skips the last column in the list.
# NOTE(review): presumably the last column is left for the sub-group
# imputation in step 3 - confirm, or drop the slice to impute every column.
for col in cols_with_missing_values[:-1]:
    modes = df[col].mode()
    if modes.empty:
        # Column is entirely NaN: there is no mode to impute with
        continue
    # fillna() returns a new Series, so the result must be assigned back
    df[col] = df[col].fillna(modes.iloc[0])

In [None]:
# interim step: look at how often each value of a column occurs, to help
# decide whether the whole column can be removed instead of imputed
df["columnname"].value_counts()

In [None]:
# Count the missing (NaN) values still present in every column
df.isnull().sum()

In [None]:
# Step 3: Impute by sub-group - fill each missing 'columnname2' value with
# the median of 'columnname2' within that row's 'columname1' group.
# df_dict maps every 'columname1' value to its group median.
df_dict = df.groupby('columname1')['columnname2'].median().to_dict()

# map() looks up each row's 'columname1' value (the key) in df_dict and
# returns the matching group median; fillna() then uses that value only
# where 'columnname2' is NaN.
df['columnname2'] = df['columnname2'].fillna(df['columname1'].map(df_dict))

In [None]:
# Count the missing (NaN) values in every column once more
df.isnull().sum()

## Categorical data

Previewing only categorical data:

In [None]:
# Keep only the object (string/categorical) columns and show the first rows.
# The dtype name is "object"; "objects" is not a valid dtype and raises.
df.select_dtypes("object").head()

Examine the unique values of a column and how frequently each one occurs

In [None]:
# Distinct values that occur in the column
df["columnname"].unique()

In [None]:
# Number of rows per distinct value of the column
df["columnname"].value_counts()

In [None]:
# Print, for every object column, the number of distinct values and how
# often each value occurs
non_numeric = df.select_dtypes("object")
for col in non_numeric.columns:
    # nunique() already returns the count as an int, so no len() is needed
    print(f"Number of unique values in {col} column: ", non_numeric[col].nunique())
    # frequency of each distinct value within the column
    print(f"Value counts in {col} column: ", non_numeric[col].value_counts())

A dataset can contain various names that describe a similar category. These names should be cleaned up and mapped to the desired category names.

Step 1: Make a list of new categories

In [None]:
# Target category labels; the order must line up with the order of the
# conditions they are paired with
new_category = [
    "cat1",
    "cat2",
    "cat3",
]

Step 2: Define the phrases to search for as strings

In [None]:
# Regex patterns used to recognise each category. '|' means OR and must not
# be surrounded by spaces: inside a pattern a space is a literal character
# that would have to be present in the text for the alternative to match.
# NOTE(review): the patterns overlap (description3 is also an alternative of
# description1 and description2), so when they are evaluated in order the
# first matching one decides the category - confirm the intended order.
description1 = "var_interest_1|var_interest_2"
description2 = "var_interest_1|var_interest_2|var_interest_3"
description3 = "var_interest_1"

Step 3: Create conditions
* Use the search method `str.contains` for specific strings
* Store those in a variable

In [None]:
# Search for a specific string: returns a boolean Series that is True where
# the text in 'column' contains the pattern
df['column'].str.contains('var_interest')

In [None]:
# Search for any of several strings: '|' is the regex OR operator. Do not put
# spaces around it, because a space in the pattern must match literally.
df['column'].str.contains('var_interest_1|var_interest_2')

In [None]:
# Store the conditions in a flat list: one boolean Series per category, in
# the same order as the category labels they will be paired with.
# na=False makes rows with missing text evaluate to False, so every entry is
# a purely boolean mask.
conditions = [
    df['column'].str.contains(description1, na=False),
    df['column'].str.contains(description2, na=False),
    df['column'].str.contains(description3, na=False),
]

step 4: Create a new category column

In [None]:
# New column with the conditions applied: each row gets the label of the
# first condition that is True, or "Other" when none of them match
df['new_columname'] = np.select(conditions, new_category, default="Other")

step 5: Verify the variety of values in a column

In [None]:
# Bar chart with the number of rows for each distinct value of 'column'
sns.countplot(data=df, x="column")