#L2-2 Part 2: Data Cleaning, Alignment and Feature Engineering

## Setting up

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

## The dataset



In this lab we focus on a synthetic dataset so we can do the data cleaning steps in a reasonable amount of time.

In [None]:
#@title Run this cell to generate the synthetic datasets

%%capture
!pip install names

import names
import string

# Seed NumPy's global RNG so every NumPy draw below is repeatable.
# NOTE(review): names.get_full_name presumably uses its own random source, so
# the generated names may differ between runs -- confirm.
np.random.seed(100)

# Total number of synthetic patient records (both clinics together).
num_total = 100

# One fair coin flip per patient: 1 in female_ones marks a female record,
# male_ones is its complement and doubles as an index into `genders`.
genders = np.array(["female", "male"])
female_ones = np.random.binomial(1, 0.5, (num_total,))
male_ones = 1 - female_ones
# Indexing with a column vector yields a (num_total, 1) array of labels.
d_gender = genders[male_ones[:, None]]
num_females = female_ones.sum()
num_males = num_total - num_females

# Generate a full name matching each record's gender.
names_list = []
for is_male in male_ones.astype('bool'):
  names_list.append(names.get_full_name(gender="male" if is_male else "female"))
d_name = np.array(names_list)

# Ages are drawn from a gender-specific normal distribution and then folded
# back into [lower, upper] by reflecting out-of-range values at the bounds.
lower, upper = 0, 125
mu_female, sigma_female = 41, 20
mu_male, sigma_male = 38, 20
d_age = np.zeros((num_total, 1))
d_age[female_ones.astype('bool')] = \
  np.random.normal(mu_female, sigma_female, (num_females, 1))
d_age[male_ones.astype('bool')] = \
  np.random.normal(mu_male, sigma_male, (num_males, 1))
# Repeat the reflection until no value lies outside the bounds.
while np.any(np.logical_or(d_age < lower, d_age > upper)):
  d_age[d_age < lower] = 2 * lower - d_age[d_age < lower]
  d_age[d_age > upper] = 2 * upper - d_age[d_age > upper]
d_age = d_age.astype('int')
# Six jointly normal raw variables. In column order they later become
# "M1", "Mode", "Q", "DD", "C" and "Diagnosis" (see the column names given to
# df_c1). r_mu holds the means, s the scales (as a column vector) and c a
# symmetric matrix with unit diagonal; the elementwise product s * c * s.T is
# used as the covariance matrix.
r_mu = np.array([0.056, 0, 100, 1.81, 0, 0])
s = np.array([[0.01, 1, 30, 3.16, 1, 1]]).T
c = np.array([[1, 0.28, -0.12, 0, 0.17, 0.31],
                    [0.28, 1, 0.07, 0, -0.24, 0.11],
                    [-0.12, 0.07, 1, 0, 0.04, -0.13],
                    [0, 0, 0, 1, 0, 0.42],
                    [0.17, -0.24, 0.04, 0, 1, 0.17],
                    [0.31, 0.11, -0.13, 0.42, 0.17, 1]])
r_sigma = s * c * s.T
D_b = np.random.multivariate_normal(r_mu, r_sigma, num_total)
# Column 1: round up and shift so the smallest value is 0 (a letter index).
D_b[:, 1] = np.ceil(D_b[:, 1])
D_b[:, 1] -= D_b[:, 1].min()
# Columns 4 and 5: threshold at zero; 1.0 marks a negative raw draw.
D_b[:, 4] = (D_b[:, 4] < 0)
D_b[:, 5] = (D_b[:, 5] < 0)
# String version of the table, with the coded columns replaced by labels.
D_m = D_b.astype('str')
D_m[:, 1] = np.array(list(string.ascii_lowercase))[D_b[:, 1].astype('int')]
D_m[:, 4] = np.array(["true", "false"])[D_b[:, 4].astype('int')]
D_m[:, 5] = np.array(["negative", "positive"])[D_b[:, 5].astype('int')]
# Random examination date for each record, picked from a 366-day range.
date_index_d = np.random.randint(0, 366, (num_total, 1))
date_range = np.arange('2010-08-16', '2011-08-17', dtype='datetime64[D]')
d_date = date_range[date_index_d]
# Full string table: 0 date, 1 name, 2 gender, 3 age, 4-9 the six D_m columns.
data = np.c_[d_date.astype('str'), 
          d_name, 
          d_gender, 
          d_age.astype('str'), 
          D_m]

# Blank out the last column (the label) for 8 distinct random records.
unknowns_d = np.random.choice(np.arange(num_total),
                              size=8,
                              replace=False)
data[unknowns_d, 9] = "_"

# The first 42 records, with all ten columns, become clinic 1's table.
data1 = data[:42, :]

# The remaining records become clinic 2's table: the columns are reordered and
# source column 8 is left out. New column 6 (source column 4) is multiplied
# by 100.
data2 = data[42:, np.array([0, 1, 2, 3, 5, 6, 4, 7, 9])]
data2[:, 6] = (100 * data2[:, 6].astype('float')).astype('str')

# Mark column 5 as missing for 10 records chosen among those with age > 40,
# so this missingness depends on age.
missings_d25 = np.random.choice(np.where(data2[:, 3].astype('float') > 40)[0], 
                                size=10, 
                                replace=False)
data2[missings_d25, 5] = "_"

# Mark column 4 as missing for 5 records chosen uniformly at random.
missings_d24 = np.random.choice(np.arange(data2.shape[0]),
                                size=5,
                                replace=False)
data2[missings_d24, 4] = "_"

# Append exact copies of 3 random records, then shuffle the row order.
repeateds_d2 = np.random.choice(np.arange(data2.shape[0]),
                                size=3,
                                replace=False)
data2 = np.r_[data2, data2[repeateds_d2, :]]
data2 = data2[np.random.permutation(data2.shape[0])]


# Clinic 1: nine feature columns plus the label, sorted by examination date
# and renumbered 0..n-1.
feature_names1 = ["Examination Date", "Name", "Gender", "Age", "M1", "Mode", "Q", "DD", "C"]
label_name1 = "Diagnosis"
column_names1 = feature_names1 + [label_name1]
df_c1 = pd.DataFrame(data1, columns=column_names1)
df_c1 = df_c1.sort_values("Examination Date")
df_c1.index = np.arange(len(df_c1))

# Clinic 2: different column order and no "C" column; rows stay shuffled.
feature_names2 = ["Examination Date", "Name", "Gender", "Age", "Mode", "Q", "M1", "DD"]
label_name2 = "Diagnosis"
column_names2 = feature_names2 + [label_name2]
df_c2 = pd.DataFrame(data2, columns=column_names2)

# Inject data-entry problems into clinic 2's table for the cleaning exercise:
# inconsistent capitalisation, a misspelling, an implausible age (stored as an
# integer among string values) and an upper-case mode code.
df_c2.loc[[3, 44], "Gender"] = "Male"
df_c2.loc[56, "Gender"] = "fmeale"
df_c2.loc[31, "Age"] = 592
df_c2.loc[[31, 32, 33], "Mode"] = "F"

# Number of device inspection records to generate.
num_maintenance = 22

# Random inspection dates drawn from the same date range as the examinations.
date_index_m = np.random.randint(0, 366, (num_maintenance, 1))
m_date = date_range[date_index_m]
# Four uniform readings in [0, 1), each scaled by its own constant.
M_r = np.random.random((num_maintenance, 4))
M_r[:, 0] *= 9.88e-1
M_r[:, 1] *= 1.05e-2
M_r[:, 2] *= 1.21e+1
M_r[:, 3] *= 2.80e-4
# Assign each inspection to one of the two clinics at random.
clinic_names = np.array(["clinic1", "clinic2"])
clinic_indices = np.random.randint(0, 2, (num_maintenance, 1))
M_s = clinic_names[clinic_indices]
# String table: 0 date, 1-4 the four readings, 5 device site.
M = np.c_[m_date.astype('str'), M_r.astype('str'), M_s]
# Blank out the first reading (column 1) in 7 rows; positions 0 and
# num_maintenance - 1 are never chosen.
# NOTE(review): the positions are picked before the date sort below, so the
# chronologically first/last inspections are not guaranteed to keep their
# reading -- confirm whether that was the intent.
missings_m = np.random.choice(np.arange(1, num_maintenance - 1),
                              size=7,
                              replace=False)
M[missings_m, 1] = "_"

column_names3 = ["Inspection Date", "R1", "R2", "R3", "R4", "Device Site"]
df_m = pd.DataFrame(M, columns=column_names3)

# Order the log chronologically and renumber the rows 0..n-1.
df_m = df_m.sort_values("Inspection Date")
df_m.index = np.arange(len(df_m))

In this example we have three datasets. Two of them come from two hypothetical clinics, "clinic1" and "clinic2", which examine patients with a novel device that takes a number of measurements; the final goal is to determine whether a patient has a certain disease. The measurements taken from patients in the two clinics are stored in the dataframes `df_c1` and `df_c2`. The third dataset, `df_m`, is an inspection log for the devices used in "clinic1" and "clinic2", in which a number of device variables are measured. Two of these variables, **R1** and **R3**, are believed to affect the readings taken from the patients (the other two are not relevant). The `df_c1` and `df_c2` datasets are labeled with the actual diagnosis (whether or not the patient had the disease), and the goal is to predict the presence of the disease from the measurements taken from the patients. Since the device variables measured during inspections affect the measurements taken from patients in the clinics, they should also be taken into account. Here are the dataframes:

In [None]:
display(df_c1)

In [None]:
display(df_c2)

In [None]:
display(df_m)

`df_c1` is clean; however, `df_c2` and `df_m` need cleaning and missing-value handling (missing values are indicated by "_") before we can align the datasets and create a single consolidated dataset.

## Cleaning `df_c2`

In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
display(df_c2)

In [None]:
# Column layout of the clinic-2 frame: eight features followed by the label.
label_name_c2 = "Diagnosis"
feature_names_c2 = [
    "Examination Date", "Name", "Gender", "Age",
    "Mode", "Q", "M1", "DD",
]
column_names_c2 = [*feature_names_c2, label_name_c2]

In [None]:
# Rows are examples; every column except the label counts as a feature.
n_c2, m_c2 = df_c2.shape[0], df_c2.shape[1] - 1

print("Number of Examples:", n_c2)
print("Number Features:", m_c2)

In [None]:
# Pairwise scatter plots of every column, coloured by the diagnosis label.
fig = px.scatter_matrix(df_c2, color=label_name_c2, dimensions=column_names_c2)

# One 200-pixel tile per plotted column, with no outer margins.
fig.update_layout(
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
    width=200 * (m_c2 + 1),
    height=200 * (m_c2 + 1),
)

fig.show()

In [None]:
numeric_features_c2 = ["Age", "Q", "M1", "DD"]
non_numeric_features_c2 = ["Examination Date", "Name", "Gender", "Mode"]
for feature in feature_names_c2:
  if feature not in numeric_features_c2:
    # Non-numeric feature: every row goes into the categorical histogram.
    df_non_numeric = df_c2
  else:
    # Numeric feature: rows whose value parses as a number get a histogram
    # with a box plot; the remaining rows fall through to the plot below.
    non_numerical_rows = pd.to_numeric(df_c2[feature], errors='coerce').isnull()
    df_numeric = df_c2[~non_numerical_rows]
    df_non_numeric = df_c2[non_numerical_rows]
    fig = px.histogram(df_numeric,
                       marginal="box",
                       color=label_name_c2,
                       x=feature)
    fig.update_layout(title=feature + ": numeric",
                      margin={"l": 0, "r": 0, "t": 100, "b": 0},
                      height=400)
    fig.show()
  fig = px.histogram(df_non_numeric, color=label_name_c2, x=feature)
  fig.update_layout(title=feature + ": non-numeric",
                    margin={"l": 0, "r": 0, "t": 100, "b": 0},
                    height=400)

  fig.show()

In [None]:
fig = px.histogram(df_c2, x=label_name_c2)
fig.show()

In [None]:
df_c2_clean = df_c2.copy()

In [None]:
df_c2_clean.drop("Name", axis=1, inplace=True)

In [None]:
# Rows whose label is the "_" placeholder cannot be used for supervised
# learning, so remove them.
missing_label_rows = df_c2_clean.index[df_c2_clean[label_name_c2] == "_"]
df_c2_clean.drop(index=missing_label_rows, inplace=True)

In [None]:
# Map the misspelled / inconsistently capitalised labels onto the canonical
# ones. The result is assigned back to the column: calling
# replace(..., inplace=True) on df["col"] acts on a temporary object and does
# not update the frame when pandas Copy-on-Write is active.
df_c2_clean["Gender"] = df_c2_clean["Gender"].replace(
    {"fmeale": "female", "Male": "male"})

df_c2_clean["Mode"] = df_c2_clean["Mode"].replace("F", "f")

In [None]:
## This will cause an error: "Age" still holds strings at this point, so its
## values cannot be compared with the integer 150. The comparison only works
## after the column has been converted to a numeric dtype.
# age_outlier_rows = df_c2_clean[df_c2_clean["Age"] > 150]

In [None]:
df_c2_clean.dtypes

In [None]:
df_c2_clean["Examination Date"] = \
  pd.to_datetime(df_c2_clean["Examination Date"])

In [None]:
# "Gender" becomes a categorical column restricted to the two known labels.
gender_type = pd.CategoricalDtype(["female", "male"])
gender_as_category = df_c2_clean["Gender"].astype(gender_type)
df_c2_clean["Gender"] = gender_as_category

In [None]:
df_c2_clean["Age"] = df_c2_clean["Age"].astype("int64")

In [None]:
# The "_" placeholder is kept as a category of its own for now, so rows with
# a missing mode can still be selected by comparing against "_".
mode_categories = list("abcdefgh") + ["_"]
mode_type = pd.CategoricalDtype(categories=mode_categories)

mode_as_category = df_c2_clean["Mode"].astype(mode_type)
df_c2_clean["Mode"] = mode_as_category

In [None]:
# errors='coerce' turns entries that cannot be parsed as numbers (here the
# "_" placeholders) into NaN instead of raising.
df_c2_clean["Q"] = pd.to_numeric(df_c2_clean["Q"], errors='coerce')

In [None]:
df_c2_clean["M1"] = pd.to_numeric(df_c2_clean["M1"])

In [None]:
df_c2_clean["DD"] = pd.to_numeric(df_c2_clean["DD"])

In [None]:
diagnosis_type = pd.CategoricalDtype(categories=["negative", "positive"])

df_c2_clean["Diagnosis"] = df_c2_clean["Diagnosis"].astype(diagnosis_type)

In [None]:
df_c2_clean.dtypes

In [None]:
# Ages above 150 are treated as data-entry errors and their rows are removed.
age_outlier_rows = df_c2_clean.index[df_c2_clean["Age"] > 150]
df_c2_clean = df_c2_clean.drop(index=age_outlier_rows)

In [None]:
display(df_c2_clean)

In [None]:
duplicate_rows = df_c2_clean.duplicated()

display(duplicate_rows)

In [None]:
df_c2_clean = df_c2_clean[~duplicate_rows]

In [None]:
display(df_c2_clean)

In [None]:
# Renumber the rows 0..n-1 after the deletions above. drop=True discards the
# old labels; without it they would be kept as an extra "index" column that is
# not a feature.
df_c2_clean = df_c2_clean.reset_index(drop=True)

In [None]:
display(df_c2_clean)

## Handling missing values of `df_c2`

### Listwise deletion

In [None]:
q_missing = df_c2_clean["Q"].isnull() #You can also use .isna()

display(q_missing)

In [None]:
# Listwise deletion: keep only the rows where Q is present, then renumber
# them. drop=True prevents the old row labels from being added as a column.
df_temp = df_c2_clean[~q_missing].reset_index(drop=True)

display(df_temp)

### Dropping the feature (variable)

In [None]:
df_temp = df_c2_clean.drop("Q", axis=1)

display(df_temp)

### Imputing with constant

#### Zero

In [None]:
df_temp = df_c2_clean.copy()
df_temp.loc[q_missing, "Q"] = 0

display(df_temp)

#### Mean

In [None]:
df_temp = df_c2_clean.copy()
df_temp.loc[q_missing, "Q"] = df_temp["Q"].mean()

display(df_temp)

#### Median

In [None]:
df_temp = df_c2_clean.copy()
df_temp.loc[q_missing, "Q"] = df_temp["Q"].median()

display(df_temp)

#### Mode

In [None]:
df_temp = df_c2_clean.copy()
df_temp.loc[q_missing, "Q"] = df_temp["Q"].mode(dropna=True)[0]

display(df_temp)

#### Constant category

In [None]:
# Rows whose mode is the "_" placeholder receive the fixed category "e".
mode_missing = df_c2_clean["Mode"] == "_"

df_temp = df_c2_clean.copy()
df_temp.loc[mode_missing, ["Mode"]] = "e"

display(df_temp)

#### Maximum frequency category



In [None]:
# Replace the missing modes with the most frequent value of the column
# (the first one when several values tie).
most_frequent_mode = df_c2_clean["Mode"].mode(dropna=True).iloc[0]

df_temp = df_c2_clean.copy()
df_temp.loc[mode_missing, "Mode"] = most_frequent_mode

display(df_temp)

#### 'Missing' category

In [None]:
df_temp = df_c2_clean.copy()

# Same categories as before, except that the "_" placeholder is replaced by an
# explicit "missing" category.
mode_categories_m = ["a", "b", "c", "d", "e", "f", "g", "h", "missing"]
mode_type_m = pd.CategoricalDtype(categories=mode_categories_m)

# Convert to plain strings, rename the placeholder and cast to the new
# categorical type. The replaced values are assigned back explicitly: calling
# replace(..., inplace=True) on df["col"] does not update the frame when
# pandas Copy-on-Write is active, which would turn every "_" into NaN in the
# cast below.
mode_as_text = df_temp["Mode"].astype("str").replace("_", "missing")
df_temp["Mode"] = mode_as_text.astype(mode_type_m)

display(df_temp)

### Non-constant imputation

Educated guessing and ML-based imputation are not covered in this lab.


#### Depending on another feature

In [None]:
# Plot every other feature split by whether Q is missing, to see whether the
# missingness of Q depends on one of them.
df_temp = df_c2_clean.copy()
df_temp["Q Missing"] = df_temp["Q"].isna()
for feature in ("Gender", "Age", "Mode", "M1", "DD"):
  fig = px.histogram(df_temp, color="Q Missing", x=feature)
  fig.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0}, height=300)
  fig.show()

In [None]:
# Fill the missing Q values with the mean of Q over the patients older than
# 40 (the mean skips the missing entries).
age_greater_than_40 = df_temp["Age"] > 40
q_of_patients_over_40 = df_temp.loc[age_greater_than_40, "Q"]
mean_q_when_age_gt_40 = q_of_patients_over_40.mean()
df_temp.loc[q_missing, "Q"] = mean_q_when_age_gt_40

display(df_temp)

### Adding 'missingness' indicator feature

We add a 'missingness' indicator feature for **Q** which has missing values.

In [None]:
df_temp = df_c2_clean.copy()

# Rename the "_" placeholder to an explicit "missing" category. The replaced
# values are assigned back to the column: calling replace(..., inplace=True)
# on df["col"] does not update the frame when pandas Copy-on-Write is active,
# which would turn every "_" into NaN in the categorical cast.
mode_as_text = df_temp["Mode"].astype("str").replace("_", "missing")
df_temp["Mode"] = mode_as_text.astype(mode_type_m)

# Boolean indicator telling which rows originally had no Q value.
is_q_missing = df_c2_clean["Q"].isnull()
df_temp["Q Missing"] = is_q_missing

# Keep only the modelling columns, with the indicator placed right after Q.
df_temp = df_temp[["Examination Date", "Gender", "Age", "Mode", \
                   "Q", "Q Missing", "M1", "DD", "Diagnosis"]]

display(df_temp)

In [None]:
df_c2_clean = df_temp.copy()

display(df_c2_clean)

## Cleaning `df_m`

In [None]:
display(df_m)

In [None]:
# The inspection log has no label column, so both names refer to one list.
column_names_m = feature_names_m = [
    "Inspection Date",
    "R1", "R2", "R3", "R4",
    "Device Site",
]

In [None]:
# The inspection log has no label column, so every column is a feature.
n_m, m_m = df_m.shape

print("Number of Examples:", n_m)
print("Number Features:", m_m)

In [None]:
# Pairwise scatter plots of all inspection-log columns.
fig = px.scatter_matrix(df_m, dimensions=column_names_m)

# 200 pixels per column plus one spare tile, with no outer margins.
fig.update_layout(
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
    width=200 * (m_m + 1),
    height=200 * (m_m + 1),
)

fig.show()

In [None]:
numeric_features_m = ["R1", "R2", "R3", "R4"]
non_numeric_features_m = ["Inspection Date", "Device Site"]
for feature in feature_names_m:
  if feature not in numeric_features_m:
    # Non-numeric feature: every row goes into the categorical histogram.
    df_non_numeric = df_m
  else:
    # Numeric feature: rows whose value parses as a number get a histogram
    # with a box plot; the remaining rows fall through to the plot below.
    non_numerical_rows = pd.to_numeric(df_m[feature], errors='coerce').isnull()
    df_numeric = df_m[~non_numerical_rows]
    df_non_numeric = df_m[non_numerical_rows]
    fig = px.histogram(df_numeric,
                       marginal="box",
                       x=feature)
    fig.update_layout(title=feature + ": numeric",
                      margin={"l": 0, "r": 0, "t": 100, "b": 0},
                      height=400)
    fig.show()
  fig = px.histogram(df_non_numeric, x=feature)
  fig.update_layout(title=feature + ": non-numeric",
                    margin={"l": 0, "r": 0, "t": 100, "b": 0},
                    height=400)

  fig.show()

In [None]:
df_m_clean = df_m.copy()

In [None]:
df_m_clean.drop(["R2", "R4"], axis=1, inplace=True)

In [None]:
df_m_clean.dtypes

In [None]:
df_m_clean["Inspection Date"] = \
  pd.to_datetime(df_m_clean["Inspection Date"])

In [None]:
# errors='coerce' turns entries that cannot be parsed as numbers (here the
# "_" placeholders) into NaN instead of raising.
df_m_clean["R1"] = pd.to_numeric(df_m_clean["R1"], errors='coerce')

In [None]:
df_m_clean["R3"] = pd.to_numeric(df_m_clean["R3"])

In [None]:
site_type = pd.CategoricalDtype(categories=["clinic1", "clinic2"])

df_m_clean["Device Site"] = df_m_clean["Device Site"].astype(site_type)

In [None]:
df_m_clean.dtypes

In [None]:
duplicate_rows = df_m_clean.duplicated()

display(duplicate_rows.any())

In [None]:
display(df_m_clean)

In [None]:
# Remove the inspection row labelled 9 (row labels follow the date-sorted
# order of df_m).
# NOTE(review): the reason for dropping this particular row is not visible
# here, and the hard-coded label is only meaningful for the seeded synthetic
# data -- confirm, and consider selecting the row by a condition instead.
df_m_clean.drop(9, axis=0, inplace=True)

In [None]:
df_m_clean.reset_index(inplace=True, drop=True)

In [None]:
display(df_m_clean)

## Handling missing values of `df_m`

In [None]:
# One chronologically ordered, freshly numbered inspection log per clinic.
df_m1_clean = (df_m_clean[df_m_clean["Device Site"] == 'clinic1']
               .sort_values(by="Inspection Date")
               .reset_index(drop=True))
df_m2_clean = (df_m_clean[df_m_clean["Device Site"] == 'clinic2']
               .sort_values(by="Inspection Date")
               .reset_index(drop=True))

In [None]:
display(df_m1_clean)

In [None]:
display(df_m2_clean)

### Last Observation Carried Forward (LOCF)

In [None]:
# LOCF: every missing value takes the most recent earlier observation. A
# missing value in the first row has nothing to copy from and stays missing.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df_temp = df_m1_clean.ffill()

display(df_temp)

### Next Observation Carried Backward (NOCB)



In [None]:
# NOCB: every missing value takes the next later observation. A missing value
# in the last row has nothing to copy from and stays missing.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in recent pandas releases.
df_temp = df_m2_clean.bfill()

display(df_temp)

### Linear interpolation



In [None]:
df_temp = df_m1_clean.copy()

df_temp.set_index("Inspection Date", inplace=True)

display(df_temp)

In [None]:
# method='index' interpolates linearly with respect to the index values (the
# inspection dates) rather than the row positions. The result is assigned
# back to the column: interpolate(..., inplace=True) on df["col"] does not
# update the frame when pandas Copy-on-Write is active.
df_temp["R1"] = df_temp["R1"].interpolate(method='index')

df_temp.reset_index(inplace=True)

display(df_temp)

## Inspecting `df_c1`

In [None]:
display(df_c1)

In [None]:
# Columns of the clinic-1 frame: nine features followed by the label.
label_name_c1 = "Diagnosis"
feature_names_c1 = [
    "Examination Date", "Name", "Gender", "Age",
    "Mode", "Q", "M1", "DD", "C",
]
column_names_c1 = [*feature_names_c1, label_name_c1]

In [None]:
# Rows are examples; every column except the label counts as a feature.
n_c1, m_c1 = df_c1.shape[0], df_c1.shape[1] - 1

print("Number of Examples:", n_c1)
print("Number Features:", m_c1)

In [None]:
# Pairwise scatter plots of every column, coloured by the diagnosis label.
fig = px.scatter_matrix(df_c1, color=label_name_c1, dimensions=column_names_c1)

# One 200-pixel tile per plotted column, with no outer margins.
fig.update_layout(
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
    width=200 * (m_c1 + 1),
    height=200 * (m_c1 + 1),
)

fig.show()

In [None]:
numeric_features_c1 = ["Age", "Q", "M1", "DD"]
non_numeric_features_c1 = ["Examination Date", "Name", "Gender", "Mode", "C"]
for feature in feature_names_c1:
  # Numeric features get an extra box plot above the histogram, and a top
  # margin of 100 instead of 0.
  if feature in numeric_features_c1:
    hist_kwargs, top_margin = {"marginal": "box"}, 100
  else:
    hist_kwargs, top_margin = {}, 0
  fig = px.histogram(df_c1, x=feature, color=label_name_c1, **hist_kwargs)
  fig.update_layout(height=300, margin=dict(l=0, r=0, t=top_margin, b=0))
  fig.show()

In [None]:
fig = px.histogram(df_c1, x=label_name_c1)
fig.show()

In [None]:
df_c1_clean = df_c1.copy()

In [None]:
df_c1_clean.drop("Name", axis=1, inplace=True)

In [None]:
# Rows whose label is the "_" placeholder cannot be used for supervised
# learning, so remove them.
missing_label_rows = df_c1_clean.index[df_c1_clean[label_name_c1] == "_"]
df_c1_clean.drop(index=missing_label_rows, inplace=True)

In [None]:
df_c1_clean.dtypes

In [None]:
df_c1_clean["Examination Date"] = \
  pd.to_datetime(df_c1_clean["Examination Date"])

In [None]:
df_c1_clean["Gender"] = df_c1_clean["Gender"].astype(gender_type)

In [None]:
df_c1_clean["Age"] = df_c1_clean["Age"].astype("int64")

In [None]:
df_c1_clean["Mode"] = df_c1_clean["Mode"].astype(mode_type)

In [None]:
df_c1_clean["Q"] = pd.to_numeric(df_c1_clean["Q"])

In [None]:
df_c1_clean["M1"] = pd.to_numeric(df_c1_clean["M1"])

In [None]:
df_c1_clean["DD"] = pd.to_numeric(df_c1_clean["DD"])

In [None]:
# "C" holds the strings "true" and "false". astype("bool") would convert
# every non-empty string -- including "false" -- to True, so translate the
# text explicitly and fail loudly on anything unexpected.
c_as_bool = df_c1_clean["C"].map({"true": True, "false": False})
if c_as_bool.isnull().any():
  raise ValueError('Unexpected value in column "C"; expected "true" or "false".')
df_c1_clean["C"] = c_as_bool.astype("bool")

In [None]:
df_c1_clean["Diagnosis"] = df_c1_clean["Diagnosis"].astype(diagnosis_type)

In [None]:
df_c1_clean.dtypes

In [None]:
display(df_c1_clean)

## Aligning datasets

### Preparation

Feature **M1** in `df_c1` is expressed in a unit 100 times larger than the unit used for **M1** in `df_c2`, so the numeric values in `df_c1` are 100 times smaller than those in `df_c2`. All other features use the same units. Let's multiply the **M1** values in `df_c1` by 100 so that they match:


In [None]:
df_c1_clean["M1"] *= 100

display(df_c1_clean)

In [None]:
# Give all four frames a common "Date" column so they can be aligned on it.
for clinic_frame in (df_c1_clean, df_c2_clean):
  clinic_frame.rename(columns={"Examination Date": "Date"}, inplace=True)
for inspection_frame in (df_m1_clean, df_m2_clean):
  inspection_frame.rename(columns={"Inspection Date": "Date"}, inplace=True)

In [None]:
display(df_c1_clean.columns)
display(df_c2_clean.columns)
display(df_m1_clean.columns)
display(df_m2_clean.columns)

### Aligning

In [None]:
df_cm1 = pd.concat([df_c1_clean, df_m1_clean], axis=0)

display(df_cm1)

In [None]:
df_cm1.sort_values("Date", inplace=True)

display(df_cm1)

In [None]:
# Index by date, then estimate the device readings on examination dates by
# interpolating linearly, with respect to the dates, between the surrounding
# inspections. The results are assigned back to the columns:
# interpolate(..., inplace=True) on df["col"] does not update the frame when
# pandas Copy-on-Write is active.
df_cm1.set_index("Date", inplace=True)
df_cm1["R1"] = df_cm1["R1"].interpolate(method='index')
df_cm1["R3"] = df_cm1["R3"].interpolate(method='index')

display(df_cm1)

In [None]:
# Readings that are still missing get the next available value (NOCB).
# Series.bfill() replaces fillna(method='bfill', inplace=True): the `method`
# argument is deprecated, and the in-place call on df["col"] does not update
# the frame when pandas Copy-on-Write is active.
df_cm1["R1"] = df_cm1["R1"].bfill()
df_cm1["R3"] = df_cm1["R3"].bfill()

display(df_cm1)

In [None]:
# Only inspection rows carry a "Device Site", so keeping the rows where it is
# missing leaves the patient records. Take an explicit copy because new
# columns are added to this frame later; assigning into a filtered selection
# triggers SettingWithCopyWarning.
df_cm1 = df_cm1[df_cm1["Device Site"].isnull()].copy()

display(df_cm1)

In [None]:
# Stack clinic 2's patient records and its inspection log, order them by
# date and index by date.
df_cm2 = pd.concat([df_c2_clean, df_m2_clean], axis=0)

df_cm2.sort_values("Date", inplace=True)

df_cm2.set_index("Date", inplace=True)

# Estimate the device readings on examination dates by interpolating
# linearly, with respect to the dates, between inspections. The results are
# assigned back to the columns: interpolate(..., inplace=True) on df["col"]
# does not update the frame when pandas Copy-on-Write is active.
df_cm2["R1"] = df_cm2["R1"].interpolate(method='index')
df_cm2["R3"] = df_cm2["R3"].interpolate(method='index')

display(df_cm2)

In [None]:
# Readings that are still missing get the next available value (NOCB).
# Series.bfill() replaces fillna(method='bfill', inplace=True): the `method`
# argument is deprecated, and the in-place call on df["col"] does not update
# the frame when pandas Copy-on-Write is active.
df_cm2["R1"] = df_cm2["R1"].bfill()
df_cm2["R3"] = df_cm2["R3"].bfill()

# Only inspection rows carry a "Device Site"; keep the patient records. The
# explicit copy avoids SettingWithCopyWarning when columns are added later.
df_cm2 = df_cm2[df_cm2["Device Site"].isnull()].copy()

display(df_cm2)

In [None]:
# Clinic 1 has every Q and C value, so both missingness flags are False.
for flag_column in ("Q Missing", "C Missing"):
  df_cm1[flag_column] = False

# Clinic 2 never recorded C: fill in a constant and flag it as missing.
df_cm2["C"] = False
df_cm2["C Missing"] = True

In [None]:
df = pd.concat([df_cm1, df_cm2], axis=0)

display(df)

In [None]:
# Order the combined records chronologically and keep the modelling columns
# in a fixed order, with the label last.
df = df.sort_values("Date")[[
    "Gender", "Age", "M1", "Mode", "Q", "Q Missing", "DD", "C",
    "C Missing", "R1", "R3", "Diagnosis",
]]

display(df)

In [None]:
df.reset_index(inplace=True)

display(df)

In [None]:
df.drop("Date", axis=1, inplace=True)

In [None]:
# Fill the missing Q values of the combined dataset with the mean of Q over
# the patients older than 40 (the mean skips the missing entries).
q_missing = df["Q"].isna()
age_greater_than_40 = df["Age"] > 40
q_of_patients_over_40 = df.loc[age_greater_than_40, "Q"]
mean_q_when_age_gt_40 = q_of_patients_over_40.mean()
df.loc[q_missing, "Q"] = mean_q_when_age_gt_40

display(df)

## Feature encoding

### Converting to numbers

In [None]:
# One-hot encode "Mode". Casting to a categorical with the complete list of
# levels first makes get_dummies emit one column per level, even for levels
# that do not occur in the data; a later cell selects all nine mode_* columns
# by name and would fail if one were absent. dtype="uint8" keeps the
# indicators numeric (recent pandas versions default to bool).
mode_levels = ["a", "b", "c", "d", "e", "f", "g", "h", "missing"]
df["Mode"] = df["Mode"].astype(pd.CategoricalDtype(categories=mode_levels))
df = pd.get_dummies(df, columns=["Mode"], prefix='mode', dtype="uint8")

In [None]:
display(df)

In [None]:
# Binary features need a single indicator column, so drop_first removes the
# redundant first level. dtype="uint8" keeps the indicators numeric, matching
# the other 0/1 columns of the dataset (recent pandas versions default to
# bool).
df = pd.get_dummies(df, columns=["Gender"], prefix='gender', drop_first=True,
                    dtype="uint8")
df = pd.get_dummies(df, columns=["Diagnosis"], prefix='diagnosis',
                    drop_first=True, dtype="uint8")

display(df)

In [None]:
# Switch to lower-case snake_case column names.
df = df.rename(columns={"Age": "age",
                        "M1": "m1",
                        "Q": "q",
                        "Q Missing": "q_missing",
                        "DD": "dd",
                        "C": "c",
                        "C Missing": "c_missing",
                        "R1": "r1",
                        "R3": "r3"})

# Fix the column order (label last) and store the age and the 0/1 indicator
# columns as uint8. The casts are applied with a single astype() on the
# selected frame: assigning columns one by one into a selection triggers
# SettingWithCopyWarning.
df = df[["gender_male", "age", "m1", "mode_a", "mode_b", "mode_c", "mode_d", \
         "mode_e", "mode_f", "mode_g", "mode_h", "mode_missing", "q", \
         "q_missing", "dd", "c", "c_missing", "r1", "r3", "diagnosis_positive"]]
df = df.astype({"age": "uint8",
                "q_missing": "uint8",
                "c": "uint8",
                "c_missing": "uint8"})

display(df)

In [None]:
df.dtypes

In [None]:
display(df.duplicated().any())

### Normalization and standardization

In [None]:
import sklearn.preprocessing

In [None]:
# Standardize m1 to zero mean and unit variance; the scaler expects a 2-D
# array, hence the double brackets.
m1_scaler = sklearn.preprocessing.StandardScaler()
x_m1 = df[['m1']].to_numpy()
x_m1_scaled = m1_scaler.fit_transform(x_m1)
df["m1"] = x_m1_scaled

display(df["m1"])

In [None]:
display(x_m1_scaled.mean(), x_m1_scaled.std())

In [None]:
# Standardize q to zero mean and unit variance.
q_scaler = sklearn.preprocessing.StandardScaler()
x_q = df[['q']].to_numpy()
x_q_scaled = q_scaler.fit_transform(x_q)
df["q"] = x_q_scaled

In [None]:
# Min-max scale dd to [0, 1], then stretch and shift the result to [-1, 1].
dd_scaler = sklearn.preprocessing.MinMaxScaler()
x_dd = df[['dd']].to_numpy()
x_dd_scaled = dd_scaler.fit_transform(x_dd)

df["dd"] = (2 * x_dd_scaled) - 1

In [None]:
display(df)

In [None]:
display(df['dd'].min(), df['dd'].max())

## Feature transformations

In [None]:
# Binary age-band features: younger than 25 and older than 40.
is_younger_than_25 = df["age"] < 25
is_older_than_40 = df["age"] > 40
df["age_lt_25"] = is_younger_than_25.astype("uint8")
df["age_gt_40"] = is_older_than_40.astype("uint8")

display(df)

In [None]:
# Natural logarithm of the magnitude of m1; the absolute value is needed
# because np.log is undefined for negative inputs.
# NOTE(review): an m1 value of exactly 0 would produce -inf here, which later
# steps may not accept -- confirm this cannot occur for the data at hand.
df["log_m1"] = np.log(np.abs(df["m1"]))

In [None]:
# All monomials q^i * dd^j with total degree i + j <= 3 (ten terms).
q_dd_poly3 = sklearn.preprocessing.PolynomialFeatures(degree=3)

# Column names written as q<i>dd<j>, listed by increasing total degree.
# NOTE(review): the order is assumed to match the column order produced by
# PolynomialFeatures for the inputs ["q", "dd"] -- confirm against
# get_feature_names_out(). "q0dd0" is then the constant term, and "q1dd0" and
# "q0dd1" repeat q and dd.
q_dd_d3_columns = ["q0dd0", \
                   "q1dd0", "q0dd1", \
                   "q2dd0", "q1dd1", "q0dd2", \
                   "q3dd0", "q2dd1", "q1dd2", "q0dd3"]

# Create the new (empty) columns first so the block assignment below has
# existing target columns.
df = df.reindex(columns=list(df.columns) + q_dd_d3_columns)    

df[q_dd_d3_columns] = q_dd_poly3.fit_transform(df[["q", "dd"]])

display(df)

In [None]:
# Move the label to the last column and remember the feature column names.
label_name = "diagnosis_positive"
labels = df.pop(label_name)
feature_names = df.columns
df[label_name] = labels

display(df)

## Feature selection

See:
1. https://scikit-learn.org/stable/modules/feature_selection.html
2. https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest


In [None]:
# One histogram (with a box plot on top) per feature, split by the label.
for feature in feature_names:
  fig = px.histogram(df, marginal="box", color=label_name, x=feature)
  fig.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0}, height=300)
  fig.show()

In [None]:
import sklearn.feature_selection

In [None]:
# Score every feature by its mutual information with the label and keep the
# six highest-scoring ones.
func = sklearn.feature_selection.mutual_info_classif
feature_selector = sklearn.feature_selection.SelectKBest(score_func=func, k=6)
y = df[label_name].to_numpy()
X = df.drop(columns=label_name).to_numpy()
feature_selector.fit(X, y)
# get_support() is a boolean mask over the feature columns.
selected_columns = feature_names[feature_selector.get_support()]
display(df[selected_columns])

That's all Folks!