In [None]:
import pandas as pd
import pickle
import featuretools as ft
import mrmr
from mrmr import mrmr_classif
import matplotlib.pyplot as plt
import seaborn as sns
from woodwork.logical_types import Boolean, AgeNullable, Ordinal, Categorical, AgeFractional, Integer, Double

In [None]:
# Load the train / validation / test feature splits
df_train, df_val, df_test = (
    pd.read_csv(path)
    for path in (
        '../cleaned_imputed_split/X_train.csv',
        '../cleaned_imputed_split/X_val.csv',
        '../cleaned_imputed_split/X_test.csv',
    )
)

# Load the per-source tables (HQ, FFQ, CM, CE) saved after the outlier analysis;
# they are used below to decide the logical type of every column
df_HQ_train, df_FFQ_train, df_CM_train, df_CE_train = (
    pd.read_csv(path)
    for path in (
        '../Processed datasets/After outlier analysis/HQ.csv',
        '../Processed datasets/After outlier analysis/FFQ.csv',
        '../Processed datasets/After outlier analysis/CM.csv',
        '../Processed datasets/After outlier analysis/CE.csv',
    )
)

# Load the training target
y_train = pd.read_csv('../cleaned_imputed_split/y_train.csv')

In [None]:
# Collect the train, validation and test dataframes in a single list
list_of_dataframes = [
    df_train,
    df_val,
    df_test,
]

### FFQ Data

In [None]:
# Start from every FFQ column; a column is removed from this list as soon as it
# gets a logical type, so whatever is left at the end is still untyped
complete_columns_FFQ = list(df_FFQ_train.columns)

#### 9-point ordinal scales

In [None]:
# FFQ questions that show exactly 9 distinct values are typed as 9-point ordinals
dtypes_FFQ = {}

for column in df_FFQ_train.columns:
    if df_FFQ_train[column].nunique(dropna=False) != 9:
        continue
    dtypes_FFQ[column] = Ordinal(order=list(range(1, 10)))
    # Typed now, so it no longer belongs to the "still untyped" list
    complete_columns_FFQ.remove(column)

#### 4-point ordinal scales

In [None]:
# List the FFQ questions that show exactly 4 distinct values
for column in df_FFQ_train.columns:
    if df_FFQ_train[column].nunique(dropna=False) == 4:
        print(column)

In [None]:
# The hand-picked 4-point questions below are ordinal (levels 1-4)
for column in ('JMEATFAT', 'JFRYEAT'):
    dtypes_FFQ[column] = Ordinal(order=list(range(1, 5)))
    complete_columns_FFQ.remove(column)

#### 5-point ordinal scales

In [None]:
# List the FFQ questions that show exactly 5 distinct values
for column in df_FFQ_train.columns:
    if df_FFQ_train[column].nunique(dropna=False) == 5:
        print(column)

In [None]:
# The hand-picked 5-point questions below are ordinal (levels 1-5)
for column in ('JSALTTAB', 'JSALTCK'):
    dtypes_FFQ[column] = Ordinal(order=list(range(1, 6)))
    complete_columns_FFQ.remove(column)

#### 6-point ordinal scales

In [None]:
# List the FFQ questions that show exactly 6 distinct values
for column in df_FFQ_train.columns:
    if df_FFQ_train[column].nunique(dropna=False) == 6:
        print(column)

In [None]:
# JMILKDAY is the 6-point question treated as ordinal (levels 1-6)
complete_columns_FFQ.remove('JMILKDAY')
dtypes_FFQ['JMILKDAY'] = Ordinal(order=list(range(1, 7)))

#### 6-point categorical scales 

In [None]:
# These 6-option questions have no natural order, so they are typed as Categorical
for column in ('JFRYFAT', 'JBAKEFAT'):
    dtypes_FFQ[column] = Categorical
    complete_columns_FFQ.remove(column)

#### 7-point categorical

In [None]:
# List the FFQ questions that show exactly 7 distinct values
for column in df_FFQ_train.columns:
    if df_FFQ_train[column].nunique(dropna=False) == 7:
        print(column)

In [None]:
# JMILKUSE is the 7-option question treated as Categorical
complete_columns_FFQ.remove('JMILKUSE')
dtypes_FFQ['JMILKUSE'] = Categorical

In [None]:
# JDIETLNG is the 7-option question treated as ordinal (levels 0-6)
complete_columns_FFQ.remove('JDIETLNG')
dtypes_FFQ['JDIETLNG'] = Ordinal(order=list(range(7)))

#### Booleans

In [None]:
# FFQ columns with at most two distinct values are typed as Boolean
for column in df_FFQ_train.columns:
    if df_FFQ_train[column].nunique(dropna=False) > 2:
        continue
    dtypes_FFQ[column] = Boolean
    complete_columns_FFQ.remove(column)

In [None]:
# Everything still left in complete_columns_FFQ is treated as a 9-point food
# frequency question; each column gets its own Ordinal instance
dtypes_FFQ.update({
    column: Ordinal(order=list(range(1, 10)))
    for column in complete_columns_FFQ
})

## HQ Data

In [None]:
# Maps every HQ column name to its woodwork logical type
dtypes_Q = dict()

#### Boolean

In [None]:
# HQ columns with at most two distinct values are typed as Boolean
booleans = [
    column
    for column in df_HQ_train.columns
    if df_HQ_train[column].nunique(dropna=False) <= 2
]
dtypes_Q.update({column: Boolean for column in booleans})

In [None]:
# 'problematic_drinking' is a feature created in this study; it is Boolean as well
dtypes_Q.update({'problematic_drinking': Boolean})

#### 3-point scale

In [None]:
# Collect the HQ questions that show exactly 3 distinct values
total3 = [
    column
    for column in df_HQ_train.columns
    if df_HQ_train[column].nunique(dropna=False) == 3
]

total3

In [None]:
# Type the 3-value HQ questions: JACTI* are 3-point ordinals, JGHQ30 is given a
# 4-level order, and everything else is Categorical
for column in total3:
    if column.startswith('JACTI'):
        logical_type = Ordinal(order=[1, 2, 3])
    elif column == 'JGHQ30':
        logical_type = Ordinal(order=[1, 2, 3, 4])
    else:
        logical_type = Categorical
    dtypes_Q[column] = logical_type

#### 4-point scales

In [None]:
# Collect the HQ questions that show exactly 4 distinct values
total4 = [
    column
    for column in df_HQ_train.columns
    if df_HQ_train[column].nunique(dropna=False) == 4
]
total4

In [None]:
# 4-value HQ questions that are categorical rather than ordinal
categorical4 = [
    'JSTATUSX',
    'JCHPACT',
    'JSLDRIVE',
    'JSNORBOT',
    'JACCOM',
]

In [None]:
# Type the HQ questions that show exactly 4 distinct values
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) != 4:
        continue
    if column.startswith('JDPN') or column == 'JCAR':
        # Zero-based 4-level ordinals
        dtypes_Q[column] = Ordinal(order=[0, 1, 2, 3])
    elif column == 'JTRLEP':
        dtypes_Q[column] = Integer
    elif column in categorical4:
        dtypes_Q[column] = Categorical
    else:
        # Default: one-based 4-level ordinal
        dtypes_Q[column] = Ordinal(order=[1, 2, 3, 4])

#### 5-point scales

In [None]:
# List the HQ questions that show exactly 5 distinct values
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) == 5:
        print(column)

In [None]:
# Create a list of categoricals, and of ordinals that actually have more than 5 options.
# categorical5 must be a list (not a bare string): it is used with
# `column in categorical5`, which on a string is a substring test and would also
# match any column whose name happens to be a fragment of 'JNOTMAR'.
categorical5 = ['JNOTMAR']
ordinal7 = ['JCARWASF', 'JSOCCERF', 'JSOCCERH', 'JGOLFF']

In [None]:
# Type the HQ questions that show exactly 5 distinct values
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) != 5:
        continue
    # NOTE(review): `in` is a membership test only if categorical5 is a
    # list/tuple/set; if it is a plain string this is a substring test
    if column in categorical5:
        logical_type = Categorical
    elif column == 'JPETATTA':
        logical_type = Ordinal(order=[0, 1, 2, 3, 4])
    elif column in ordinal7:
        # Only 5 levels are observed, but the scale itself runs from 0 to 6
        logical_type = Ordinal(order=[0, 1, 2, 3, 4, 5, 6])
    else:
        logical_type = Ordinal(order=[1, 2, 3, 4, 5])
    dtypes_Q[column] = logical_type

#### 6-point scales

In [None]:
# List the HQ questions that show exactly 6 distinct values
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) == 6:
        print(column)

In [None]:
# 6-value HQ questions that are categorical rather than ordinal
categorical6 = [
    'JBREAD',
    'JMILKTYP',
    'JLRNE',
]

In [None]:
# Assign logical types to the HQ questions that show exactly 6 distinct values
for column in df_HQ_train:
    if len(df_HQ_train[column].unique()) == 6:
        if column in categorical6:
            # NOTE(review): 'JLRNE' is listed in categorical6, so it is typed
            # as Categorical here. This cell used to carry a later
            # `elif column == 'JLRNE'` -> Ordinal(order=[0,1,2,3,4,5]) branch
            # that could never run because this test catches the column first;
            # the dead branch was removed without changing behaviour. If JLRNE
            # is meant to be ordinal, drop it from categorical6 and add the
            # Ordinal assignment back — confirm against the codebook.
            dtypes_Q[column] = Categorical
        elif column == 'JCHPNUM' or column == 'JGOUT_YR':
            # Counts, not scales
            dtypes_Q[column] = Integer
        elif column == 'JGOLFH' or column == 'JMOWF':
            # Only 6 levels are observed, but the scale itself runs from 0 to 6
            dtypes_Q[column] = Ordinal(order=[0,1,2,3,4,5,6])
        elif column == 'JSNORHOW':
            # Code 5 is ranked between 0 and 1 on this scale
            dtypes_Q[column] = Ordinal(order=[0,5,1,2,3,4])
        else:
            dtypes_Q[column] = Ordinal(order=[1,2,3,4,5,6])

#### 7-point scales

In [None]:
# List the HQ questions that show exactly 7 distinct values
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) == 7:
        print(column)

In [None]:
# Type the HQ questions that show exactly 7 distinct values
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) != 7:
        continue
    if column.startswith('JAR'):
        logical_type = Ordinal(order=[1, 2, 3, 4, 5, 6, 7])
    elif column == 'JSNOROFT':
        # Code 0 is ranked last on this scale
        logical_type = Ordinal(order=[1, 2, 3, 4, 5, 6, 0])
    elif column in ('JOST_PYR', 'JFRUITVG', 'JRHE_AYR'):
        logical_type = Integer
    else:
        logical_type = Ordinal(order=[0, 1, 2, 3, 4, 5, 6])
    dtypes_Q[column] = logical_type

#### 8-point scales

In [None]:
# List the HQ questions that show exactly 8 distinct values
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) == 8:
        print(column)

In [None]:
# Type the HQ questions that show exactly 8 distinct values:
# JASSETHH is an 8-point ordinal, every other one is an Integer
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) != 8:
        continue
    dtypes_Q[column] = (
        Ordinal(order=[1, 2, 3, 4, 5, 6, 7, 8]) if column == 'JASSETHH' else Integer
    )

#### 9-point scales 

In [None]:
# List the HQ questions that show exactly 9 distinct values
for column in df_HQ_train.columns:
    if df_HQ_train[column].nunique(dropna=False) == 9:
        print(column)

In [None]:
# 9-value HQ questions: three are Integers, two are 9-point ordinals
for column in ('JHSADMNO', 'JVIGFQ_S', 'JTSF'):
    dtypes_Q[column] = Integer
for column in ('JINCHH', 'JASSETXH'):
    dtypes_Q[column] = Ordinal(order=list(range(1, 10)))

#### Left over features

In [None]:
# Check which HQ features have not been assigned a dtype yet
columns_in_dataframe = list(df_HQ_train.columns)
keys = dtypes_Q.keys()
key_list = [*keys]
# Set difference: columns of the dataframe without an entry in dtypes_Q
difference = list(
    set(columns_in_dataframe) - set(key_list)
)
difference

In [None]:
# Leftover features that are not plain integers get an explicit logical type
dtypes_Q.update({
    'JCASPAUT': Ordinal(order=list(range(16))),
    'JLRCLGD': Categorical,
    'JAGE_C': AgeFractional,
})

In [None]:
# Assign the Integer dtype to all features that still have no dtype.
# `difference` is computed before the non-integer leftovers (JCASPAUT, JLRCLGD,
# JAGE_C) receive their explicit types, so those columns are still listed in it.
# Skip anything that already has an entry instead of overwriting it with Integer.
for column in difference:
    if column not in dtypes_Q:
        dtypes_Q[column] = Integer

In [None]:
# Merge the HQ logical types into the combined dictionary
# (an HQ entry wins if a column name occurs in both)
for column, logical_type in dtypes_Q.items():
    dtypes_FFQ[column] = logical_type

## CM Data

In [None]:
# Maps every CM column name to its woodwork logical type
dtypes_CM = dict()

In [None]:
# All CM column names, in dataframe order
total_columns_CM = list(df_CM_train.columns)

#### Categorical

In [None]:
# Every CM feature whose name starts with 'JMG' is typed as Categorical
CM_categorical = [
    column for column in df_CM_train.columns if column.startswith('JMG')
]
dtypes_CM.update({column: Categorical for column in CM_categorical})

In [None]:
# Keep only the CM features that were not typed as Categorical
columns_CM = [
    column
    for column in total_columns_CM
    if column not in CM_categorical
]

#### Booleans

In [None]:
# Remaining CM features with at most two distinct values are booleans
booleans = [
    column
    for column in columns_CM
    if df_CM_train[column].nunique(dropna=False) <= 2
]

In [None]:
# Type the identified CM booleans
dtypes_CM.update({column: Boolean for column in booleans})

In [None]:
# Drop the booleans; what is left still needs a numeric logical type
columns_CM = [
    column
    for column in columns_CM
    if column not in booleans
]

#### Integers

In [None]:
# Numeric CM features: int64 columns become Integer, float64 columns become Double.
# Columns with any other pandas dtype are left without an entry.
for column in columns_CM:
    column_dtype = df_CM_train[column].dtype
    if column_dtype == 'int64':
        dtypes_CM[column] = Integer
        continue
    if column_dtype == 'float64':
        dtypes_CM[column] = Double

In [None]:
# Merge the CM logical types into the combined dictionary
for column, logical_type in dtypes_CM.items():
    dtypes_FFQ[column] = logical_type

## CE Data

In [None]:
# Maps every CE column name to its woodwork logical type
dtypes_CE = dict()

In [None]:
# All CE features are Booleans
dtypes_CE.update({column: Boolean for column in df_CE_train.columns})

In [None]:
# Merge the CE logical types into the combined dictionary
for column, logical_type in dtypes_CE.items():
    dtypes_FFQ[column] = logical_type

In [None]:
# Persist the combined logical-type dictionary so later steps can reload it
with open('../cleaned_imputed_split/datatype_dictionary.pkl', 'wb') as dictionary_file:
    pickle.dump(dtypes_FFQ, dictionary_file)

In [None]:
# Downcast every float64 column to float32 to save memory.
# Each split is handled the same way: pick the float64 columns, cast them,
# and write them back into the same dataframe.

# Training split
df_float64 = df_train.select_dtypes(include='float64')
df_train[df_float64.columns] = df_float64.astype('float32')

# Validation split
df_float64_val = df_val.select_dtypes(include='float64')
df_val[df_float64_val.columns] = df_float64_val.astype('float32')

# Test split
df_float64_test = df_test.select_dtypes(include='float64')
df_test[df_float64_test.columns] = df_float64_test.astype('float32')

In [None]:
# Create copies of the dataframes for later use
df_train_copy, df_val_copy, df_test_copy = (
    df_train.copy(),
    df_val.copy(),
    df_test.copy(),
)

## Featuretools

In [None]:
# Create an EntitySet for the training data
es = ft.EntitySet(id='T2D')

In [None]:
# Register the training dataframe together with its logical types;
# 'Id_random_DPUK' is the column used as the index
es = es.add_dataframe(
    dataframe=df_train,
    dataframe_name='Q_FFQ',
    index='Id_random_DPUK',
    logical_types=dtypes_FFQ,
)

In [None]:
# Separate EntitySet for the validation data
es_val = ft.EntitySet(id='T2D_v')

In [None]:
# Register the validation dataframe with the same index and logical types as the training one
es_val = es_val.add_dataframe(
    dataframe=df_val,
    dataframe_name='Q_FFQ_val',
    index='Id_random_DPUK',
    logical_types=dtypes_FFQ,
)

In [None]:
# Separate EntitySet for the test data
es_test = ft.EntitySet(id='T2D_t')

In [None]:
# Register the test dataframe with the same index and logical types as the training one
es_test = es_test.add_dataframe(
    dataframe=df_test,
    dataframe_name='Q_FFQ_test',
    index='Id_random_DPUK',
    logical_types=dtypes_FFQ,
)

#### Check if all datatypes are correct

In [None]:
# Display the woodwork schema of the training dataframe to verify the assigned types
es['Q_FFQ'].ww.schema

## Selection of transform primitives

In [None]:
# List every featuretools primitive and keep only the transform primitives
primitives = ft.list_primitives()
is_transform = primitives['type'].eq('transform')
trans_primitives = primitives[is_transform]

In [None]:
# Investigate per category (Boolean, Integer, Double, Ordinal) which transformations
# are possible: edit `keywords` and re-run the cell for another category
keywords = ['Boolean']
keyword_pattern = '|'.join(keywords)
accepts_keyword = trans_primitives['valid_inputs'].str.contains(keyword_pattern)
trans_primitives_filtered = trans_primitives[accepts_keyword].drop(
    columns=['type', 'valid_inputs', 'return_type']
)
# Widen the column display so long text cells are not cut off
pd.set_option('display.max_colwidth', 300)
trans_primitives_filtered

## Automatic Feature Engineering

In [None]:
# Automatically engineer depth-1 transform features for the training set
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name='Q_FFQ',
    trans_primitives=[
        'multiply_boolean',
        'and',
        'or',
        'not',
        'square_root',
        'multiply_numeric_boolean',
        'percentile',
        'add_numeric',
        'multiply_numeric',
        'subtract_numeric',
    ],
    max_depth=1,
)

feature_matrix

In [None]:
# Engineer the same depth-1 transform features for the validation set.
# Note: this rebinds `feature_defs` to the validation definitions.
# NOTE(review): 'percentile' presumably ranks values within the frame it is
# computed on, i.e. against the validation split only — confirm this is intended.
feature_matrix_val, feature_defs = ft.dfs(
    entityset=es_val,
    target_dataframe_name='Q_FFQ_val',
    trans_primitives=[
        'multiply_boolean',
        'and',
        'or',
        'not',
        'square_root',
        'multiply_numeric_boolean',
        'percentile',
        'add_numeric',
        'multiply_numeric',
        'subtract_numeric',
    ],
    max_depth=1,
)

feature_matrix_val

In [None]:
# Engineer the same depth-1 transform features for the test set.
# Note: this rebinds `feature_defs` to the test definitions.
# NOTE(review): 'percentile' presumably ranks values within the frame it is
# computed on, i.e. against the test split only — confirm this is intended.
feature_matrix_test, feature_defs = ft.dfs(
    entityset=es_test,
    target_dataframe_name='Q_FFQ_test',
    trans_primitives=[
        'multiply_boolean',
        'and',
        'or',
        'not',
        'square_root',
        'multiply_numeric_boolean',
        'percentile',
        'add_numeric',
        'multiply_numeric',
        'subtract_numeric',
    ],
    max_depth=1,
)

feature_matrix_test

In [None]:
# Replace the index of every feature matrix with a plain 0..n-1 range, in place
for matrix in (feature_matrix, feature_matrix_val, feature_matrix_test):
    matrix.reset_index(drop=True, inplace=True)

In [None]:
# Change True and False back to 1 and 0 in all three feature matrices
for matrix in (feature_matrix, feature_matrix_val, feature_matrix_test):
    for column in matrix.columns:
        # Covers both the numpy 'bool' dtype and the pandas nullable 'boolean' dtype
        if matrix[column].dtype in ('bool', 'boolean'):
            matrix[column] = matrix[column].astype(int)

## Feature selection with mRMR

In [None]:
# Run mRMR separately on consecutive column chunks (10% of the feature matrix
# each), requesting 150 features (K=150) from every chunk
num_columns = feature_matrix.shape[1]
step_size = 0.1
num_steps = int(1/step_size)
full_dataframe_mrmr = []

for i in range(num_steps):
    # Chunk boundaries as column positions; the end of one chunk is the start of the next
    start = int(i * step_size * num_columns)
    end = int((i + 1) * step_size * num_columns)

    column_list = feature_matrix.columns[start:end].tolist()
    feature_matrix_slice = feature_matrix[column_list]
    # NOTE(review): y_train comes straight from read_csv and is therefore a
    # DataFrame; verify that mrmr_classif accepts it and that its rows line up
    # with feature_matrix
    selected_features = mrmr_classif(X=feature_matrix_slice, y=y_train, K=150)
    full_dataframe_mrmr.append(selected_features)

In [None]:
# Save the 150 selected predictors per 10% chunk of the feature matrix.
# pickle is imported once in the notebook's import cell, so it is not re-imported here.
with open('../cleaned_imputed_split/150predictorsmrmr.pkl', 'wb') as f:
    pickle.dump(full_dataframe_mrmr, f)

In [None]:
# Flatten the per-chunk selections into one list of column names
concatenated_columns = []
for chunk_selection in full_dataframe_mrmr:
    concatenated_columns.extend(chunk_selection)
print(len(concatenated_columns))

In [None]:
# Restrict the training feature matrix to the columns selected across all chunks
# (10 chunks x 150 requested features)
feature_matrix_1500 = feature_matrix.loc[:, concatenated_columns]

# Save the reduced data frame, without the index
feature_matrix_1500.to_csv(
    '../cleaned_imputed_split/1500top_predictors.csv',
    index=False,
)

In [None]:
# Second mRMR pass: select the top 500 predictors among the pre-selected columns
selected_features_500 = mrmr_classif(
    X=feature_matrix_1500,
    y=y_train,
    K=500,
)

In [None]:
# Persist the names of the 500 selected features
with open('../cleaned_imputed_split/500predictorsmrmr.pkl', 'wb') as predictors_file:
    pickle.dump(selected_features_500, predictors_file)

In [None]:
# Keep only the 500 selected features in the train, validation and test matrices
mrmr_df_four_categories = feature_matrix.loc[:, selected_features_500]
mrmr_df_val_four_categories = feature_matrix_val.loc[:, selected_features_500]
mrmr_df_test_four_categories = feature_matrix_test.loc[:, selected_features_500]

In [None]:
# Write the three reduced dataframes to CSV, without the index
mrmr_df_four_categories.to_csv(
    '../Processed datasets/After splitting/mrmr/500selected_four_categories.csv',
    index=False,
)
mrmr_df_val_four_categories.to_csv(
    '../Processed datasets/After splitting/mrmr/500selected_val_four_categories.csv',
    index=False,
)
mrmr_df_test_four_categories.to_csv(
    '../Processed datasets/After splitting/mrmr/500selected_test_four_categories.csv',
    index=False,
)