In [1]:
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

import os, sys

# Relative paths
dirname = os.path.dirname
sep = os.sep

ml_folder = dirname(os.getcwd())
sys.path.append(ml_folder)

from src.utils import mining_data_tb as md
from src.utils import visualization_tb as vi

import warnings

warnings.filterwarnings("ignore")

In [2]:
all_data_dfs = md.read_all_data(2,["1_demographics", "2_dietary", "3_examination", "4_laboratory", "5_questionnaire"])

In [3]:
def concatenate_dfs(data_dfs):
    files = {}
    count = 0

    for key, dfs in data_dfs.items():
        key_ = key[:-2]

        if count == 0:
            files[key_] = dfs
        else:
            if key_ not in files.keys():
                files[key_] = dfs
            else:
                files[key_] = pd.concat([files[key_], dfs])

        count +=1

    return files

In [4]:
def concatenate_all_dfs(data_dfs_list):
    #end_dfs = []
    end_dfs = {}
    
    for data_dfs in data_dfs_list:
        files = concatenate_dfs(data_dfs)
        end_dfs = {**end_dfs, **files}
        #end_dfs.append(files)

    return end_dfs

In [5]:
def merge_dfs(end_dfs):
    keys = list(end_dfs.keys())
    f_df = end_dfs.pop(keys[0])

    for name, df in end_dfs.items():
        f_df = pd.merge(f_df, df, how = "outer", on = "SEQN")

    return f_df

In [6]:
test = concatenate_all_dfs(all_data_dfs)

In [7]:
test2 = merge_dfs(test)

In [8]:
# WTDRD1_x, WTDR2D_x, DRABF_x, DRDINT_x, WTSAF2YR_x, LBXHCT_x

In [9]:
def clean_columns(df):
    df = df.drop(["WTDRD1_y", "WTDR2D_y", "DRABF_y", "DRDINT_y", "WTSAF2YR_y", "LBXHCT_y"], axis = 1)

    columns_correction = {
        "WTDRD1_x" : "WTDRD1",
        "WTDR2D_x" : "WTDR2D",
        "DRABF_x" : "DRABF",
        "DRDINT_x" : "DRDINT",
        "WTSAF2YR_x" : "WTSAF2YR",
        "LBXHCT_x" : "LBXHCT"
    }

    df = df.rename(columns = columns_correction)

    return df

In [10]:
test3 = clean_columns(test2)

In [19]:
test2.shape

(29400, 956)

In [11]:
def heart_disease(df):
    # Conditions to remove values of no interest from the columns of interest
    cond_b = df.MCQ160B != 9
    cond_c = df.MCQ160C != 7
    cond_d = (df.MCQ160D != 9) & (df.MCQ160D != 7)
    cond_e = df.MCQ160E != 9
    cond_f = df.MCQ160F != 9

    # Filter the data with the previous conditions
    heart_df = df[(cond_b) & (cond_c) & (cond_d) & (cond_e) & (cond_f)]

    # New column to group all heart diseases
    heart_df["heart_disease"] = 0

    # Conditions to filter by any heart disease
    pos_cond_b = heart_df.MCQ160B == 1
    pos_cond_c = heart_df.MCQ160C == 1
    pos_cond_d = heart_df.MCQ160D == 1
    pos_cond_e = heart_df.MCQ160E == 1
    pos_cond_f = heart_df.MCQ160F == 1

    # Given the previous conditions, place a "1" in the column if they are matched
    heart_df.loc[(pos_cond_b) | (pos_cond_c) | (pos_cond_d) | (pos_cond_e) | (pos_cond_f), "heart_disease"] = 1

    return heart_df

In [12]:
test4 = heart_disease(test3)

In [13]:
test4.heart_disease.value_counts()

0    27422
1     1863
Name: heart_disease, dtype: int64

In [14]:
def model_data(df, split, cv, epochs = 1, scaler = False, balance = None, seed = 42):
    features_nom = list(df.columns)
    features = [md.var_descr_detector(nom, variable_names) for nom in features_nom]  

    ### Independent variables
    X = np.array(df.iloc[:, 1:])

    ### Dependent variable
    y = np.array(df.iloc[:, 0])

    ### Data scaling
    if scaler:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    ### Train-test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split, random_state = seed)

    ### Balancing data
    if balance != None:
        sm = SMOTE(sampling_strategy = balance, random_state = seed, n_jobs = -1)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    ### Cross validation
    kfold = RepeatedStratifiedKFold(n_splits = cv, n_repeats = epochs, random_state = seed)


    return X_train, X_test, y_train, y_test, features, kfold

In [15]:
test5 = test4.loc[:, ["MCQ010", "RIAGENDR", "RIDAGEYR", "BPXDI1", "BPXSY1", "BMXWT", "DXDTOPF", "BMXWAIST"]].dropna()

In [16]:
X_train, X_test, y_train, y_test, features, kfold = model_data(test5, split = .2, cv = 5)

AttributeError: module 'src.utils.mining_data_tb' has no attribute 'var_descr_detector'

In [64]:
def ml_trainer(X_train, y_train, kfold, model, features = None):
    train_scores = []
    val_scores = []
    count = 1

    for (train, val) in kfold.split(X_train, y_train):
        # Train-Validation sets
        x_t, y_t = X_train[train], y_train[train]
        x_v, y_v = X_train[val], y_train[val]


        # Internal structure
        y_t_unique, y_t_counts = np.unique(y_t, return_counts=True)
        y_v_unique, y_v_counts = np.unique(y_v, return_counts=True)

        # Training
        model.fit(x_t, y_t)

        # Scores
        train_score = model.score(x_t, y_t)
        val_score = model.score(x_v, y_v)

        train_scores.append(train_score)
        val_scores.append(val_score)

        print(f"\n-- Model {count} --")
        print("-" * 25)
        print("Set structure:")
        print("Train structure:", dict(zip(y_t_unique, y_t_counts / len(y_t))))
        print("Validation structure:", dict(zip(y_v_unique, y_v_counts / len(y_v))))
        print("-" * 25)
        print(">train score:", train_score)
        print(">test score:", val_score)
        print("#" * 75)

        count += 1

    try:
        importances = model.feature_importances_
        feature_importances = list(zip(features, importances))

        feature_importances_df = pd.DataFrame(feature_importances, columns = ["features", "importance"]).sort_values(by = "importance", ascending = False)

        return train_scores, val_scores, feature_importances_df
    
    except:
        return train_scores, val_scores

In [45]:
var_path = "data/6_variables/0_final_variables.csv"
variable_names = md.var_data(2, var_path)

In [63]:
#model = RandomForestClassifier(n_jobs = -1, random_state = 42)
model = LogisticRegression(n_jobs = -1, random_state = 42, max_iter = 300)

train_scores, val_scores = ml_trainer(X_train, y_train, kfold, model, features)


-- Model 1 --
-------------------------
Set structure:
Train structure: {1.0: 0.169847087071907, 2.0: 0.8295210413244029, 9.0: 0.0006318716036901302}
Validation structure: {1.0: 0.16978271854471955, 2.0: 0.829711975745326, 9.0: 0.0005053057099545225}
-------------------------
>train score: 0.8295210413244029
>test score: 0.829711975745326
###########################################################################

-- Model 2 --
-------------------------
Set structure:
Train structure: {1.0: 0.169847087071907, 2.0: 0.8296474156451409, 9.0: 0.0005054972829521042}
Validation structure: {1.0: 0.16978271854471955, 2.0: 0.8292066700353714, 9.0: 0.001010611419909045}
-------------------------
>train score: 0.8296474156451409
>test score: 0.8292066700353714
###########################################################################

-- Model 3 --
-------------------------
Set structure:
Train structure: {1.0: 0.16982562547384383, 2.0: 0.8295425827647207, 9.0: 0.0006317917614354309}
Validation

In [60]:
features

Unnamed: 0,features,importance
4,Systolic: Blood pressure (first reading) mm Hg,0.199815
6,Total Percent Fat,0.19827
5,Weight (kg),0.192193
3,Diastolic: Blood pressure (first reading) mm Hg,0.132394
1,Gender of the participant.,0.132116
2,Age in years of the participant at the time of...,0.130182
0,The following questions are about different me...,0.01503


print(diet_dfs["dr1tot_h"].shape)
print(diet_dfs["dr1tot_i"].shape)
print(diet_dfs["dr1tot_j"].shape)

files["dr1tot"].shape

def read_all_data(up_levels, folders):
    #dem_folder, diet_folder, exam_folder, lab_folder, quest_folder = folders

    dfs_list = []
    

    for folder in folders:
        dfs_list.append(md.read_data(up_levels, folder))

    #demo_dfs = md.read_data(up_levels, dem_folder)
    #diet_dfs = md.read_data(up_levels, diet_folder)
    #exam_dfs = md.read_data(up_levels, exam_folder)
    #lab_dfs = md.read_data(up_levels, lab_folder)
    #quest_dfs = md.read_data(up_levels, quest_folder)

    #return demo_dfs, diet_dfs, exam_dfs, lab_dfs, quest_dfs
    return dfs_list

demo_dfs, diet_dfs, exam_dfs, lab_dfs, quest_dfs = read_all_data(2,["1_demographics", "2_dietary", "3_examination", "4_laboratory", "5_questionnaire"])

data = read_all_data(2, ["1_demographics", "2_dietary"])
data["diet_dfs"]["dr1tot_j"]

def var_data(up_levels, filepath):

    path = dirname(os.getcwd())
    for i in range(up_levels): path = dirname(path)

    fullpath = path + sep + filepath
    data = pd.read_csv(fullpath, index_col = 0)

    return data