In [1]:
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

import os, sys

# Relative paths
dirname = os.path.dirname
sep = os.sep

ml_folder = dirname(os.getcwd())
sys.path.append(ml_folder)

from src.utils import mining_data_tb as md
from src.utils import visualization_tb as vi

import warnings

warnings.filterwarnings("ignore")

In [2]:
# Load every data folder through the project helper `md.read_all_data`.
all_data_dfs = md.read_all_data(2,["1_demographics", "2_dietary", "3_examination", "4_laboratory", "5_questionnaire"])

# Scratch run of the grouping logic: frames whose keys are identical once the
# last two characters are removed are stacked row-wise under the shortened key.
# NOTE(review): `diet_dfs` is not assigned anywhere above this cell; the only
# visible assignment is the tuple-unpacking of `read_all_data(...)` further
# down, so this cell raises NameError on a fresh kernel. Presumably it should
# read the dietary entry of `all_data_dfs` — confirm.
files = {}
count = 0

for key, dfs in diet_dfs.items():
    # Drop the last two characters of the key to get the shared table name.
    key_ = key[:-2]

    if count == 0:
        files[key_] = dfs
    else:
        if key_ not in files.keys():
            files[key_] = dfs
        else:
            # Same shortened name seen before: append the rows to what is stored.
            files[key_] = pd.concat([files[key_], dfs])

    count +=1

In [3]:
def concatenate_dfs(data_dfs):
    """Stack dataframes whose keys differ only in their last two characters.

    Every key is shortened with ``key[:-2]`` (so ``"dr1tot_h"`` and
    ``"dr1tot_i"`` both become ``"dr1tot"``) and all frames that share a
    shortened key are concatenated row-wise, in the order they appear in
    ``data_dfs``. Row indexes are kept as they are (no re-indexing).

    Args:
        data_dfs (dict): maps a table name to a pandas DataFrame.

    Returns:
        dict: shortened name -> DataFrame, in first-seen order. A name backed
        by a single frame maps to that very frame (no copy is made); a name
        backed by several frames maps to their concatenation.
    """
    grouped = {}
    for key, df in data_dfs.items():
        grouped.setdefault(key[:-2], []).append(df)

    # One concat per group instead of re-concatenating the growing frame on
    # every iteration, which copies the accumulated rows each time.
    return {
        name: frames[0] if len(frames) == 1 else pd.concat(frames)
        for name, frames in grouped.items()
    }

In [4]:
def concatenate_all_dfs(data_dfs_list):
    """Apply ``concatenate_dfs`` to each dict in the list and pool the results.

    Args:
        data_dfs_list: iterable of dicts, each passed to ``concatenate_dfs``.

    Returns:
        dict: the union of all per-dict results. When the same name comes out
        of more than one dict, the value from the later dict replaces the
        earlier one.
    """
    combined = {}

    for group in data_dfs_list:
        combined.update(concatenate_dfs(group))

    return combined

In [5]:
def merge_dfs(end_dfs):
    """Outer-merge every dataframe in ``end_dfs`` on the ``SEQN`` column.

    The frames are merged left to right in dict order, starting from the
    first one. Column names other than ``SEQN`` that occur on both sides of
    a merge receive pandas' default ``_x`` / ``_y`` suffixes.

    Args:
        end_dfs (dict): name -> DataFrame; each frame needs a ``SEQN`` column.

    Returns:
        pandas.DataFrame: the merged frame. With a single entry, that frame
        is returned as is.

    Raises:
        IndexError: if ``end_dfs`` is empty.
    """
    # Iterate over a list of the values rather than popping the first entry,
    # so the caller's dict is left intact and the call can be repeated.
    frames = list(end_dfs.values())
    merged = frames[0]

    for df in frames[1:]:
        merged = pd.merge(merged, df, how="outer", on="SEQN")

    return merged

In [6]:
# Group and stack the loaded tables by shortened name, pooled into one dict.
test = concatenate_all_dfs(all_data_dfs)

In [7]:
# Outer-merge all the stacked tables on SEQN into a single wide frame.
test2 = merge_dfs(test)

In [9]:
# Column names of the merged frame, kept as a plain list for inspection.
cols = list(test2.columns)

In [18]:
# Pull out the two suffixed copies of WTDRD1 so they can be compared.
# NOTE(review): presumably the _x/_y pair comes from the merge meeting the
# same column name in two tables — confirm.
duplicated = test2.loc[:, ["WTDRD1_y", "WTDRD1_x"]]

In [19]:
# Flag rows where both copies hold the same value. NaN == NaN evaluates to
# False, so rows missing in both copies are flagged False as well.
duplicated["duplicated"] = duplicated.iloc[:, 0] == duplicated.iloc[:, 1]

In [32]:
# Alternative check, left disabled: overall True/False counts of the flag.
# duplicated["duplicated"].value_counts()
# Among the rows flagged as different, count how many have a missing value in
# the second column (WTDRD1_x).
duplicated[duplicated["duplicated"] == False].iloc[:, 1].isna().sum()

1339

In [14]:
# (rows, columns) of the merged frame.
test2.shape

(29400, 956)

In [10]:
# Print every column name of the merged frame, one per line.
for col in cols: print(col)

SDDSRVYR
RIDSTATR
RIAGENDR
RIDAGEYR
RIDAGEMN
RIDRETH1
RIDRETH3
RIDEXMON
RIDEXAGM
DMQMILIZ
DMQADFC
DMDBORN4
DMDCITZN
DMDYRSUS
DMDEDUC3
DMDEDUC2
DMDMARTL
RIDEXPRG
SIALANG
SIAPROXY
SIAINTRP
FIALANG
FIAPROXY
FIAINTRP
MIALANG
MIAPROXY
MIAINTRP
AIALANGA
DMDHHSIZ
DMDFMSIZ
DMDHHSZA
DMDHHSZB
DMDHHSZE
DMDHRGND
DMDHRAGE
DMDHRBR4
DMDHREDU
DMDHRMAR
DMDHSEDU
WTINT2YR
WTMEC2YR
SDMVPSU
SDMVSTRA
INDHHIN2
INDFMIN2
INDFMPIR
DMDHRAGZ
DMDHREDZ
DMDHRMAZ
DMDHSEDZ
WTDRD1_x
WTDR2D_x
DR1DRSTZ
DR1EXMER
DRABF_x
DRDINT_x
DR1DBIH
DR1DAY
DR1LANG
DR1MRESP
DR1HELP
DBQ095Z
DBD100
DRQSPREP
DR1STY
DR1SKY
DRQSDIET
DRQSDT1
DRQSDT2
DRQSDT3
DRQSDT4
DRQSDT5
DRQSDT6
DRQSDT7
DRQSDT8
DRQSDT9
DRQSDT10
DRQSDT11
DRQSDT12
DRQSDT91
DR1TNUMF
DR1TKCAL
DR1TPROT
DR1TCARB
DR1TSUGR
DR1TFIBE
DR1TTFAT
DR1TSFAT
DR1TMFAT
DR1TPFAT
DR1TCHOL
DR1TATOC
DR1TATOA
DR1TRET
DR1TVARA
DR1TACAR
DR1TBCAR
DR1TCRYP
DR1TLYCO
DR1TLZ
DR1TVB1
DR1TVB2
DR1TNIAC
DR1TVB6
DR1TFOLA
DR1TFA
DR1TFF
DR1TFDFE
DR1TCHL
DR1TVB12
DR1TB12A
DR1TVC
DR1TVD
DR1TVK
DR1TCALC
DR1TPHOS


# Shapes of the three dr1tot tables (keys ending in _h, _i, _j) before stacking.
# NOTE(review): relies on `diet_dfs` and `files` already existing in the
# kernel; the only visible assignment of `diet_dfs` is further down.
print(diet_dfs["dr1tot_h"].shape)
print(diet_dfs["dr1tot_i"].shape)
print(diet_dfs["dr1tot_j"].shape)

# Shape of the stacked dr1tot table, for comparison with the three above.
files["dr1tot"].shape

def read_all_data(up_levels, folders):
    """Read each folder with ``md.read_data`` and return the results in order.

    Args:
        up_levels: forwarded unchanged to ``md.read_data`` for every folder.
        folders: iterable of folder names, each forwarded to ``md.read_data``.

    Returns:
        list: one ``md.read_data`` result per folder, in the order the
        folders were given.
    """
    return [md.read_data(up_levels, folder) for folder in folders]

# Read the five data folders; the returned list is unpacked positionally, in
# the same order as the folder names passed in.
demo_dfs, diet_dfs, exam_dfs, lab_dfs, quest_dfs = read_all_data(2,["1_demographics", "2_dietary", "3_examination", "4_laboratory", "5_questionnaire"])

data = read_all_data(2, ["1_demographics", "2_dietary"])
# `read_all_data` returns a list ordered like `folders`, so the dietary tables
# sit at position 1; indexing the list with the string "diet_dfs" raises
# TypeError.
data[1]["dr1tot_j"]

def var_data(up_levels, filepath):
    """Read a CSV located relative to an ancestor of the working directory.

    The base directory is the parent of ``os.getcwd()``, moved up a further
    ``up_levels`` directories. ``filepath`` is appended to it with the OS
    path separator.

    Args:
        up_levels (int): extra directory levels to climb above the parent of
            the current working directory.
        filepath (str): path of the CSV file relative to that base directory.

    Returns:
        pandas.DataFrame: the file contents, with the first column used as
        the index.
    """
    base_dir = dirname(os.getcwd())
    for _ in range(up_levels):
        base_dir = dirname(base_dir)

    return pd.read_csv(base_dir + sep + filepath, index_col=0)