# Diabetes project

In [17]:
import pandas as pd
from pathlib import Path

## 1.1 Read SPSS databaseabs

In [18]:
df_spss = pd.read_spss("DIALECT-1 + 2. definitief ruwe database (08-2019) met exclusie van dialect-1 patienten (n=671)_2.sav")


In [19]:
df_spss

Unnamed: 0,Subjectnr,Geslacht,Leeftijd_poli1,Polidatum1,Polijaar_1,Polibezoek,Arts,Freq_arts,Freq_vp,Freq_tot,...,SumOfvite,SumOfrae,SumOffole,SumOfzink,SumOfdpa,SumOfarachidonz,SumOffolaat,SumOfnico,SumOfwater,SumOfnatrium
0,1.0,man,65.0,2009-08-31,2009.0,Eerste bezoek,Schot,4.0,0.0,4.0,...,13.267616,602.510257,173.289978,9.203426,0.000000,0.034463,173.289978,16.178387,1591.211480,2607.443336
1,2.0,man,61.0,2009-08-31,2009.0,Eerste bezoek,Oving,1.0,4.0,5.0,...,10.662237,645.575531,133.897548,7.454809,0.000010,0.047574,133.897548,16.796363,2216.603139,1891.477166
2,3.0,man,56.0,2009-08-31,2009.0,Eerste bezoek,Veneman,0.0,0.0,0.0,...,29.996976,1207.776682,209.312323,13.193616,0.000494,0.031634,205.339877,22.143815,2758.233627,3351.952037
3,4.0,vrouw,51.0,2009-08-31,2009.0,Eerste bezoek,Ouwehand,1.0,4.0,5.0,...,4.281328,218.128055,90.296190,5.820861,0.000057,0.039529,90.296190,5.644264,673.484131,1646.457431
4,5.0,vrouw,60.0,2009-08-31,2009.0,Eerste bezoek,van Zanten,1.0,4.0,5.0,...,16.653028,1482.346823,207.709797,10.275772,0.000243,0.079542,207.709797,18.294950,2028.428955,2972.889884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,732.0,vrouw,74.0,2019-05-22,2019.0,Eerste bezoek,van berkum,3.0,1.0,4.0,...,,,,,,,,,,
668,733.0,vrouw,28.0,2019-04-26,2019.0,Eerste bezoek,Visser,7.0,24.0,31.0,...,,,,,,,,,,
669,734.0,man,65.0,2019-05-08,2019.0,Eerste bezoek,de Vries,5.0,8.0,13.0,...,,,,,,,,,,
670,736.0,man,77.0,2019-05-22,2019.0,Eerste bezoek,Laverman,4.0,2.0,6.0,...,,,,,,,,,,


## 1.2 Read Excel files

In [20]:
def read_recursively(path: str) -> list:
    """Read the data for each one of the folder """
    paths = list(Path(path).iterdir())
    paths.sort()
    return {k: df for k, df in [process_folder(p) for p in paths]}


def process_folder(dir: Path) -> list:
    """Read the files from each folder"""
    path_name = dir / f"{dir.name}-steps.xlsx"
    if path_name.exists():
        df = pd.read_excel(path_name)
    else:
        df = None
    return dir.name, df

In [21]:
time_series = read_recursively("Diabetes")
time_series.keys()

dict_keys(['353', '364', '369', '371', '380', '458', '466', '470', '471', '473', '476', '477', '479', '480', '482', '483', '485', '486', '487', '488', '489', '490'])

In [22]:
df_353 = time_series['353']
df_353
df_353.columns

Index(['time', '2017-01-13', '2017-01-14', '2017-01-15', '2017-01-16',
       '2017-01-17', '2017-01-18', '2017-01-19', '2017-01-20'],
      dtype='object')

# 2 Features

* [x] Geslacht => new column called "Gender"
* [x] Leeftijd_poli1
* [ ] Freq_tot
* [ ] DMduur = polijaar_1 – Dmaanvang
* [ ] Pack_years
* [ ] Alcoholgebruik1EHpermaand
* [ ] Lengte_poli1
* [ ] Gewicht_poli1
* [ ] Buikomvang_1
* [ ] Heupomvang_1
* [ ] SBP_poli1
* [ ] DBP_poli1
* [ ] Pols
* [ ] MAP_poli1
* [ ] Microvas_total (INPUT OR OUTPUT)
* [ ] Macrovasculaire_ziekten (INPUT OR OUTPUT)
* [ ] SerumHbA1c_1 (INPUT OR OUTPUT)
* [ ] Serum_cholesterol_1
* [ ] Total_number_drugs
* [ ] Insulin_bin
* [ ] EH_insuline


## 3 Clean the dataframes

In [45]:
df_ml = pd.DataFrame()

### 3.1 Clean Gender column

In [46]:
df_ml["Gender"] = df_spss["Geslacht"].apply(lambda x: 0.0 if x == "man" else 1.0)
df_ml["Gender"]


0      0.0
1      0.0
2      0.0
3      1.0
4      1.0
      ... 
667    1.0
668    1.0
669    0.0
670    0.0
671    0.0
Name: Gender, Length: 672, dtype: category
Categories (2, float64): [0.0, 1.0]

### 3.2 Leeftijd_poli1

In [31]:
df_spss["Leeftijd_poli1"].isna().any()

False

In [None]:
df_ml = df_spss["Leeftijd_poli1"]

 ###  3.3 Freq_tot

In [44]:
df_spss["Freq_tot"].isna().any()

True

In [53]:
freq_tot = df_spss["Freq_tot"]
mean_value = freq_tot[~freq_tot.isna()].mean()
mean_value

5.325373134328358

In [56]:
df_ml["Freq_tot"] = df_spss["Freq_tot"]
df_ml[freq_tot.isna()] = mean_value