## Importerer pakker

In [59]:
import pandas as pd
import numpy as np

import warnings

from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session. This applies to all
# warning categories from all libraries, not only the ones seen during development.
# NOTE(review): this also hides pandas chained-assignment warnings raised by the
# cleaning cells below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## Leser inn data

In [60]:
# Show every column when a DataFrame is displayed in the notebook.
pd.set_option('display.max_columns', None)

# Full data set, one source file per table.
demographic_df = pd.read_csv('../raw_data/demographic.csv')
hospital_df = pd.read_csv('../raw_data/hospital.csv')
# The physiological table is tab-separated text.
physiological_df = pd.read_csv('../raw_data/physiological.txt', sep='\t')
severity_json = pd.read_json('../raw_data/severity.json')

# Sample data set with the same file layout as the full data set.
demographic_df_sample = pd.read_csv('../sample_data/demographic.csv')
hospital_df_sample = pd.read_csv('../sample_data/hospital.csv')
physiological_df_sample = pd.read_csv('../sample_data/physiological.txt', sep='\t')
severity_json_sample = pd.read_json('../sample_data/severity.json')


## Omgjør Severity_json til DataFrame

In [61]:
def _explode_severity(raw):
    """Expand the severity table to one row per exploded element.

    All columns from position 2 through the last column (including 'dnr_dag')
    are exploded together, the index is renumbered from zero, and the rows
    are ordered by 'pasient_id'.
    """
    columns_to_explode = list(raw.columns[2:])
    exploded = raw.explode(columns_to_explode).reset_index(drop=True)
    return exploded.sort_values(by=['pasient_id'], ignore_index=True)


severity_df = _explode_severity(severity_json)
severity_df_sample = _explode_severity(severity_json_sample)

severity_df_sample

Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,dnr_dag
0,ChE,Cancer,1,0,Lung Cancer,0,0.0,7.0,7.0,33.898438,20.0,0.262939,0.036995,0,0,metastatic,0.5,0.25,no dnr,5.0
1,ChE,Cancer,22,1,Lung Cancer,0,0.0,,,26.199219,35.0,0.535889,0.213989,0,0,metastatic,0.5,0.125,dnr ved innleggelse,11.0
2,ChE,Cancer,25,1,Colon Cancer,0,0.0,,,18.898438,10.0,0.853882,0.674927,0,0,metastatic,0.5,0.1,no dnr,4.0
3,BrY,COPD/CHF/Cirrhosis,26,1,COPD,1,0.0,5.0,4.0,14.199219,16.0,0.939941,0.897949,0,0,no,0.9,0.6,dnr ved innleggelse,4.0
4,A1s,ARF/MOSF,29,0,ARF/MOSF w/Sepsis,2,37.0,,,22.398438,21.0,0.823975,0.764893,0,0,no,,,no dnr,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360,A1s,ARF/MOSF,9049,0,ARF/MOSF w/Sepsis,0,0.0,0.0,0.0,16.898438,20.0,0.856934,0.807983,0,0,no,0.9,0.9,no dnr,9.0
1361,ChE,Cancer,9050,1,Colon Cancer,2,0.0,0.0,0.0,18.097656,13.0,0.809937,0.592896,0,0,metastatic,0.5,0.25,dnr ved innleggelse,1.0
1362,A1s,ARF/MOSF,9078,0,ARF/MOSF w/Sepsis,3,0.0,,2.0,17.796875,8.0,0.882935,0.841919,1,0,no,,,dnr ved innleggelse,55.0
1363,ChE,Cancer,9080,1,Lung Cancer,2,0.0,,,11.0,18.0,0.827881,0.625977,1,0,metastatic,0.6,0.3,no dnr,5.0


## Slår sammen DataFrames

In [62]:
# Full data: a left join keeps every demographic row even without a hospital record;
# the following two merges use the default (inner) join on pasient_id.
demographic_hospital_df = pd.merge(demographic_df, hospital_df, how='left', on='pasient_id')
demographic_hospital_phisiological_df = pd.merge(
    demographic_hospital_df, physiological_df, on='pasient_id'
)
pasient_info_df = pd.merge(demographic_hospital_phisiological_df, severity_df, on='pasient_id')

# Sample data: cast pasient_id in severity_df_sample to int64 before it is used as merge key.
severity_df_sample['pasient_id'] = severity_df_sample['pasient_id'].astype('int64')
demographic_hospital_df_sample = pd.merge(
    demographic_df_sample, hospital_df_sample, how='left', on='pasient_id'
)
demographic_hospital_phisiological_df_sample = pd.merge(
    demographic_hospital_df_sample, physiological_df_sample, on='pasient_id'
)
pasient_info_df_sample = pd.merge(
    demographic_hospital_phisiological_df_sample, severity_df_sample, on='pasient_id'
)
pasient_info_df_sample


Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,dnr_dag
0,1,62.84998,male,11.0,$11-$25k,other,0,97.0,6.000000,69.0,22.0,36.00000,388.00000,1.799805,0.199982,1.199951,141.0,7.459961,,,,ChE,Cancer,0,Lung Cancer,0,0.0,7.0,7.0,33.898438,20.0,0.262939,0.036995,0,0,metastatic,0.5,0.25,no dnr,5.0
1,22,48.70398,male,16.0,,other,0,66.0,12.500000,125.0,30.0,37.00000,170.00000,,,1.000000,133.0,7.519531,,,,ChE,Cancer,1,Lung Cancer,0,0.0,,,26.199219,35.0,0.535889,0.213989,0,0,metastatic,0.5,0.125,dnr ved innleggelse,11.0
2,25,29.36099,female,17.0,$25-$50k,white,0,96.0,10.599609,112.0,20.0,37.00000,,3.699707,1.399902,0.599976,137.0,,,,,ChE,Cancer,1,Colon Cancer,0,0.0,,,18.898438,10.0,0.853882,0.674927,0,0,metastatic,0.5,0.1,no dnr,4.0
3,26,53.84000,male,,under $11k,white,0,134.0,7.799805,106.0,22.0,37.89844,304.12500,3.799805,0.500000,0.799927,141.0,7.379883,,,,BrY,COPD/CHF/Cirrhosis,1,COPD,1,0.0,5.0,4.0,14.199219,16.0,0.939941,0.897949,0,0,no,0.9,0.6,dnr ved innleggelse,4.0
4,29,30.10799,male,,,asian,0,92.0,8.798828,112.0,27.0,37.59375,225.00000,,,0.399963,134.0,7.459961,,,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,2,37.0,,,22.398438,21.0,0.823975,0.764893,0,0,no,,,no dnr,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360,9049,77.69995,male,16.0,,white,0,77.0,6.199219,140.0,28.0,35.29688,338.06250,3.899902,0.500000,3.599609,137.0,7.339844,88.0,49.0,800.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,0,0.0,0.0,0.0,16.898438,20.0,0.856934,0.807983,0,0,no,0.9,0.9,no dnr,9.0
1361,9050,61.09497,male,12.0,,white,0,104.0,,118.0,18.0,36.09375,,,,0.799927,133.0,,128.0,18.0,3350.0,ChE,Cancer,1,Colon Cancer,2,0.0,0.0,0.0,18.097656,13.0,0.809937,0.592896,0,0,metastatic,0.5,0.25,dnr ved innleggelse,1.0
1362,9078,62.84998,male,,,white,0,65.0,19.296875,140.0,22.0,36.59375,195.00000,,,2.000000,132.0,7.449219,137.0,58.0,2570.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,3,0.0,,2.0,17.796875,8.0,0.882935,0.841919,1,0,no,,,dnr ved innleggelse,55.0
1363,9080,53.32498,male,12.0,$25-$50k,white,0,71.0,8.000000,135.0,20.0,36.39844,230.40625,,1.299805,1.099854,137.0,7.309570,118.0,19.0,1510.0,ChE,Cancer,1,Lung Cancer,2,0.0,,,11.0,18.0,0.827881,0.625977,1,0,metastatic,0.6,0.3,no dnr,5.0


## Dropper dupliserte rader

In [63]:
# Remove rows that are exact duplicates (all columns equal) from each frame.
pasient_info_df = pasient_info_df.drop_duplicates()
# Fix: de-duplicate the sample frame itself. The previous code assigned
# pasient_info_df.drop_duplicates() here, which replaced the sample data with
# a copy of the full data set.
# NOTE(review): the merged sample frame has no 'oppholdslengde' column, so any
# later step that touches that column on the sample must treat it as optional.
pasient_info_df_sample = pasient_info_df_sample.drop_duplicates()

Mangler verdier for utdanning, inntekt, etnisitet, hvite_blodlegemer, lungefunksjon, serumalbumin, bilirubin, kreatinin, blod_ph, glukose, blodurea_nitrogen, urinmengde

## Fjerner pasienter med oppholdslengde som ikke er positiv (≤ 0)

In [64]:
# Keep only rows with a strictly positive length of stay (zero and negative are removed).
# .copy() makes the result an independent frame, so the in-place edits in later
# cells (.loc assignments, drop(inplace=True)) do not operate on a slice of the
# unfiltered frame.
pasient_info_df = pasient_info_df[pasient_info_df['oppholdslengde'] > 0].copy()

## Runder av alder til heltall, og setter negativ alder til NaN

In [65]:
# Apply the same age clean-up to the full frame and the sample frame:
# negative ages become missing, the rest is rounded to whole years.
# The nullable 'Int64' dtype is used so the missing entries survive the integer cast.
for frame in (pasient_info_df, pasient_info_df_sample):
    negative_age = frame['alder'] < 0
    frame.loc[negative_age, 'alder'] = np.nan
    frame['alder'] = frame['alder'].round().astype('Int64')

pasient_info_df


Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,dnr_dag
0,2,60,female,12.0,$11-$25k,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,
1,3,53,female,12.0,under $11k,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,
2,4,42,female,11.0,under $11k,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,
4,6,93,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,
5,7,62,male,14.0,$25-$50k,white,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,0.0,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,male,8.0,,white,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,
7736,9102,55,female,11.0,,white,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,
7737,9103,70,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,
7738,9104,47,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,


## Setter ugyldige verdier til NaN

In [66]:
# Measurements for which a zero or negative reading is treated as missing.
non_positive_is_invalid = [
    'blodtrykk',
    'hjertefrekvens',
    'respirasjonsfrekvens',
    'urinmengde',
    'glukose',
]

# Same rule for the full frame and the sample frame.
for frame in (pasient_info_df, pasient_info_df_sample):
    for column in non_positive_is_invalid:
        frame.loc[frame[column] <= 0, column] = np.nan


# Fjerning av variabler

## Sykdomskategori_id er redundant, da den koder samme informasjon som sykdomskategori

In [67]:
# Drop the category id column from both frames (in place).
for frame in (pasient_info_df, pasient_info_df_sample):
    frame.drop(columns=['sykdomskategori_id'], inplace=True)

## Fjerner dødsfall, da det ikke er nødvendig å inkludere om en pasient har dødd etter sykehusoppholdet

In [68]:
# Drop the 'dødsfall' column from both frames (in place).
for frame in (pasient_info_df, pasient_info_df_sample):
    frame.drop(columns=['dødsfall'], inplace=True)

## Fjerner variabler som ikke er utfylt på dag 1

In [69]:
# Columns removed from both frames in this step.
not_filled_day_one = ['bilirubin', 'adl_pasient']

for frame in (pasient_info_df, pasient_info_df_sample):
    frame.drop(columns=not_filled_day_one, inplace=True)

## Fjerner dnr_dag, da dnr_status allerede forteller om en pasient fikk dnr-status før eller etter innleggelse

In [70]:
# Drop 'dnr_dag' from both frames. Previously only the full frame was handled,
# so the exported sample kept a column that the train/validation/test files lack.
pasient_info_df.drop(columns=['dnr_dag'], inplace=True)
pasient_info_df_sample.drop(columns=['dnr_dag'], inplace=True)

## Rydder opp ved å fjerne pasient_id og sette oppholdslengde som første kolonne

In [71]:
# Full frame: move the target column 'oppholdslengde' to position 0 and drop the identifier.
col_to_move = pasient_info_df.pop('oppholdslengde')
pasient_info_df.insert(0, 'oppholdslengde', col_to_move)
pasient_info_df.drop(columns=['pasient_id'], inplace=True)

# Sample frame: the sample data as merged from its source files has no
# 'oppholdslengde' column, so only move the column when it is present
# (an unconditional pop would raise KeyError). The identifier is always dropped.
if 'oppholdslengde' in pasient_info_df_sample.columns:
    col_to_move = pasient_info_df_sample.pop('oppholdslengde')
    pasient_info_df_sample.insert(0, 'oppholdslengde', col_to_move)
pasient_info_df_sample.drop(columns=['pasient_id'], inplace=True)

pasient_info_df

Unnamed: 0,oppholdslengde,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,4,60,female,12.0,$11-$25k,white,1,43.0,17.097656,112.0,34.0,34.59375,98.00000,,5.500000,132.0,7.250000,,,,COPD/CHF/Cirrhosis,Cirrhosis,2,44.0,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,17,53,female,12.0,under $11k,white,0,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.000000,134.0,7.459961,,,,COPD/CHF/Cirrhosis,Cirrhosis,2,0.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,3,42,female,11.0,under $11k,white,0,75.0,9.099609,88.0,32.0,35.00000,,,0.799927,139.0,,,,,Cancer,Lung Cancer,2,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
4,4,93,male,14.0,,white,1,110.0,10.398438,101.0,44.0,38.39844,266.62500,,0.699951,140.0,7.659180,,,,Coma,Coma,1,55.0,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
5,9,62,male,14.0,$25-$50k,white,0,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,1.599854,132.0,7.479492,,,,COPD/CHF/Cirrhosis,CHF,1,0.0,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,23,66,male,8.0,,white,0,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,1.099854,131.0,7.459961,188.0,21.0,,ARF/MOSF,ARF/MOSF w/Sepsis,1,0.0,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,29,55,female,11.0,,white,0,43.0,,,8.0,38.59375,218.50000,,5.899414,135.0,7.289062,190.0,49.0,,Coma,Coma,1,41.0,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,8,70,male,,,white,0,111.0,8.398438,83.0,24.0,36.69531,180.00000,,2.699707,139.0,7.379883,189.0,60.0,3900.0,ARF/MOSF,ARF/MOSF w/Sepsis,1,0.0,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,7,47,male,13.0,,white,1,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,3.500000,135.0,7.469727,246.0,55.0,,ARF/MOSF,MOSF w/Malig,1,0.0,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Sjekker data

In [72]:
# Inspect the dtype of every column; columns listed as 'object' are converted
# to numeric types in the cells that follow.
pasient_info_df.dtypes

oppholdslengde                     int64
alder                              Int64
kjønn                             object
utdanning                        float64
inntekt                           object
etnisitet                         object
sykehusdød                         int64
blodtrykk                        float64
hvite_blodlegemer                float64
hjertefrekvens                   float64
respirasjonsfrekvens             float64
kroppstemperatur                 float64
lungefunksjon                    float64
serumalbumin                     float64
kreatinin                        float64
natrium                          float64
blod_ph                          float64
glukose                          float64
blodurea_nitrogen                float64
urinmengde                       float64
sykdomskategori                   object
sykdom_underkategori              object
antall_komorbiditeter             object
koma_score                        object
adl_stedfortrede

## Omgjør variabler til sine respektive typer

In [73]:
# Names of all columns currently stored with the generic object dtype.
object_columns = pasient_info_df.select_dtypes(include='object')
columns_category = list(object_columns.columns)
columns_category

['kjønn',
 'inntekt',
 'etnisitet',
 'sykdomskategori',
 'sykdom_underkategori',
 'antall_komorbiditeter',
 'koma_score',
 'adl_stedfortreder',
 'fysiologisk_score',
 'apache_fysiologisk_score',
 'overlevelsesestimat_2mnd',
 'overlevelsesestimat_6mnd',
 'diabetes',
 'demens',
 'kreft',
 'lege_overlevelsesestimat_2mnd',
 'lege_overlevelsesestimat_6mnd',
 'dnr_status']

In [74]:
# Object-typed columns that are cast to integers.
to_int_columns = [
    'antall_komorbiditeter',
    'diabetes',
    'demens',
]

# Object-typed columns that are cast to floats.
to_float_columns = [
    'koma_score',
    'fysiologisk_score',
    'apache_fysiologisk_score',
    'overlevelsesestimat_2mnd',
    'overlevelsesestimat_6mnd',
    'lege_overlevelsesestimat_2mnd',
    'lege_overlevelsesestimat_6mnd',
    'adl_stedfortreder',
]

In [75]:
# Cast the listed columns to int / float in both the full frame and the sample frame.
for frame in (pasient_info_df, pasient_info_df_sample):
    frame[to_int_columns] = frame[to_int_columns].astype(int)
    frame[to_float_columns] = frame[to_float_columns].astype(float)

## Fyller inn manglende data med anbefalt imputeringsverdier

In [76]:
# Per-column replacement values for missing entries.
# Values taken from https://archive.ics.uci.edu/dataset/880/support2
# NOTE(review): 'bilirubin' is dropped from both frames in an earlier cell, so
# its entry here presumably has no effect - confirm before removing it.
fill_values = {
    'serumalbumin': 3.5,
    'lungefunksjon': 333.3,
    'bilirubin': 1.01,
    'kreatinin': 1.01,
    'blodurea_nitrogen': 6.51,
    'hvite_blodlegemer': 9,
    'urinmengde': 2502,
}

# Fill in place, column by column, using the mapping above; columns without an
# entry in the mapping keep their missing values.
for frame in (pasient_info_df, pasient_info_df_sample):
    frame.fillna(value=fill_values, inplace=True)

## Oversikt over manglende data

In [77]:
# Count the missing entries in every column of the full frame.
missing_columns = pasient_info_df.isna().sum()

# Keep only columns with at least one missing entry and present them as a
# one-column table labelled 'Missing Values'.
has_missing = missing_columns > 0
missing_columns_df = missing_columns[has_missing].to_frame(name='Missing Values')

missing_columns_df

Unnamed: 0,Missing Values
alder,5
utdanning,1380
inntekt,2516
etnisitet,37
blodtrykk,45
hjertefrekvens,73
respirasjonsfrekvens,55
blod_ph,1911
glukose,3818
adl_stedfortreder,2440


## Deler inn i trenings-, validerings- og testdata

In [78]:
# First split: 70 % training, 30 % held out.
df_train, df_valtest = train_test_split(
    pasient_info_df,
    test_size=0.3,
    random_state=0,
)
# Second split: the held-out part is halved into validation and test.
# A fixed random_state keeps both splits reproducible.
df_val, df_test = train_test_split(
    df_valtest,
    test_size=0.5,
    random_state=0,
)

## Lagrer de rensede datasettene (trening, validering, test og sample) som CSV-filer

In [79]:
# Write each cleaned frame to its own CSV file in the working directory,
# without the row index.
csv_outputs = {
    'train_df.csv': df_train,
    'validation_df.csv': df_val,
    'test_df.csv': df_test,
    'sample_test.csv': pasient_info_df_sample,
}
for filename, frame in csv_outputs.items():
    frame.to_csv(filename, index=False)