## Importerer pakker

In [466]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split

## Leser inn data

In [467]:
# Load the four raw source tables from ../raw_data (paths are relative to the
# notebook's working directory).
demographic_df = pd.read_csv('../raw_data/demographic.csv')
hospital_df = pd.read_csv('../raw_data/hospital.csv')
# read_table parses delimited text; its default separator is a tab.
physiological_df = pd.read_table('../raw_data/physiological.txt')
# Kept under its own name: this raw JSON frame is reshaped into severity_df later on.
severity_json = pd.read_json('../raw_data/severity.json')


## Deler demographic_df i aldersgrupper

In [468]:
# Negative ages are treated as missing: they are replaced with NaN.
demographic_df['alder'] = demographic_df['alder'].mask(demographic_df['alder'] < 0)

# Left-closed age intervals: [-10, 18), [18, 40), [40, 60), [60, 80), [80, 120).
# Because negative ages were blanked above, the first interval can only catch
# ages 0-17, which receive the label 'ugyldige data'. Missing ages and ages of
# 120 or more fall outside every interval and get no group (NaN).
bins = [-10, 18, 40, 60, 80, 120]
labels = ['ugyldige data', '18-39','40-59','60-79','80+']
demographic_df['aldersgrupper'] = pd.cut(
    x=demographic_df['alder'],
    bins=bins,
    labels=labels,
    right=False,
)

## Fjerner duplikat pasient_ider

In [469]:
# Some patients occur more than once in the demographic table (pasient_id 4 and
# 59 were identified by manual inspection as repeated, identical records).
# Only the first occurrence of every pasient_id is kept.
#
# Deduplicating on the key instead of dropping hard-coded row labels means the
# cell does not depend on the row order of the CSV and can be re-run safely
# (dropping labels that are already gone raises a KeyError).
# To inspect the repeated rows:
#   demographic_df[demographic_df['pasient_id'].duplicated(keep=False)]
demographic_df = demographic_df.drop_duplicates(subset='pasient_id', keep='first')
demographic_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,aldersgrupper
0,2,60.33899,female,12.0,$11-$25k,white,60-79
1,3,52.74698,female,12.0,under $11k,white,40-59
2,4,42.38498,female,11.0,under $11k,white,40-59
3,5,79.88495,female,,,white,60-79
4,6,93.01599,male,14.0,,white,80+
...,...,...,...,...,...,...,...
7735,9101,66.07300,male,8.0,,white,60-79
7736,9102,55.15399,female,11.0,,white,40-59
7737,9103,70.38196,male,,,white,60-79
7738,9104,47.01999,male,13.0,,white,40-59


## Deler demographic_df i utdanningsnivåer

In [470]:
# Bucket 'utdanning' (presumably years of schooling - confirm against the data
# description) into three ordered levels using left-closed intervals:
#   level 1 = [0, 10), level 2 = [10, 13), level 3 = [13, 60).
# Missing values and values outside [0, 60) get no level (NaN).
bins = [0, 10 , 13, 60]
labels = [1, 2, 3]
demographic_df['utdanningsnivå'] = pd.cut(
    x=demographic_df['utdanning'],
    bins=bins,
    labels=labels,
    right=False,
)

## Mapper inntekt til en ordinal skala (0–3)

In [471]:
# Ordinal code per income bracket, from the lowest bracket (0) to the highest (3).
income_mapping = {
    bracket: rank
    for rank, bracket in enumerate(['under $11k', '$11-$25k', '$25-$50k', '>$50k'])
}

# Add the numeric scale and remove the original text column in one step.
# Values of 'inntekt' that are missing or not in the mapping become NaN.
demographic_df = (
    demographic_df
    .assign(inntektsskala=demographic_df['inntekt'].map(income_mapping))
    .drop(columns=['inntekt'])
)
demographic_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,etnisitet,aldersgrupper,utdanningsnivå,inntektsskala
0,2,60.33899,female,12.0,white,60-79,2,1.0
1,3,52.74698,female,12.0,white,40-59,2,0.0
2,4,42.38498,female,11.0,white,40-59,2,0.0
3,5,79.88495,female,,white,60-79,,
4,6,93.01599,male,14.0,white,80+,3,
...,...,...,...,...,...,...,...,...
7735,9101,66.07300,male,8.0,white,60-79,1,
7736,9102,55.15399,female,11.0,white,40-59,2,
7737,9103,70.38196,male,,white,60-79,,
7738,9104,47.01999,male,13.0,white,40-59,3,


## Omgjør Severity_json til DataFrame

In [472]:
# Flatten the raw JSON frame into one row per list element:
#   1. discard the last column,
#   2. explode every column from position 2 onward (the first two columns are
#      repeated for each resulting row),
#   3. renumber the rows and order them by pasient_id.
severity_df = (
    severity_json
    .iloc[:, :-1]
    .explode(list(severity_json.columns[2:-1]))
    .reset_index(drop=True)
    .sort_values(by=['pasient_id'], ignore_index=True)
)
severity_df


Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,BrY,COPD/CHF/Cirrhosis,2,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,BrY,COPD/CHF/Cirrhosis,3,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,ChE,Cancer,4,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
3,A1s,ARF/MOSF,5,0,ARF/MOSF w/Sepsis,1,26.0,,2.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,
4,DWw,Coma,6,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,A1s,ARF/MOSF,9101,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,DWw,Coma,9102,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,A1s,ARF/MOSF,9103,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,A1s,ARF/MOSF,9104,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Slår sammen adl

In [473]:
# Combine the two ADL scores (patient-reported and surrogate-reported) into one
# column: use the row-wise mean of the two, and where that mean is missing fall
# back to the patient score, then to the surrogate score.
adl_columns = ['adl_pasient', 'adl_stedfortreder']
adl_mean = severity_df[adl_columns].mean(axis=1)
adl_fallback = severity_df['adl_pasient'].combine_first(severity_df['adl_stedfortreder'])
severity_df['adl'] = adl_mean.combine_first(adl_fallback)

# The two source columns are no longer needed once 'adl' exists.
severity_df = severity_df.drop(columns=adl_columns)
severity_df

Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,adl
0,BrY,COPD/CHF/Cirrhosis,2,1,Cirrhosis,2,44.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,1.0
1,BrY,COPD/CHF/Cirrhosis,3,1,Cirrhosis,2,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,0.5
2,ChE,Cancer,4,1,Lung Cancer,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,0.0
3,A1s,ARF/MOSF,5,0,ARF/MOSF w/Sepsis,1,26.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,,2.0
4,DWw,Coma,6,1,Coma,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,A1s,ARF/MOSF,9101,0,ARF/MOSF w/Sepsis,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,0.0
7736,DWw,Coma,9102,0,Coma,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,0.0
7737,A1s,ARF/MOSF,9103,0,ARF/MOSF w/Sepsis,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,
7738,A1s,ARF/MOSF,9104,1,MOSF w/Malig,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,0.0


## Fjerner pasienter med negativ oppholdslengde i Hospital_df

In [474]:
# Rows whose length of stay is negative, and how often each negative value occurs.
# Both are kept for inspection; neither is used by the filtering below.
negative_oppholdslengde = hospital_df[hospital_df['oppholdslengde'] < 0]
negative_oppholdslengde_count = negative_oppholdslengde['oppholdslengde'].value_counts()

# Patients excluded from the analysis.
# NOTE(review): presumably these are exactly the pasient_id values found in
# negative_oppholdslengde - verify that the hard-coded list still matches the data.
pasient_ids_to_remove = [5, 23, 102, 256, 508, 8000]
index_to_drop = hospital_df.index[hospital_df['pasient_id'].isin(pasient_ids_to_remove)]
hospital_df = hospital_df.drop(index=index_to_drop)
hospital_df

Unnamed: 0,pasient_id,sykehusdød,oppholdslengde
0,2,1,4
1,3,0,17
2,4,0,3
4,6,1,4
5,7,0,9
...,...,...,...
7735,9101,0,23
7736,9102,0,29
7737,9103,0,8
7738,9104,1,7


## Fjerner de ugyldige pasient_idene (negativ oppholdslengde) fra de øvrige datasettene

In [475]:
# Remove the excluded patients from the demographic table as well, so that it
# stays aligned with hospital_df.
pasient_ids_to_remove = [5, 23, 102, 256, 508, 8000]
index_to_drop = demographic_df.index[demographic_df['pasient_id'].isin(pasient_ids_to_remove)]
demographic_df = demographic_df.drop(index=index_to_drop)
demographic_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,etnisitet,aldersgrupper,utdanningsnivå,inntektsskala
0,2,60.33899,female,12.0,white,60-79,2,1.0
1,3,52.74698,female,12.0,white,40-59,2,0.0
2,4,42.38498,female,11.0,white,40-59,2,0.0
4,6,93.01599,male,14.0,white,80+,3,
5,7,62.37097,male,14.0,white,60-79,3,2.0
...,...,...,...,...,...,...,...,...
7735,9101,66.07300,male,8.0,white,60-79,1,
7736,9102,55.15399,female,11.0,white,40-59,2,
7737,9103,70.38196,male,,white,60-79,,
7738,9104,47.01999,male,13.0,white,40-59,3,


In [476]:
# Remove the excluded patients from the physiological table.
pasient_ids_to_remove = [5, 23, 102, 256, 508, 8000]
index_to_drop = physiological_df.index[physiological_df['pasient_id'].isin(pasient_ids_to_remove)]
physiological_df = physiological_df.drop(index=index_to_drop)
physiological_df

Unnamed: 0,pasient_id,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde
0,2,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,
1,3,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,
2,4,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,
4,6,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,
5,7,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,
7736,9102,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0
7737,9103,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0
7738,9104,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,


In [477]:
# Remove the excluded patients from the severity table.
pasient_ids_to_remove = [5, 23, 102, 256, 508, 8000]
index_to_drop = severity_df.index[severity_df['pasient_id'].isin(pasient_ids_to_remove)]
severity_df = severity_df.drop(index=index_to_drop)
severity_df

Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,adl
0,BrY,COPD/CHF/Cirrhosis,2,1,Cirrhosis,2,44.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,1.0
1,BrY,COPD/CHF/Cirrhosis,3,1,Cirrhosis,2,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,0.5
2,ChE,Cancer,4,1,Lung Cancer,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,0.0
4,DWw,Coma,6,1,Coma,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,1.0
5,BrY,COPD/CHF/Cirrhosis,7,1,CHF,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,A1s,ARF/MOSF,9101,0,ARF/MOSF w/Sepsis,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,0.0
7736,DWw,Coma,9102,0,Coma,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,0.0
7737,A1s,ARF/MOSF,9103,0,ARF/MOSF w/Sepsis,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,
7738,A1s,ARF/MOSF,9104,1,MOSF w/Malig,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,0.0


## Forbereder Severity_df

In [478]:
# Turn the categorical 'kreft' column into two indicator columns:
#   kreft_metastatic - set for metastatic cancer only,
#   kreft            - set for any cancer ('yes' or 'metastatic').
# The 'kreft_no' indicator is dropped because it is implied by the other two.
severity_df = (
    pd.get_dummies(severity_df, columns=['kreft'])
    .drop(columns=['kreft_no'])
    .assign(kreft_yes=lambda frame: frame[['kreft_metastatic', 'kreft_yes']].max(axis=1))
    .rename(columns={'kreft_yes': 'kreft'})
)
severity_df


Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,adl,kreft_metastatic,kreft
0,BrY,COPD/CHF/Cirrhosis,2,1,Cirrhosis,2,44.0,52.695312,74.0,0.001,0.0,0,0,0.0,0.0,,1.0,False,False
1,BrY,COPD/CHF/Cirrhosis,3,1,Cirrhosis,2,0.0,20.5,45.0,0.790894,0.664917,0,0,0.75,0.5,,0.5,False,False
2,ChE,Cancer,4,1,Lung Cancer,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,0.9,0.5,,0.0,True,True
4,DWw,Coma,6,1,Coma,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,0.0,0.0,,1.0,False,False
5,BrY,COPD/CHF/Cirrhosis,7,1,CHF,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,,0.7,,0.5,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,A1s,ARF/MOSF,9101,0,ARF/MOSF w/Sepsis,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,0.8,0.512,,0.0,False,False
7736,DWw,Coma,9102,0,Coma,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,0.5,0.5,,0.0,False,False
7737,A1s,ARF/MOSF,9103,0,ARF/MOSF w/Sepsis,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,0.9,0.8,,,False,False
7738,A1s,ARF/MOSF,9104,1,MOSF w/Malig,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,0.09,0.09,,0.0,False,True


In [479]:
# One indicator column per 'dnr_status' value, renamed to identifiers without
# spaces. Rows with a missing dnr_status get False/0 in every indicator.
severity_df = pd.get_dummies(severity_df, columns=['dnr_status']).rename(
    columns={
        'dnr_status_dnr før innleggelse': 'dnr_før_innleggelse',
        'dnr_status_dnr ved innleggelse': 'dnr_ved_innleggelse',
    }
)
severity_df

Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,adl,kreft_metastatic,kreft,dnr_før_innleggelse,dnr_ved_innleggelse
0,BrY,COPD/CHF/Cirrhosis,2,1,Cirrhosis,2,44.0,52.695312,74.0,0.001,0.0,0,0,0.0,0.0,1.0,False,False,False,False
1,BrY,COPD/CHF/Cirrhosis,3,1,Cirrhosis,2,0.0,20.5,45.0,0.790894,0.664917,0,0,0.75,0.5,0.5,False,False,False,False
2,ChE,Cancer,4,1,Lung Cancer,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,0.9,0.5,0.0,True,True,False,False
4,DWw,Coma,6,1,Coma,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,0.0,0.0,1.0,False,False,False,False
5,BrY,COPD/CHF/Cirrhosis,7,1,CHF,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,,0.7,0.5,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,A1s,ARF/MOSF,9101,0,ARF/MOSF w/Sepsis,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,0.8,0.512,0.0,False,False,False,False
7736,DWw,Coma,9102,0,Coma,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,0.5,0.5,0.0,False,False,False,False
7737,A1s,ARF/MOSF,9103,0,ARF/MOSF w/Sepsis,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,0.9,0.8,,False,False,False,False
7738,A1s,ARF/MOSF,9104,1,MOSF w/Malig,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,0.09,0.09,0.0,False,True,False,False


## Sammenligner pasient_id i datasettene

In [480]:
# Extract the pasient_id column of every table by name. Positional lookups are
# fragile: in severity_df, pasient_id is the third column, so position 3 would
# select 'dødsfall' instead.
demographic_ids = demographic_df['pasient_id']
physiological_ids = physiological_df['pasient_id']
hospital_ids = hospital_df['pasient_id']
severity_ids = severity_df['pasient_id']

# Set differences in both directions, using the demographic table as reference.
# If every table has the same ids as the reference, all tables agree.
ids_in_demo_not_in_physio = set(demographic_ids) - set(physiological_ids)
ids_in_physio_not_in_demo = set(physiological_ids) - set(demographic_ids)
ids_in_hospital_not_in_demo = set(hospital_ids) - set(demographic_ids)
ids_in_demo_not_in_hospital = set(demographic_ids) - set(hospital_ids)
ids_in_severity_not_in_demo = set(severity_ids) - set(demographic_ids)
ids_in_demo_not_in_severity = set(demographic_ids) - set(severity_ids)

id_differences = {
    'in demographic, not in physiological': ids_in_demo_not_in_physio,
    'in physiological, not in demographic': ids_in_physio_not_in_demo,
    'in hospital, not in demographic': ids_in_hospital_not_in_demo,
    'in demographic, not in hospital': ids_in_demo_not_in_hospital,
    'in severity, not in demographic': ids_in_severity_not_in_demo,
    'in demographic, not in severity': ids_in_demo_not_in_severity,
}

# Report the result; on a mismatch, also list the offending ids per comparison.
if not any(id_differences.values()):
    print("No mismatched IDs, all DataFrames have the same pasient_id values.")
else:
    print("There are mismatched IDs.")
    for comparison, mismatched_ids in id_differences.items():
        if mismatched_ids:
            print(f"{comparison}: {sorted(mismatched_ids)}")

No mismatched IDs, all DataFrames have the same pasient_id values.


## Samler tabeller

In [481]:
# Join the four tables on pasient_id, one table at a time.
# The first join keeps every demographic row (left join); the following two use
# the default inner join, which keeps only ids present on both sides.
demographic_hospital_df = pd.merge(
    demographic_df, hospital_df, on='pasient_id', how='left'
)
demographic_hospital_phisiological_df = pd.merge(
    demographic_hospital_df, physiological_df, on='pasient_id'
)
pasient_info_df = pd.merge(
    demographic_hospital_phisiological_df, severity_df, on='pasient_id'
)
pasient_info_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,etnisitet,aldersgrupper,utdanningsnivå,inntektsskala,sykehusdød,oppholdslengde,...,overlevelsesestimat_6mnd,diabetes,demens,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,adl,kreft_metastatic,kreft,dnr_før_innleggelse,dnr_ved_innleggelse
0,2,60.33899,female,12.0,white,60-79,2,1.0,1,4,...,0.0,0,0,0.0,0.0,1.0,False,False,False,False
1,3,52.74698,female,12.0,white,40-59,2,0.0,0,17,...,0.664917,0,0,0.75,0.5,0.5,False,False,False,False
2,4,42.38498,female,11.0,white,40-59,2,0.0,0,3,...,0.411987,0,0,0.9,0.5,0.0,True,True,False,False
3,6,93.01599,male,14.0,white,80+,3,,1,4,...,0.214996,0,0,0.0,0.0,1.0,False,False,False,False
4,7,62.37097,male,14.0,white,60-79,3,2.0,0,9,...,0.820923,0,0,,0.7,0.5,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7729,9101,66.07300,male,8.0,white,60-79,1,,0,23,...,0.80188,0,0,0.8,0.512,0.0,False,False,False,False
7730,9102,55.15399,female,11.0,white,40-59,2,,0,29,...,0.485962,0,0,0.5,0.5,0.0,False,False,False,False
7731,9103,70.38196,male,,white,60-79,,,0,8,...,0.660889,0,0,0.9,0.8,,False,False,False,False
7732,9104,47.01999,male,13.0,white,40-59,3,,1,7,...,0.091995,0,0,0.09,0.09,0.0,False,True,False,False


## Deler inn i trenings-, validerings- og testdata

In [482]:
# 70 % of the patients go to training; the remaining 30 % is split in half into
# validation and test data. A fixed random_state makes both splits reproducible.
train_df, rest_df = train_test_split(
    pasient_info_df,
    random_state=42,
    test_size=0.3,
)
val_df, test_df = train_test_split(
    rest_df,
    random_state=42,
    test_size=0.50,
)
train_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,etnisitet,aldersgrupper,utdanningsnivå,inntektsskala,sykehusdød,oppholdslengde,...,overlevelsesestimat_6mnd,diabetes,demens,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,adl,kreft_metastatic,kreft,dnr_før_innleggelse,dnr_ved_innleggelse
6964,8203,74.12500,male,12.0,white,60-79,2,0.0,0,5,...,0.580933,0,0,0.9,0.9,0.0,False,False,False,False
5117,6018,65.26495,male,12.0,white,60-79,2,1.0,1,5,...,0.004999,0,0,0.1,0.001,0.0,False,False,False,False
7258,8556,71.98395,male,12.0,white,60-79,2,,0,45,...,0.661987,1,0,0.9,0.9,0.0,False,True,False,False
3680,4320,52.26599,male,20.0,asian,40-59,3,3.0,0,19,...,0.106995,0,0,,,0.0,True,True,False,False
7571,8922,69.77100,male,12.0,white,60-79,2,3.0,0,5,...,0.700928,0,0,,,0.0,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,6146,58.71597,male,9.0,white,40-59,1,0.0,0,7,...,0.889893,0,0,0.8,0.7,0.0,False,False,False,False
5390,6341,81.90594,female,,white,80+,,0.0,0,33,...,0.756958,0,0,0.9,0.9,5.0,False,True,False,False
860,999,60.37000,female,12.0,white,60-79,2,3.0,0,6,...,0.691895,0,0,0.8,0.5,0.0,True,True,False,False
7603,8959,80.05499,male,8.0,white,80+,1,1.0,0,4,...,0.416992,0,0,0.95,0.8,1.0,False,True,True,False


## Statistikk

In [483]:
# Draw one histogram per column of the training data, skipping the id column
# and the two derived grouping columns.
columns_to_exclude = ['pasient_id', 'aldersgrupper', 'utdanningsnivå']
for col in train_df.columns:
    if col in columns_to_exclude:
        continue
    fig = px.histogram(train_df, x=col)
    fig.show()

In [45]:
# Mean length of stay over the merged demographic/hospital/physiological table,
# i.e. over all remaining patients rather than only the training split.
Gjennomsnittlig_oppholdslengde = (
    demographic_hospital_phisiological_df.loc[:, 'oppholdslengde'].mean()
)
Gjennomsnittlig_oppholdslengde

17.819239720713732

In [46]:
# Scatter plot of length of stay (x-axis) against age (y-axis) for the merged
# demographic/hospital/physiological table.
fig = px.scatter(
    demographic_hospital_phisiological_df,
    x='oppholdslengde',
    y='alder',
)
fig.update_layout(
    template="simple_white",
    font={'size': 18},
    showlegend=False,
)
# Fixed axis windows: points outside these ranges are not visible in the figure.
fig.update_xaxes(range=[0, 200])
fig.update_yaxes(range=[18, 110])
fig.show()

## Predikerer manglende data