## Importerer pakker

In [495]:
import pandas as pd
import numpy as np

import warnings

from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

## Leser inn data

In [496]:
demographic_df = pd.read_csv('../raw_data/demographic.csv')
hospital_df = pd.read_csv('../raw_data/hospital.csv')
physiological_df = pd.read_table('../raw_data/physiological.txt')
severity_json = pd.read_json('../raw_data/severity.json')
pd.set_option('display.max_columns', None)


## Omgjør Severity_json til DataFrame

In [497]:
severity_df = severity_json.iloc[:,0:-1].explode(list(severity_json.columns[2:-1]))
severity_df.reset_index(drop=True, inplace=True)
severity_df = severity_df.sort_values(by=['pasient_id'], ignore_index=True)
severity_df


Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,BrY,COPD/CHF/Cirrhosis,2,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,BrY,COPD/CHF/Cirrhosis,3,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,ChE,Cancer,4,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
3,A1s,ARF/MOSF,5,0,ARF/MOSF w/Sepsis,1,26.0,,2.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,
4,DWw,Coma,6,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,A1s,ARF/MOSF,9101,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,DWw,Coma,9102,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,A1s,ARF/MOSF,9103,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,A1s,ARF/MOSF,9104,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Slår sammen DataFrames

In [498]:
demographic_hospital_df = demographic_df.merge(hospital_df, on='pasient_id', how='left')
demographic_hospital_phisiological_df = demographic_hospital_df.merge(physiological_df, on='pasient_id')
pasient_info_df = demographic_hospital_phisiological_df.merge(severity_df, on='pasient_id')
pasient_info_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,2,60.33899,female,12.0,$11-$25k,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,3,52.74698,female,12.0,under $11k,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,4,42.38498,female,11.0,under $11k,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
3,5,79.88495,female,,,white,0,-99,59.0,13.500000,112.0,20.0,37.89844,173.31250,,,0.799927,143.0,7.509766,,,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,26.0,,2.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,
4,6,93.01599,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7737,9103,70.38196,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,9104,47.01999,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,
7739,9105,81.53894,female,8.0,$11-$25k,white,0,12,75.0,8.599609,69.0,24.0,36.19531,230.40625,4.500000,0.599976,1.199951,137.0,7.289062,187.0,15.0,,A1s,ARF/MOSF,1,ARF/MOSF w/Sepsis,1,0.0,0.0,,18.097656,7.0,0.832886,0.776978,1,0,no,,,
7740,59,72.55896,female,,,white,1,4,123.0,13.298828,98.0,32.0,37.19531,144.87500,,,0.500000,136.0,7.339844,,,,BrY,COPD/CHF/Cirrhosis,1,COPD,3,0.0,,,26.0,25.0,0.60791,0.419983,0,0,metastatic,,,dnr ved innleggelse


## Dropper duplikat pasient_ider

In [499]:
pasient_info_df = pasient_info_df.drop_duplicates()
pasient_info_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,2,60.33899,female,12.0,$11-$25k,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,3,52.74698,female,12.0,under $11k,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,4,42.38498,female,11.0,under $11k,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
3,5,79.88495,female,,,white,0,-99,59.0,13.500000,112.0,20.0,37.89844,173.31250,,,0.799927,143.0,7.509766,,,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,26.0,,2.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,
4,6,93.01599,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66.07300,male,8.0,,white,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,9102,55.15399,female,11.0,,white,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,9103,70.38196,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,9104,47.01999,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


Mangler verdier for utdanning, inntekt, etnisitet, hvite_bloglegemer, lungefunksjon, serumalbumin, bilirubin, kreatinin, blod_ph, glukose, blodurea_nitrogen, urinmengde

## Fjerner pasienter med negativ oppholdslengde

In [500]:
negative_oppholdslengde = pasient_info_df[pasient_info_df['oppholdslengde'] < 0]
pasient_info_df[pasient_info_df['oppholdslengde']==-99]

# Fjerner pasient id 5, 23, 102, 256, 508, 8000 for å rense data
pasient_ids_to_remove = [5, 23, 102, 256, 508, 8000]
index_to_drop = hospital_df[hospital_df['pasient_id'].isin(pasient_ids_to_remove)].index
pasient_info_df = pasient_info_df.drop(index_to_drop)
pasient_info_df_no_dummies = pasient_info_df

## Runder av alder til heltall, og setter negativ alder til NaN

In [501]:

pasient_info_df.loc[pasient_info_df['alder'] < 0, 'alder'] = np.nan
pasient_info_df['alder'] = pasient_info_df['alder'].round().astype('Int64')
pasient_info_df


Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,2,60,female,12.0,$11-$25k,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,3,53,female,12.0,under $11k,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,4,42,female,11.0,under $11k,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
4,6,93,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
5,7,62,male,14.0,$25-$50k,white,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,0.0,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,male,8.0,,white,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,9102,55,female,11.0,,white,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,9103,70,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,9104,47,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Mapper Inntekt til dummyvariabler

In [502]:
income_mapping ={
    'under $11k': 8,
    '$11-$25k' : 20,
    '$25-$50k':35,
    '>$50k ' : 60
}
pasient_info_df['inntekt'] = pasient_info_df['inntekt'].map(income_mapping)
pasient_info_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,2,60,female,12.0,20.0,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,3,53,female,12.0,8.0,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,4,42,female,11.0,8.0,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
4,6,93,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
5,7,62,male,14.0,35.0,white,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,0.0,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,male,8.0,,white,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,9102,55,female,11.0,,white,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,9103,70,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,9104,47,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Slår sammen adl

In [503]:
# Attempt to use 'adl_pasient' first
pasient_info_df['adl'] = pasient_info_df['adl_pasient']

# Where 'adl_pasient' is missing, use 'adl_stedfortreder'
pasient_info_df['adl'].fillna(pasient_info_df['adl_stedfortreder'], inplace=True)

# Drop the original columns
pasient_info_df.drop(columns=['adl_pasient', 'adl_stedfortreder'], inplace=True)
pasient_info_df


Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,adl
0,2,60,female,12.0,20.0,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,1.0
1,3,53,female,12.0,8.0,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,1.0
2,4,42,female,11.0,8.0,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,0.0
4,6,93,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,1.0
5,7,62,male,14.0,35.0,white,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,male,8.0,,white,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,0.0
7736,9102,55,female,11.0,,white,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,0.0
7737,9103,70,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,
7738,9104,47,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,0.0


## Rydder opp ved å fjerne pasient_id og sette oppholdslengde som første rad

In [504]:
col_to_move = pasient_info_df.pop('oppholdslengde')
pasient_info_df.insert(0, 'oppholdslengde', col_to_move)
pasient_info_df.drop(columns=['pasient_id'], inplace=True)
pasient_info_df

Unnamed: 0,oppholdslengde,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,adl
0,4,60,female,12.0,20.0,white,1,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,1.0
1,17,53,female,12.0,8.0,white,0,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,1.0
2,3,42,female,11.0,8.0,white,0,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,0.0
4,4,93,male,14.0,,white,1,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,1.0
5,9,62,male,14.0,35.0,white,0,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,23,66,male,8.0,,white,0,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,0.0
7736,29,55,female,11.0,,white,0,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,0.0
7737,8,70,male,,,white,0,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,
7738,7,47,male,13.0,,white,1,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,0.0


## Sjekker data

In [505]:
pasient_info_df.dtypes

oppholdslengde                     int64
alder                              Int64
kjønn                             object
utdanning                        float64
inntekt                          float64
etnisitet                         object
sykehusdød                         int64
blodtrykk                        float64
hvite_blodlegemer                float64
hjertefrekvens                   float64
respirasjonsfrekvens             float64
kroppstemperatur                 float64
lungefunksjon                    float64
serumalbumin                     float64
bilirubin                        float64
kreatinin                        float64
natrium                          float64
blod_ph                          float64
glukose                          float64
blodurea_nitrogen                float64
urinmengde                       float64
sykdomskategori_id                object
sykdomskategori                   object
dødsfall                          object
sykdom_underkate

In [506]:
columns_category = pasient_info_df.select_dtypes(include=['object']).columns.tolist()
columns_category

['kjønn',
 'etnisitet',
 'sykdomskategori_id',
 'sykdomskategori',
 'dødsfall',
 'sykdom_underkategori',
 'antall_komorbiditeter',
 'koma_score',
 'fysiologisk_score',
 'apache_fysiologisk_score',
 'overlevelsesestimat_2mnd',
 'overlevelsesestimat_6mnd',
 'diabetes',
 'demens',
 'kreft',
 'lege_overlevelsesestimat_2mnd',
 'lege_overlevelsesestimat_6mnd',
 'dnr_status']

In [507]:
to_int_columns = ['dødsfall', 'antall_komorbiditeter', 'diabetes', 'demens']
to_float_columns = ['koma_score', 'fysiologisk_score', 'apache_fysiologisk_score', 'overlevelsesestimat_2mnd', 'overlevelsesestimat_6mnd', 'lege_overlevelsesestimat_2mnd', 'lege_overlevelsesestimat_6mnd']

In [508]:
pasient_info_df[to_int_columns] = pasient_info_df[to_int_columns].astype(dtype=int)
pasient_info_df[to_float_columns] = pasient_info_df[to_float_columns].astype(dtype=float)

In [509]:
pasient_info_df.dtypes

oppholdslengde                     int64
alder                              Int64
kjønn                             object
utdanning                        float64
inntekt                          float64
etnisitet                         object
sykehusdød                         int64
blodtrykk                        float64
hvite_blodlegemer                float64
hjertefrekvens                   float64
respirasjonsfrekvens             float64
kroppstemperatur                 float64
lungefunksjon                    float64
serumalbumin                     float64
bilirubin                        float64
kreatinin                        float64
natrium                          float64
blod_ph                          float64
glukose                          float64
blodurea_nitrogen                float64
urinmengde                       float64
sykdomskategori_id                object
sykdomskategori                   object
dødsfall                           int64
sykdom_underkate

## Fyller inn manglende data med anbefalt imputeringsverdier

In [510]:
# Verdier hentet fra https://archive.ics.uci.edu/dataset/880/support2
fill_values = {
    'serumalbumin': 3.5,
    'lungefunksjon': 333.3,
    'bilirubin': 1.01,
    'kreatinin': 1.01,
    'blodurea_nitrogen': 6.51,
    'hvite_blodlegemer': 9,
    'urinmengde': 2502
}

# Fill missing values in the DataFrame using the dictionary-based approach
pasient_info_df.fillna(fill_values, inplace=True)

## Oversikt over manglende data

In [511]:
# Get the count of missing values in each column
missing_columns = pasient_info_df.isna().sum()

# Create a DataFrame with columns that have missing values greater than 0
missing_columns_df = pd.DataFrame(missing_columns[missing_columns > 0], columns=['Missing Values'])

# Display the DataFrame with missing values greater than 0
missing_columns_df

Unnamed: 0,Missing Values
alder,5
utdanning,1380
inntekt,3099
etnisitet,37
blod_ph,1911
glukose,3817
lege_overlevelsesestimat_2mnd,1419
lege_overlevelsesestimat_6mnd,1405
dnr_status,6664
adl,1758


## Deler inn i trenings-, validerings- og testdata

In [512]:
df_train, df_valtest = train_test_split(pasient_info_df, test_size=0.3, random_state=0)
df_val, df_test = train_test_split(df_valtest, test_size=0.5, random_state=0)

## Laster opp renset data deling

In [513]:
df_train.to_csv('train_df.csv', index=False)
df_val.to_csv('validation_df.csv', index=False)
df_test.to_csv('test_df.csv', index=False)