## Importerer pakker

In [104]:
import pandas as pd
import numpy as np

import warnings

from sklearn.impute import KNNImputer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
import plotly.express as px

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation/FutureWarnings
# that later cells may raise — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

## Leser inn data

In [105]:
# Show every column when a DataFrame is rendered (the merged frame is wide).
pd.set_option('display.max_columns', None)

# Load the four raw sources; each one is later joined on 'pasient_id'.
demographic_df = pd.read_csv('../raw_data/demographic.csv')
hospital_df = pd.read_csv('../raw_data/hospital.csv')
# The physiological file is tab-separated text.
physiological_df = pd.read_csv('../raw_data/physiological.txt', sep='\t')
severity_json = pd.read_json('../raw_data/severity.json')


## Omgjør Severity_json til DataFrame

In [106]:
# Flatten the severity JSON into one row per patient:
#   1. ignore the last column,
#   2. explode every column from the third one onwards (pandas expands each
#      list-like cell into separate rows),
#   3. renumber the rows and order them by patient id.
severity_df = (
    severity_json
    .iloc[:, :-1]
    .explode(severity_json.columns[2:-1].tolist())
    .reset_index(drop=True)
    .sort_values(by='pasient_id', ignore_index=True)
)
severity_df


Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,BrY,COPD/CHF/Cirrhosis,2,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,BrY,COPD/CHF/Cirrhosis,3,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,ChE,Cancer,4,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
3,A1s,ARF/MOSF,5,0,ARF/MOSF w/Sepsis,1,26.0,,2.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,
4,DWw,Coma,6,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,A1s,ARF/MOSF,9101,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,DWw,Coma,9102,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,A1s,ARF/MOSF,9103,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,A1s,ARF/MOSF,9104,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Slår sammen DataFrames

In [107]:
# Join the four sources step by step on 'pasient_id'.
# The first join is a left join (keeps every demographic row); the following
# two use the pandas default (inner join).
demographic_hospital_df = pd.merge(
    demographic_df, hospital_df, how='left', on='pasient_id'
)
demographic_hospital_phisiological_df = pd.merge(
    demographic_hospital_df, physiological_df, on='pasient_id'
)
pasient_info_df = pd.merge(
    demographic_hospital_phisiological_df, severity_df, on='pasient_id'
)
pasient_info_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,2,60.33899,female,12.0,$11-$25k,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,3,52.74698,female,12.0,under $11k,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,4,42.38498,female,11.0,under $11k,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
3,5,79.88495,female,,,white,0,-99,59.0,13.500000,112.0,20.0,37.89844,173.31250,,,0.799927,143.0,7.509766,,,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,26.0,,2.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,
4,6,93.01599,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7737,9103,70.38196,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,9104,47.01999,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,
7739,9105,81.53894,female,8.0,$11-$25k,white,0,12,75.0,8.599609,69.0,24.0,36.19531,230.40625,4.500000,0.599976,1.199951,137.0,7.289062,187.0,15.0,,A1s,ARF/MOSF,1,ARF/MOSF w/Sepsis,1,0.0,0.0,,18.097656,7.0,0.832886,0.776978,1,0,no,,,
7740,59,72.55896,female,,,white,1,4,123.0,13.298828,98.0,32.0,37.19531,144.87500,,,0.500000,136.0,7.339844,,,,BrY,COPD/CHF/Cirrhosis,1,COPD,3,0.0,,,26.0,25.0,0.60791,0.419983,0,0,metastatic,,,dnr ved innleggelse


## Dropper duplikat pasient_ider

In [108]:
# Drop repeated patient ids, keeping the first occurrence of each id.
# This replaces the hard-coded row labels [7740, 7741]: those only work for one
# exact row order and make the cell fail with a KeyError when it is re-run.
# drop_duplicates is idempotent and keeps the original row labels of the
# surviving rows.
# NOTE(review): assumes the rows previously removed by label are later repeats
# of ids that already appear earlier in the frame — confirm against the raw data.
pasient_info_df = pasient_info_df.drop_duplicates(subset='pasient_id', keep='first')
pasient_info_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,2,60.33899,female,12.0,$11-$25k,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,3,52.74698,female,12.0,under $11k,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,4,42.38498,female,11.0,under $11k,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
3,5,79.88495,female,,,white,0,-99,59.0,13.500000,112.0,20.0,37.89844,173.31250,,,0.799927,143.0,7.509766,,,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,26.0,,2.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,
4,6,93.01599,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66.07300,male,8.0,,white,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,9102,55.15399,female,11.0,,white,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,9103,70.38196,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,9104,47.01999,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Oversikt over manglende data

In [109]:
# Number of NaN entries per column.
missing_columns = pasient_info_df.isna().sum()

# Turn the counts into a single-row frame so they are shown horizontally.
missing_columns_df = missing_columns.to_frame().T

missing_columns_df

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,0,0,0,1381,2518,37,0,0,0,175,0,0,0,1944,2849,2196,57,0,1912,3823,3692,4113,0,0,0,0,0,0,4798,2441,0,0,0,0,0,0,0,1421,1407,6670


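Følgende kolonner har manglende verdier (NaN): utdanning, inntekt, etnisitet, hvite_blodlegemer, lungefunksjon, serumalbumin, bilirubin, kreatinin, blod_ph, glukose, blodurea_nitrogen, urinmengde, adl_pasient, adl_stedfortreder, lege_overlevelsesestimat_2mnd, lege_overlevelsesestimat_6mnd og dnr_status.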

## Fjerner pasienter med negativ oppholdslengde

In [110]:
# Rows with a negative length of stay (such as -99), kept for inspection.
negative_oppholdslengde = pasient_info_df[pasient_info_df['oppholdslengde'] < 0]

# Remove patient ids 5, 23, 102, 256, 508 and 8000 to clean the data.
# NOTE(review): presumably these are the ids found in negative_oppholdslengde — confirm.
pasient_ids_to_remove = [5, 23, 102, 256, 508, 8000]

# Look the row labels up in pasient_info_df itself. The previous version took
# the labels from hospital_df, a different frame whose index is not guaranteed
# to line up with the merged frame, so the wrong rows could be dropped.
index_to_drop = pasient_info_df.index[
    pasient_info_df['pasient_id'].isin(pasient_ids_to_remove)
]
pasient_info_df = pasient_info_df.drop(index=index_to_drop)

# NOTE: this binds a second name to the same object (not a copy), so in-place
# changes made to pasient_info_df before it is reassigned are visible here too.
pasient_info_df_no_dummies = pasient_info_df

## Runder av alder til heltall, og setter negativ alder til NaN

In [111]:

# Blank out negative ages (replaced by NaN), round the rest to whole years and
# store them as nullable integers so the missing values survive the conversion.
pasient_info_df['alder'] = (
    pasient_info_df['alder']
    .mask(pasient_info_df['alder'] < 0)
    .round()
    .astype('Int64')
)
pasient_info_df


Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,2,60,female,12.0,$11-$25k,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,3,53,female,12.0,under $11k,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,4,42,female,11.0,under $11k,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
4,6,93,male,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
5,7,62,male,14.0,$25-$50k,white,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,0.0,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,male,8.0,,white,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,9102,55,female,11.0,,white,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,9103,70,male,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,9104,47,male,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Dummyvariabler for kjønn 

In [112]:
# One-hot encode 'kjønn'; drop_first removes the first category's column so a
# single indicator column remains for a two-valued field.
pasient_info_df = pd.get_dummies(
    data=pasient_info_df,
    columns=['kjønn'],
    drop_first=True,
)
pasient_info_df

Unnamed: 0,pasient_id,alder,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,kjønn_male
0,2,60,12.0,$11-$25k,white,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,False
1,3,53,12.0,under $11k,white,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,False
2,4,42,11.0,under $11k,white,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,False
4,6,93,14.0,,white,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,True
5,7,62,14.0,$25-$50k,white,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,0.0,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,8.0,,white,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,True
7736,9102,55,11.0,,white,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,False
7737,9103,70,,,white,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,True
7738,9104,47,13.0,,white,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,True


## Dummyvariabler for etnisitet

In [113]:
# One-hot encode 'etnisitet'; dummy_na adds an extra indicator column for
# missing values.
pasient_info_df = pd.get_dummies(
    data=pasient_info_df,
    columns=['etnisitet'],
    dummy_na=True,
)


## Mapper Inntekt til dummyvariabler

In [114]:

# One-hot encode the income bracket; dummy_na adds an extra indicator column
# for patients whose income is missing.
pasient_info_df = pd.get_dummies(
    data=pasient_info_df,
    columns=['inntekt'],
    dummy_na=True,
)
pasient_info_df



Unnamed: 0,pasient_id,alder,utdanning,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,kjønn_male,etnisitet_asian,etnisitet_black,etnisitet_hispanic,etnisitet_other,etnisitet_white,etnisitet_nan,inntekt_$11-$25k,inntekt_$25-$50k,inntekt_>$50k,inntekt_under $11k,inntekt_nan
0,2,60,12.0,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,False,False,False,False,False,True,False,True,False,False,False,False
1,3,53,12.0,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,False,False,False,False,False,True,False,False,False,False,True,False
2,4,42,11.0,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,False,False,False,False,False,True,False,False,False,False,True,False
4,6,93,14.0,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,True,False,False,False,False,True,False,False,False,False,False,True
5,7,62,14.0,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,0.0,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,,True,False,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,8.0,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,True,False,False,False,False,True,False,False,False,False,False,True
7736,9102,55,11.0,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,False,False,False,False,False,True,False,False,False,False,False,True
7737,9103,70,,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,True,False,False,False,False,True,False,False,False,False,False,True
7738,9104,47,13.0,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,True,False,False,False,False,True,False,False,False,False,False,True


## Slår sammen adl

In [115]:
# Combine the two ADL columns into one: prefer the patient's own score and
# fall back to the surrogate's score where the patient's score is missing.
# The result is assigned explicitly instead of calling
# `df['adl'].fillna(..., inplace=True)`: that chained in-place form is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
pasient_info_df['adl'] = pasient_info_df['adl_pasient'].fillna(
    pasient_info_df['adl_stedfortreder']
)

# The source columns are no longer needed once they have been merged.
pasient_info_df = pasient_info_df.drop(columns=['adl_pasient', 'adl_stedfortreder'])

# Rows where both sources were missing get the median of the combined score.
pasient_info_df['adl'] = pasient_info_df['adl'].fillna(pasient_info_df['adl'].median())
pasient_info_df


Unnamed: 0,pasient_id,alder,utdanning,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,sykdomskategori_id,sykdomskategori,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,kjønn_male,etnisitet_asian,etnisitet_black,etnisitet_hispanic,etnisitet_other,etnisitet_white,etnisitet_nan,inntekt_$11-$25k,inntekt_$25-$50k,inntekt_>$50k,inntekt_under $11k,inntekt_nan,adl
0,2,60,12.0,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,44.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,False,False,False,False,False,True,False,True,False,False,False,False,1.0
1,3,53,12.0,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,BrY,COPD/CHF/Cirrhosis,1,Cirrhosis,2,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,False,False,False,False,False,True,False,False,False,False,True,False,1.0
2,4,42,11.0,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,ChE,Cancer,1,Lung Cancer,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,False,False,False,False,False,True,False,False,False,False,True,False,0.0
4,6,93,14.0,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,DWw,Coma,1,Coma,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,True,False,False,False,False,True,False,False,False,False,False,True,1.0
5,7,62,14.0,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,BrY,COPD/CHF/Cirrhosis,1,CHF,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,,True,False,False,False,False,True,False,False,True,False,False,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,8.0,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,True,False,False,False,False,True,False,False,False,False,False,True,0.0
7736,9102,55,11.0,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,DWw,Coma,0,Coma,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,False,False,False,False,False,True,False,False,False,False,False,True,0.0
7737,9103,70,,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,A1s,ARF/MOSF,0,ARF/MOSF w/Sepsis,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,True,False,False,False,False,True,False,False,False,False,False,True,0.0
7738,9104,47,13.0,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,A1s,ARF/MOSF,1,MOSF w/Malig,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,True,False,False,False,False,True,False,False,False,False,False,True,0.0


## Oppretter dummyvariabler for sykdom_underkategori

In [116]:
# One-hot encode the disease sub-category (one indicator column per value).
pasient_info_df = pd.get_dummies(
    data=pasient_info_df,
    columns=['sykdom_underkategori'],
)
# The cell displays the untouched severity frame, not the encoded frame.
severity_df

Unnamed: 0,sykdomskategori_id,sykdomskategori,pasient_id,dødsfall,sykdom_underkategori,antall_komorbiditeter,koma_score,adl_pasient,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status
0,BrY,COPD/CHF/Cirrhosis,2,1,Cirrhosis,2,44.0,,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,
1,BrY,COPD/CHF/Cirrhosis,3,1,Cirrhosis,2,0.0,1.0,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,
2,ChE,Cancer,4,1,Lung Cancer,2,0.0,0.0,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,
3,A1s,ARF/MOSF,5,0,ARF/MOSF w/Sepsis,1,26.0,,2.0,23.5,30.0,0.634888,0.532959,0,0,no,0.9,0.9,
4,DWw,Coma,6,1,Coma,1,55.0,,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,A1s,ARF/MOSF,9101,0,ARF/MOSF w/Sepsis,1,0.0,,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,
7736,DWw,Coma,9102,0,Coma,1,41.0,,0.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,
7737,A1s,ARF/MOSF,9103,0,ARF/MOSF w/Sepsis,1,0.0,,,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,
7738,A1s,ARF/MOSF,9104,1,MOSF w/Malig,1,0.0,,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,


## Slår sammen sykdomskategorier til underkategorien

In [117]:

# Map each generated 'sykdom_underkategori_<value>' column to its shorter
# 'sykdom_<name>' form. Pairs are (sub-category value, new suffix); the
# 'ARF/MOSF w/Sepsis' suffix keeps its space, exactly as before.
new_names_severity = {
    f'sykdom_underkategori_{category}': f'sykdom_{suffix}'
    for category, suffix in [
        ('ARF/MOSF w/Sepsis', 'ARF/MOSF w/Sepsis'),
        ('CHF', 'CHF'),
        ('COPD', 'COPD'),
        ('Cirrhosis', 'Cirrhosis'),
        ('Colon Cancer', 'Colon_Cancer'),
        ('Coma', 'Coma'),
        ('Lung Cancer', 'Lung_Cancer'),
        ('MOSF w/Malig', 'MOSF_w/Malig'),
    ]
}

# Apply the new names and remove the two main-category columns.
pasient_info_df = (
    pasient_info_df
    .rename(columns=new_names_severity)
    .drop(columns=['sykdomskategori', 'sykdomskategori_id'])
)
pasient_info_df





Unnamed: 0,pasient_id,alder,utdanning,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,dødsfall,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,kjønn_male,etnisitet_asian,etnisitet_black,etnisitet_hispanic,etnisitet_other,etnisitet_white,etnisitet_nan,inntekt_$11-$25k,inntekt_$25-$50k,inntekt_>$50k,inntekt_under $11k,inntekt_nan,adl,sykdom_ARF/MOSF w/Sepsis,sykdom_CHF,sykdom_COPD,sykdom_Cirrhosis,sykdom_Colon_Cancer,sykdom_Coma,sykdom_Lung_Cancer,sykdom_MOSF_w/Malig
0,2,60,12.0,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,1,2,44.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0,,False,False,False,False,False,True,False,True,False,False,False,False,1.0,False,False,False,True,False,False,False,False
1,3,53,12.0,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,1,2,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,False,False,False,False,False,True,False,False,False,False,True,False,1.0,False,False,False,True,False,False,False,False
2,4,42,11.0,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,1,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,False,False,False,False,False,True,False,False,False,False,True,False,0.0,False,False,False,False,False,False,True,False
4,6,93,14.0,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,1,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,True,False,False,False,False,True,False,False,False,False,False,True,1.0,False,False,False,False,False,True,False,False
5,7,62,14.0,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,1,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7,,True,False,False,False,False,True,False,False,True,False,False,False,0.0,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,8.0,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,0,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,no,0.8,0.512,,True,False,False,False,False,True,False,False,False,False,False,True,0.0,True,False,False,False,False,False,False,False
7736,9102,55,11.0,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,0,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,no,0.5,0.5,,False,False,False,False,False,True,False,False,False,False,False,True,0.0,False,False,False,False,False,True,False,False
7737,9103,70,,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,0,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,no,0.9,0.8,,True,False,False,False,False,True,False,False,False,False,False,True,0.0,True,False,False,False,False,False,False,False
7738,9104,47,13.0,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,1,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,yes,0.09,0.09,,True,False,False,False,False,True,False,False,False,False,False,True,0.0,False,False,False,False,False,False,False,True


## Oppretter Dummies for kreft

In [118]:
# One-hot encode 'kreft' into kreft_metastatic / kreft_no / kreft_yes.
pasient_info_df = pd.get_dummies(pasient_info_df, columns=['kreft'])

# A patient counts as having cancer when either kreft_yes or kreft_metastatic is set.
has_any_cancer = pasient_info_df['kreft_yes'] | pasient_info_df['kreft_metastatic']

# Lung and colon cancer already have their own sykdom_* indicator columns.
has_lung_or_colon_cancer = (
    pasient_info_df['sykdom_Lung_Cancer'] | pasient_info_df['sykdom_Colon_Cancer']
)

# Drop kreft_no and turn kreft_yes into 'sykdom_Other_Cancer': true only for
# patients with cancer that is neither lung cancer nor colon cancer.
# kreft_metastatic is kept as its own column.
pasient_info_df = (
    pasient_info_df
    .drop(columns=['kreft_no'])
    .rename(columns={'kreft_yes': 'sykdom_Other_Cancer'})
    .assign(sykdom_Other_Cancer=has_any_cancer & ~has_lung_or_colon_cancer)
)
pasient_info_df



Unnamed: 0,pasient_id,alder,utdanning,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,dødsfall,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,kjønn_male,etnisitet_asian,etnisitet_black,etnisitet_hispanic,etnisitet_other,etnisitet_white,etnisitet_nan,inntekt_$11-$25k,inntekt_$25-$50k,inntekt_>$50k,inntekt_under $11k,inntekt_nan,adl,sykdom_ARF/MOSF w/Sepsis,sykdom_CHF,sykdom_COPD,sykdom_Cirrhosis,sykdom_Colon_Cancer,sykdom_Coma,sykdom_Lung_Cancer,sykdom_MOSF_w/Malig,kreft_metastatic,sykdom_Other_Cancer
0,2,60,12.0,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,1,2,44.0,52.695312,74.0,0.001,0.0,0,0,0.0,0.0,,False,False,False,False,False,True,False,True,False,False,False,False,1.0,False,False,False,True,False,False,False,False,False,False
1,3,53,12.0,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,1,2,0.0,20.5,45.0,0.790894,0.664917,0,0,0.75,0.5,,False,False,False,False,False,True,False,False,False,False,True,False,1.0,False,False,False,True,False,False,False,False,False,False
2,4,42,11.0,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,1,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,0.9,0.5,,False,False,False,False,False,True,False,False,False,False,True,False,0.0,False,False,False,False,False,False,True,False,True,False
4,6,93,14.0,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,1,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,0.0,0.0,,True,False,False,False,False,True,False,False,False,False,False,True,1.0,False,False,False,False,False,True,False,False,False,False
5,7,62,14.0,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,1,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,,0.7,,True,False,False,False,False,True,False,False,True,False,False,False,0.0,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,8.0,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,0,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,0.8,0.512,,True,False,False,False,False,True,False,False,False,False,False,True,0.0,True,False,False,False,False,False,False,False,False,False
7736,9102,55,11.0,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,0,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,0.5,0.5,,False,False,False,False,False,True,False,False,False,False,False,True,0.0,False,False,False,False,False,True,False,False,False,False
7737,9103,70,,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,0,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,0.9,0.8,,True,False,False,False,False,True,False,False,False,False,False,True,0.0,True,False,False,False,False,False,False,False,False,False
7738,9104,47,13.0,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,1,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,0.09,0.09,,True,False,False,False,False,True,False,False,False,False,False,True,0.0,False,False,False,False,False,False,False,True,False,True


## Oppretter Dummies for dnr_status

In [119]:
# One-hot encode dnr_status and shorten the names of the generated indicator
# columns. Built as a single expression so nothing is renamed in place.
dnr_column_names = {
    'dnr_status_dnr før innleggelse': 'dnr_før_innleggelse',
    'dnr_status_dnr ved innleggelse': 'dnr_ved_innleggelse',
}
pasient_info_df = (
    pd.get_dummies(pasient_info_df, columns=['dnr_status'])
    .rename(columns=dnr_column_names)
)
pasient_info_df

Unnamed: 0,pasient_id,alder,utdanning,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,dødsfall,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,kjønn_male,etnisitet_asian,etnisitet_black,etnisitet_hispanic,etnisitet_other,etnisitet_white,etnisitet_nan,inntekt_$11-$25k,inntekt_$25-$50k,inntekt_>$50k,inntekt_under $11k,inntekt_nan,adl,sykdom_ARF/MOSF w/Sepsis,sykdom_CHF,sykdom_COPD,sykdom_Cirrhosis,sykdom_Colon_Cancer,sykdom_Coma,sykdom_Lung_Cancer,sykdom_MOSF_w/Malig,kreft_metastatic,sykdom_Other_Cancer,dnr_før_innleggelse,dnr_ved_innleggelse
0,2,60,12.0,1,4,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,1,2,44.0,52.695312,74.0,0.001,0.0,0,0,0.0,0.0,False,False,False,False,False,True,False,True,False,False,False,False,1.0,False,False,False,True,False,False,False,False,False,False,False,False
1,3,53,12.0,0,17,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,1,2,0.0,20.5,45.0,0.790894,0.664917,0,0,0.75,0.5,False,False,False,False,False,True,False,False,False,False,True,False,1.0,False,False,False,True,False,False,False,False,False,False,False,False
2,4,42,11.0,0,3,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,1,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,0.9,0.5,False,False,False,False,False,True,False,False,False,False,True,False,0.0,False,False,False,False,False,False,True,False,True,False,False,False
4,6,93,14.0,1,4,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,1,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,0.0,0.0,True,False,False,False,False,True,False,False,False,False,False,True,1.0,False,False,False,False,False,True,False,False,False,False,False,False
5,7,62,14.0,0,9,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,1,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,,0.7,True,False,False,False,False,True,False,False,True,False,False,False,0.0,False,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,9101,66,8.0,0,23,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,0,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,0.8,0.512,True,False,False,False,False,True,False,False,False,False,False,True,0.0,True,False,False,False,False,False,False,False,False,False,False,False
7736,9102,55,11.0,0,29,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,0,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,0.5,0.5,False,False,False,False,False,True,False,False,False,False,False,True,0.0,False,False,False,False,False,True,False,False,False,False,False,False
7737,9103,70,,0,8,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,0,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,0.9,0.8,True,False,False,False,False,True,False,False,False,False,False,True,0.0,True,False,False,False,False,False,False,False,False,False,False,False
7738,9104,47,13.0,1,7,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,1,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,0.09,0.09,True,False,False,False,False,True,False,False,False,False,False,True,0.0,False,False,False,False,False,False,False,True,False,True,False,False


## Rydder opp ved å fjerne pasient_id og sette oppholdslengde som første kolonne

In [120]:
# Drop the patient identifier and move the target column 'oppholdslengde' to
# the front. The frame is rebuilt instead of being mutated with
# pop/insert/drop(inplace=True), so re-running this cell yields the same
# result rather than raising KeyError once 'pasient_id' is already gone.
without_id = pasient_info_df.drop(columns=['pasient_id'], errors='ignore')
ordered_columns = ['oppholdslengde', *without_id.columns.drop('oppholdslengde')]
pasient_info_df = without_id[ordered_columns]
pasient_info_df

Unnamed: 0,oppholdslengde,alder,utdanning,sykehusdød,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,dødsfall,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,kjønn_male,etnisitet_asian,etnisitet_black,etnisitet_hispanic,etnisitet_other,etnisitet_white,etnisitet_nan,inntekt_$11-$25k,inntekt_$25-$50k,inntekt_>$50k,inntekt_under $11k,inntekt_nan,adl,sykdom_ARF/MOSF w/Sepsis,sykdom_CHF,sykdom_COPD,sykdom_Cirrhosis,sykdom_Colon_Cancer,sykdom_Coma,sykdom_Lung_Cancer,sykdom_MOSF_w/Malig,kreft_metastatic,sykdom_Other_Cancer,dnr_før_innleggelse,dnr_ved_innleggelse
0,4,60,12.0,1,43.0,17.097656,112.0,34.0,34.59375,98.00000,,,5.500000,132.0,7.250000,,,,1,2,44.0,52.695312,74.0,0.001,0.0,0,0,0.0,0.0,False,False,False,False,False,True,False,True,False,False,False,False,1.0,False,False,False,True,False,False,False,False,False,False,False,False
1,17,53,12.0,0,70.0,8.500000,88.0,28.0,37.39844,231.65625,,2.199707,2.000000,134.0,7.459961,,,,1,2,0.0,20.5,45.0,0.790894,0.664917,0,0,0.75,0.5,False,False,False,False,False,True,False,False,False,False,True,False,1.0,False,False,False,True,False,False,False,False,False,False,False,False
2,3,42,11.0,0,75.0,9.099609,88.0,32.0,35.00000,,,,0.799927,139.0,,,,,1,2,0.0,20.097656,19.0,0.698975,0.411987,0,0,0.9,0.5,False,False,False,False,False,True,False,False,False,False,True,False,0.0,False,False,False,False,False,False,True,False,True,False,False,False
4,4,93,14.0,1,110.0,10.398438,101.0,44.0,38.39844,266.62500,,,0.699951,140.0,7.659180,,,,1,1,55.0,19.398438,27.0,0.284973,0.214996,0,0,0.0,0.0,True,False,False,False,False,True,False,False,False,False,False,True,1.0,False,False,False,False,False,True,False,False,False,False,False,False
5,9,62,14.0,0,78.0,11.699219,120.0,28.0,37.39844,309.50000,4.799805,0.399963,1.599854,132.0,7.479492,,,,1,1,0.0,17.296875,46.0,0.892944,0.820923,0,0,,0.7,True,False,False,False,False,True,False,False,True,False,False,False,0.0,False,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,23,66,8.0,0,109.0,7.399414,104.0,22.0,35.69531,280.00000,3.699707,0.399963,1.099854,131.0,7.459961,188.0,21.0,,0,1,0.0,16.296875,22.0,0.852905,0.80188,0,0,0.8,0.512,True,False,False,False,False,True,False,False,False,False,False,True,0.0,True,False,False,False,False,False,False,False,False,False,False,False
7736,29,55,11.0,0,43.0,,0.0,8.0,38.59375,218.50000,,,5.899414,135.0,7.289062,190.0,49.0,0.0,0,1,41.0,25.796875,31.0,0.553955,0.485962,0,0,0.5,0.5,False,False,False,False,False,True,False,False,False,False,False,True,0.0,False,False,False,False,False,True,False,False,False,False,False,False
7737,8,70,,0,111.0,8.398438,83.0,24.0,36.69531,180.00000,,0.399963,2.699707,139.0,7.379883,189.0,60.0,3900.0,0,1,0.0,22.699219,39.0,0.741943,0.660889,0,0,0.9,0.8,True,False,False,False,False,True,False,False,False,False,False,True,0.0,True,False,False,False,False,False,False,False,False,False,False,False
7738,7,47,13.0,1,99.0,7.599609,110.0,24.0,36.39844,428.56250,1.199951,0.399963,3.500000,135.0,7.469727,246.0,55.0,,1,1,0.0,40.195312,51.0,0.177979,0.091995,0,0,0.09,0.09,True,False,False,False,False,True,False,False,False,False,False,True,0.0,False,False,False,False,False,False,False,True,False,True,False,False


## Sjekker etter manglende verdier

In [121]:
# Number of NaN entries per column
missing_columns = pasient_info_df.isna().sum()

# Lay the counts out as one wide row (columns = feature names) for display
missing_columns_df = missing_columns.to_frame().T

missing_columns_df

Unnamed: 0,oppholdslengde,alder,utdanning,sykehusdød,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,dødsfall,antall_komorbiditeter,koma_score,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,kjønn_male,etnisitet_asian,etnisitet_black,etnisitet_hispanic,etnisitet_other,etnisitet_white,etnisitet_nan,inntekt_$11-$25k,inntekt_$25-$50k,inntekt_>$50k,inntekt_under $11k,inntekt_nan,adl,sykdom_ARF/MOSF w/Sepsis,sykdom_CHF,sykdom_COPD,sykdom_Cirrhosis,sykdom_Colon_Cancer,sykdom_Coma,sykdom_Lung_Cancer,sykdom_MOSF_w/Malig,kreft_metastatic,sykdom_Other_Cancer,dnr_før_innleggelse,dnr_ved_innleggelse
0,0,5,1380,0,0,175,0,0,0,1943,2847,2194,57,0,1911,3817,3686,4107,0,0,0,0,0,0,0,0,0,1419,1405,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Imputerer manglende data

## Sjekker data

In [122]:
# Summary statistics; with default arguments describe() only covers the
# numeric columns, which is why the table has fewer columns than the frame.
summary_stats = pasient_info_df.describe()
summary_stats

Unnamed: 0,oppholdslengde,alder,utdanning,sykehusdød,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,kroppstemperatur,lungefunksjon,serumalbumin,bilirubin,kreatinin,natrium,blod_ph,glukose,blodurea_nitrogen,urinmengde,adl
count,7734.0,7729.0,6354.0,7734.0,7734.0,7559.0,7734.0,7734.0,7734.0,5791.0,4887.0,5540.0,7677.0,7734.0,5823.0,3917.0,4048.0,3627.0,7734.0
mean,17.81924,62.752879,11.752597,0.263253,84.418218,12.303919,97.262956,23.341479,37.115469,239.623512,2.952665,2.566387,1.790765,137.575899,7.415417,160.031146,32.58204,2185.398372,1.181536
std,21.762188,15.611741,3.459488,0.440427,27.718441,9.110979,31.491401,9.545735,1.256894,109.927694,0.889216,5.319958,1.712437,6.030537,0.080838,87.828819,26.826593,1458.90094,1.984719
min,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,31.69922,12.0,0.399963,0.099991,0.099991,110.0,6.829102,0.0,1.0,0.0,0.0
25%,6.0,53.0,10.0,0.0,63.0,6.899414,72.0,18.0,36.19531,155.09375,2.399902,0.5,0.899902,134.0,7.379883,103.0,14.0,1175.0,0.0
50%,11.0,65.0,12.0,0.0,77.0,10.5,100.0,24.0,36.69531,224.0,2.899902,0.899902,1.199951,137.0,7.419922,135.0,23.0,1963.0,0.0
75%,20.0,74.0,14.0,1.0,107.0,15.298828,120.0,28.0,38.19531,304.75,3.599609,1.899902,1.899902,141.0,7.469727,190.0,43.0,2980.0,1.0
max,343.0,102.0,31.0,1.0,195.0,128.78125,232.0,90.0,41.69531,869.375,29.0,63.0,21.5,181.0,7.769531,1092.0,192.0,9000.0,7.0


## Visualisering av data for å imputere manglende data

In [127]:
# Numeric columns that still contain missing values. The list is derived from
# the data instead of being typed by hand, so it cannot drift out of sync with
# the missing-value counts (the hand-written list left out 'alder', which has
# missing values).
numeric_missing_counts = pasient_info_df.select_dtypes(include='number').isna().sum()
manglende_numeriske_data = numeric_missing_counts[numeric_missing_counts > 0].index.tolist()
# NOTE(review): lege_overlevelsesestimat_2mnd/_6mnd also have missing values
# but appear to be non-numeric dtype (describe() omits them), so they are not
# plotted here - TODO confirm and convert them to a numeric dtype upstream.

# Original categorical columns with missing values. They are already one-hot
# encoded at this point; their missing values are carried by the
# 'etnisitet_nan' and 'inntekt_nan' indicator columns.
manglende_kategoriske_data = ['etnisitet', 'inntekt']

# One histogram per column to judge which imputation strategy fits its distribution.
for value in manglende_numeriske_data:
    fig = px.histogram(pasient_info_df, x=value, title=f'Fordeling av {value}')
    fig.show()
# TODO: finish this - choose an imputation strategy per column based on the plots
    

In [124]:
# TODO impute missing data, perhaps impute before making dummies

## Lagrer renset datasett som CSV

In [83]:
# Write the current frame to a CSV file in the working directory;
# index=False keeps the row index out of the file.
# NOTE(review): the imputation cell above is still a TODO, so the file is
# written with the missing values left in. This cell's execution count is also
# older than the cells above it - re-run the notebook top to bottom before
# relying on the saved file.
cleaned_csv_path = 'cleaned_pasient_info_df.csv'
pasient_info_df.to_csv(cleaned_csv_path, index=False)