In [1]:
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_squared_error#, root_mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Plotting
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


  from pandas.core import (


In [2]:
# Hente inn datasettet
combined_df = pd.read_csv('combined_v2.csv')
combined_df.head()

Unnamed: 0.1,Unnamed: 0,pasient_id,alder,kjønn,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,...,adl_stedfortreder,diabetes,demens,kreft,dnr_status,dnr_dag,alder_gruppe,sosiooekonomisk_status,overlevelses_proxy,nyrefunksjons_proxy
0,0,2,60.33899,female,white,1,4.0,43.0,17.097656,112.0,...,1.0,0,0,no,,,4,6.5,21.116052,5.5
1,1,3,52.74698,female,white,0,17.0,70.0,8.5,88.0,...,0.0,0,0,no,,,3,6.0,11.367635,2.0
2,2,4,42.38498,female,white,0,3.0,75.0,9.099609,88.0,...,0.0,0,0,metastatic,,,2,5.5,6.93477,0.799927
3,4,6,93.01599,male,white,1,4.0,110.0,10.398438,101.0,...,1.0,0,0,no,,,5,,7.816401,0.699951
4,5,7,62.37097,male,white,0,9.0,78.0,11.699219,120.0,...,1.0,0,0,no,,,4,8.0,13.142148,1.599854


In [3]:
# Sjekker om er riktig etter fiks
combined_df.drop('Unnamed: 0', axis=1, inplace=True)

combined_df.head()

Unnamed: 0,pasient_id,alder,kjønn,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,hjertefrekvens,respirasjonsfrekvens,...,adl_stedfortreder,diabetes,demens,kreft,dnr_status,dnr_dag,alder_gruppe,sosiooekonomisk_status,overlevelses_proxy,nyrefunksjons_proxy
0,2,60.33899,female,white,1,4.0,43.0,17.097656,112.0,34.0,...,1.0,0,0,no,,,4,6.5,21.116052,5.5
1,3,52.74698,female,white,0,17.0,70.0,8.5,88.0,28.0,...,0.0,0,0,no,,,3,6.0,11.367635,2.0
2,4,42.38498,female,white,0,3.0,75.0,9.099609,88.0,32.0,...,0.0,0,0,metastatic,,,2,5.5,6.93477,0.799927
3,6,93.01599,male,white,1,4.0,110.0,10.398438,101.0,44.0,...,1.0,0,0,no,,,5,,7.816401,0.699951
4,7,62.37097,male,white,0,9.0,78.0,11.699219,120.0,28.0,...,1.0,0,0,no,,,4,8.0,13.142148,1.599854


# Data preprocess

In [4]:
# Preprocess the data
X = combined_df.drop(['pasient_id', 'oppholdslengde'], axis=1)
y = combined_df['oppholdslengde']

In [5]:
X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=24)
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=24)

# Grunnlinje

In [6]:
# Lager en grunnlinje modell som bruker gjennomsnittet av målte verdiene som prediksjon
def grunlinje_model(X_train, y_train, X_val):
    # Regner ut gjennomsnittet av målte verdiene
    grunnlinje_pred = np.mean(y_train)
    
    # Bruker gjennomsnittet som prediksjon for alle verdier i valideringssettet
    y_val_pred = np.full_like(y_val, grunnlinje_pred)
        
    return y_val_pred

    # Regner ut RMSE for grunnlinje modellen
grunlinje_rmse = mean_squared_error(y_val, grunlinje_model(X_train, y_train, X_val), squared=False)
print(f'Grunnlinje modell - Validerings RMSE: {grunlinje_rmse:.2f}')    

Grunnlinje modell - Validerings RMSE: 25.05


# Possible to make into bins of 5 for "better result"

In [7]:
"""max_y = y.max()
num_bins = int(np.ceil(max_y / 5))  # calculate the number of bins needed

bins = np.arange(0, max_y + 5, 5)  # create bin boundaries
labels = np.arange(num_bins)  # create labels for the bins

y_binned = pd.cut(y, bins=bins, labels=labels, include_lowest=True)
y = y_binned.astype(int)"""

'max_y = y.max()\nnum_bins = int(np.ceil(max_y / 5))  # calculate the number of bins needed\n\nbins = np.arange(0, max_y + 5, 5)  # create bin boundaries\nlabels = np.arange(num_bins)  # create labels for the bins\n\ny_binned = pd.cut(y, bins=bins, labels=labels, include_lowest=True)\ny = y_binned.astype(int)'

# Lage forskjellige strategier

In [8]:
# Istede for SimpleImputer. For lettere oversikt
def drop_nan(X, y):
    X_strategy = X.dropna()
    y_strategy = y.loc[X_strategy.index]
    return X_strategy, y_strategy

def drop_cols(X, y, threshold=2000): 
    nan_counts = X.isnull().sum()
    X_strategy = X.drop(columns=nan_counts[nan_counts > threshold].index)
    X_strategy = X_strategy.dropna()
    y_strategy = y.loc[X_strategy.index]
    return X_strategy, y_strategy

def fill_median(X, y):
    X_fill = X.copy()
    
    # Fill numeric med median
    median_cols = X_fill.select_dtypes(include=['int64', 'float64']).columns
    X_fill[median_cols] = X_fill[median_cols].fillna(X_fill[median_cols].median())

    # Fill string kolonner med den det er flest av
    string_cols = X_fill.select_dtypes(include=['object']).columns
    for col in string_cols:
        most_frequent = X_fill[col].mode()[0]  # Få verdien som er flest av
        X_fill[col].fillna(most_frequent, inplace=True)

    return X_fill, y

def fill_zero(X, y):
    X_strategy = X.fillna(0)
    y_strategy = y
    return X_strategy, y_strategy

def knn_imputation(X, y):
    # One-hot encode categorical variabler
    categorical_cols = X.select_dtypes(include=['object']).columns
    encoder = OneHotEncoder()
    X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
    
    # Impute missing values using KNNImputer
    imputer = KNNImputer(n_neighbors=1)
    X_imputed = imputer.fit_transform(X_encoded)
    X_imputed = pd.DataFrame(X_imputed, columns=X_encoded.columns)
    
    return X_imputed, y

# Define models

In [9]:
# Definerer model pipelines
model_pipelines = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=24),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=24),
    'Decision Tree': DecisionTreeRegressor(random_state=24),
    'Linear Regression': LinearRegression(),
    'Polynomial Regression': make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    'Ridge Regression': Ridge(alpha=1.0)
}
# Forksjellige parametere å teste modellen på
param_grids = {
    'Random Forest': {
        'model__n_estimators': [50, 100, 150],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5]
    },
    'Gradient Boosting': {
        'model__n_estimators': [50, 100],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 7]
    },
    'Decision Tree': {
        'model__max_depth': [None, 10, 20, 30],
        'model__min_samples_split': [2, 5, 10]
    },
    'Ridge Regression': {
        'model__alpha': [0.1, 1.0, 10.0]
    }
}
def create_model_pipeline(model_name, preprocessor):
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_pipelines[model_name])
    ])
    return model_pipeline

# Define Preprocess data og strategier

In [10]:
# Definerer strategier og modeller
strategies = ['drop_nan','drop_cols', 'fill_median', 'knn_imputation', 'fill_zero']
models = list(model_pipelines.keys())

# Lager en dictionary for å lagre resultatene
results = {}

# Funksjon for å preprocessere data basert på strategien
def preprocess_data(strategy, X, y):
    if strategy == 'drop_nan':
        return drop_nan(X, y)
    elif strategy == 'fill_median':
        return fill_median(X, y)
    elif strategy == 'fill_zero':
        return fill_zero(X, y)
    elif strategy == 'knn_imputation':
        return knn_imputation(X, y)
    elif strategy == 'drop_cols':
        return drop_cols(X, y)
    
# Funksjon for å dele opp data i trenings-, validerings- og testsett
def split_data(X, y, test_size=0.2, val_size=0.25, random_state=24):
    X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=val_size, random_state=random_state)
    return X_train, X_val, X_test, y_train, y_val, y_test

# Definere en corr matrix for ekstra visualesering

In [11]:
# Funksjon for å lage en varmekart for korrelasjonsmatrisen
def plot_corr_matrix(X_strategy, strategy):
    numeric_cols = X_strategy.select_dtypes(include=['int64', 'float64']).columns
    corr_matrix = X_strategy[numeric_cols].corr()

    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        zmin=-1, zmax=1
    ))
    
    # Tilpasse layout
    fig.update_layout(
        title=f'{strategy} - Korrelasjonsmatrise',
        xaxis_nticks=len(corr_matrix.columns),
        yaxis_nticks=len(corr_matrix.columns),
        autosize=False,
        width=800,
        height=800
    )
    fig.show()


# Definere preprosess til modellene

In [12]:
# Funksjon for å lage preprocessing-pipeline
def convert_to_str(x):
    return x.astype(str)

def create_preprocessor(X_strategy):
    categorical_cols = X_strategy.select_dtypes(exclude=['int64', 'float64']).columns
    numerical_cols = X_strategy.select_dtypes(include=['int64', 'float64']).columns

    categorical_transformer = Pipeline(steps=[
        ('converter', FunctionTransformer(convert_to_str)),  
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )
    
    return preprocessor

# Tren og evaluer modell med logging av evaluering.

In [13]:
# Funksjon for å trene og evaluere modeller
def train_and_evaluate_models(X_train, y_train, X_val, y_val, models, preprocessor):
    best_rmse = float('inf')
    best_model_name = None
    best_model_pipeline = None

    for model_name in models:
        model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model_pipelines[model_name])
        ])
        # Sjekke hvilke parameter som er best med Gridsearch
        param_grid = param_grids.get(model_name, {})
        if param_grid:
            grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1) 
            grid_search.fit(X_train, y_train)
            best_model_pipeline = grid_search.best_estimator_
            rmse_val = -grid_search.best_score_
        else:
            model_pipeline.fit(X_train, y_train)
            y_val_pred = model_pipeline.predict(X_val)
            rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
            best_model_pipeline = model_pipeline
        
        print(f'{model_name}: {rmse_val:.2f}')
        
        if rmse_val < best_rmse:
            best_rmse = rmse_val
            best_model_name = model_name

    return best_model_name, best_model_pipeline, best_rmse


# Funksjon for å logge og vise resultater
def log_results(strategy, best_model_name, best_rmse, within_10_diff_percent, rmse_full, within_10_diff_percent_full, results, best_model_pipeline):
    results[strategy] = {
        'best_model': best_model_name,
        'validation_rmse': best_rmse,
        'model_pipeline': best_model_pipeline, # Lagre acutal pipeline
        'within_10_diff_percent': within_10_diff_percent,
        'full_dataset_rmse': rmse_full,
        'within_10_diff_percent_full': within_10_diff_percent_full
    }

    print(f'{strategy} - Best Model: {best_model_name}')
    print(f'{strategy} - Validation RMSE: {best_rmse:.2f}')
    print(f'{strategy} - Within 10 diff: {within_10_diff_percent:.2f}%')
    print(f'{strategy} - Full Dataset RMSE: {rmse_full:.2f}')
    print(f'{strategy} - Within 10 diff (Full Dataset): {within_10_diff_percent_full:.2f}%')


# Evaluere og teste alle modellene

In [14]:
# Hovedsløyfe over strategier
for strategy in strategies:
    # Preprosesserer data
    X_strategy, y_strategy = preprocess_data(strategy, X, y)
    
    # Splitter data inn i trenings-, validerings- og testsett
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(X_strategy, y_strategy)
    
    # Plotter korrelasjonsmatrise
    plot_corr_matrix(X_strategy, strategy)

    # Lager preprocessing pipeline
    preprocessor = create_preprocessor(X_strategy)

    # Trener modeller og evaluerer dem
    best_model_name, best_model_pipeline, best_rmse = train_and_evaluate_models(X_train, y_train, X_val, y_val, models, preprocessor)

    # Evaluerer prediksjoner innenfor 10 enheter fra de faktiske verdiene
    y_val_pred = best_model_pipeline.predict(X_val)
    diff = np.abs(y_val - y_val_pred)
    within_10_diff_percent = np.sum(diff <= 10) / len(y_val) * 100

    # Preprosessering og prediksjon på hele datasettet
    X_strategy, y_strategy = preprocess_data('fill_median', X, y)
    if strategy == 'knn_imputation':
        X_strategy, y_strategy = preprocess_data('knn_imputation', X, y)

    y_full_pred = best_model_pipeline.predict(X_strategy)
    rmse_full = mean_squared_error(y, y_full_pred, squared=False)
    diff_full = np.abs(y - y_full_pred)
    within_10_diff_percent_full = np.sum(diff_full <= 10) / len(y) * 100
    
    # Logger og viser resultater
    log_results(strategy, best_model_name, best_rmse, within_10_diff_percent, rmse_full, within_10_diff_percent_full, results, best_model_pipeline)

Random Forest: 4.97
Gradient Boosting: 4.65
Decision Tree: 5.73
Linear Regression: 15.86
Polynomial Regression: 11.97
Ridge Regression: 4.32
drop_nan - Best Model: Ridge Regression
drop_nan - Validation RMSE: 4.32
drop_nan - Within 10 diff: 77.78%
drop_nan - Full Dataset RMSE: 24.53
drop_nan - Within 10 diff (Full Dataset): 65.57%



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Random Forest: 23.19
Gradient Boosting: 22.86
Decision Tree: 28.35
Linear Regression: 22.93
Polynomial Regression: 3412297456299.22
Ridge Regression: 22.98
drop_cols - Best Model: Gradient Boosting
drop_cols - Validation RMSE: 22.86
drop_cols - Within 10 diff: 55.41%
drop_cols - Full Dataset RMSE: 20.36
drop_cols - Within 10 diff (Full Dataset): 60.37%



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Random Forest: 18.77
Gradient Boosting: 18.72
Decision Tree: 21.61
Linear Regression: 23.50
Polynomial Regression: 730232722.04
Ridge Regression: 19.08
fill_median - Best Model: Gradient Boosting
fill_median - Validation RMSE: 18.72
fill_median - Within 10 diff: 61.90%
fill_median - Full Dataset RMSE: 20.30
fill_median - Within 10 diff (Full Dataset): 62.03%



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





Random Forest: 18.82
Gradient Boosting: 18.83
Decision Tree: 21.81
Linear Regression: 23.47
Polynomial Regression: 8577245304.55
Ridge Regression: 19.06



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





knn_imputation - Best Model: Random Forest
knn_imputation - Validation RMSE: 18.82
knn_imputation - Within 10 diff: 62.29%
knn_imputation - Full Dataset RMSE: 20.29
knn_imputation - Within 10 diff (Full Dataset): 61.88%


# Predikere ny data med full dataset basert på beste strategi

In [15]:
import pandas as pd

# Lese inn data og se på de første radene.
samp_demographic_df = pd.read_csv('sample_data\demographic.csv')
samp_hospital_df = pd.read_csv('sample_data\hospital.csv')
samp_physiological_df = pd.read_csv('sample_data\physiological.txt', sep='\t')
samp_severity_df = pd.read_json('sample_data\severity.json')

In [16]:
# Siden severity.df er på annen måte satt opp, må fikse det med explode listene.
exp = [col for col in samp_severity_df.columns if samp_severity_df[col].apply(lambda x: isinstance(x, list)).any()]
samp_severity_df = samp_severity_df.explode(exp)

In [17]:
# Merge dataframe
sample_combined_df = pd.merge(samp_demographic_df, samp_hospital_df, on='pasient_id', how='left')
sample_combined_df = pd.merge(sample_combined_df, samp_physiological_df, on='pasient_id', how='left')
sample_combined_df = pd.merge(sample_combined_df, samp_severity_df, on='pasient_id', how='left')

# Fjerne duplicates basert på 'pasient_id' som kan skje i mergen.
sample_combined_df = sample_combined_df.drop_duplicates(subset=['pasient_id'], keep='first')
sample_combined_df.head(5)

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,blodtrykk,hvite_blodlegemer,hjertefrekvens,...,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,dnr_dag
0,1,62.84998,male,11.0,$11-$25k,other,0,97.0,6.0,69.0,...,20.0,0.262939,0.036995,0,0,metastatic,0.5,0.25,no dnr,5.0
1,22,48.70398,male,16.0,,other,0,66.0,12.5,125.0,...,35.0,0.535889,0.213989,0,0,metastatic,0.5,0.125,dnr ved innleggelse,11.0
2,25,29.36099,female,17.0,$25-$50k,white,0,96.0,10.599609,112.0,...,10.0,0.853882,0.674927,0,0,metastatic,0.5,0.1,no dnr,4.0
3,26,53.84,male,,under $11k,white,0,134.0,7.799805,106.0,...,16.0,0.939941,0.897949,0,0,no,0.9,0.6,dnr ved innleggelse,4.0
4,29,30.10799,male,,,asian,0,92.0,8.798828,112.0,...,21.0,0.823975,0.764893,0,0,no,,,no dnr,10.0


# Samme variabel utvinning som i datatilbredningen

In [18]:
# 1. Aldersgrupper
sample_combined_df['alder_gruppe'] = pd.cut(sample_combined_df['alder'], bins=[0, 18, 30, 45, 60, 75, np.inf], 
                            labels=[0, 1, 2, 3, 4, 5])

inntekt_mapping = {
    "under $11k": 0,
    "$11-$25k": 1,
    "$25-$50k": 2,
    ">$50k": 3
}

# Apply the mapping to the 'inntekt' column
sample_combined_df['inntekt'] = sample_combined_df['inntekt'].map(inntekt_mapping)
# 1. Proxy for sosioøkonomisk status (basert på 'inntekt' og 'utdanning')
# Kombinerer 'inntekt' og 'utdanning' til en enkel sosioøkonomisk score
sample_combined_df['sosiooekonomisk_status'] = sample_combined_df[['inntekt', 'utdanning']].apply(lambda x: x.mean() if pd.notnull(x['inntekt']) and pd.notnull(x['utdanning']) else np.nan, axis=1)
# 2. en proxy for overlevelsesestimat basert på alle relevante estimater
sample_combined_df['overlevelses_proxy'] = sample_combined_df[['overlevelsesestimat_2mnd', 'overlevelsesestimat_6mnd', 
                               'lege_overlevelsesestimat_2mnd', 'lege_overlevelsesestimat_6mnd','fysiologisk_score',
                               'apache_fysiologisk_score']].mean(axis=1)

# 3. en proxy for nyrefunksjon basert på sammenhe
sample_combined_df['nyrefunksjons_proxy'] = sample_combined_df[['kreatinin', 'blodurea_nitrogen']].mean(axis=1)

# Bruke beste modell og strategi

In [19]:
# Hent den beste modellen for fill_zero-strategien
print(results['fill_median'])
# Bruk av pippelinen:
best_model_pipeline_fill_median = results['fill_median']['model_pipeline'] 

{'best_model': 'Gradient Boosting', 'validation_rmse': 18.72030383480165, 'model_pipeline': Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['alder', 'sykehusdød', 'blodtrykk', 'hvite_blodlegemer',
       'hjertefrekvens', 'respirasjonsfrekvens', 'kroppstemperatur',
       'lungefunksjon', 'serumalbumin', 'bilirubin', 'natrium', 'blod_ph',
       'glukose', 'urinmengde', 'dødsfall', 'antall_komorbidi...
       'overlevelses_proxy', 'nyrefunksjons_proxy'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('converter',
                                                                   FunctionTransformer(func=<function convert_to_str

In [20]:
sample_combined_df.drop('inntekt', axis=1, inplace=True)
sample_combined_df.drop('utdanning', axis=1, inplace=True)
sample_combined_df.drop('overlevelsesestimat_2mnd', axis=1, inplace=True)
sample_combined_df.drop('lege_overlevelsesestimat_2mnd', axis=1, inplace=True)
sample_combined_df.drop('overlevelsesestimat_6mnd', axis=1, inplace=True)
sample_combined_df.drop('lege_overlevelsesestimat_6mnd', axis=1, inplace=True)
sample_combined_df.drop('fysiologisk_score', axis=1, inplace=True)
sample_combined_df.drop('apache_fysiologisk_score', axis=1, inplace=True)
sample_combined_df.drop('kreatinin', axis=1, inplace=True)
sample_combined_df.drop('blodurea_nitrogen', axis=1, inplace=True)

In [21]:
#drop = ['pasient_id', 'oppholdslengde']
#for i in drop:
    #sample_combined_df.drop(i, axis=1, inplace=True)

In [22]:
# Preprosesserer det nye datasettet
# Pasient_id er ikke sample. Så blir ikke noe Y
X_new_processed, y_new = fill_median(sample_combined_df, sample_combined_df)

# Gjør prediksjoner på det nye datasettet med den beste fill_zero-modellen
y_new_predictions = best_model_pipeline_fill_median.predict(X_new_processed)
# Skriv ut eller lagre prediksjonene
print("Prediksjoner for det nye datasettet:", y_new_predictions)


Prediksjoner for det nye datasettet: [-0.52868453  8.12755954 11.15141871 ... 27.25163199  6.44481717
 26.81884447]



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [23]:
# Kalkulere statistikk for prediksjonene for sammmenligninger
mean = np.mean(y_new_predictions)
median = np.median(y_new_predictions)
std = np.std(y_new_predictions)
min_val = np.min(y_new_predictions)
max_val = np.max(y_new_predictions) 
print(f'Mean: {mean:.2f}')
print(f'Median: {median:.2f}')
print(f'Standard deviation: {std:.2f}')
print(f'Minimum value: {min_val:.2f}')
print(f'Maximum value: {max_val:.2f}')

Mean: 15.88
Median: 15.23
Standard deviation: 7.53
Minimum value: -2.15
Maximum value: 47.99


In [24]:
print(X.columns)

Index(['alder', 'kjønn', 'etnisitet', 'sykehusdød', 'blodtrykk',
       'hvite_blodlegemer', 'hjertefrekvens', 'respirasjonsfrekvens',
       'kroppstemperatur', 'lungefunksjon', 'serumalbumin', 'bilirubin',
       'natrium', 'blod_ph', 'glukose', 'urinmengde', 'sykdomskategori_id',
       'sykdomskategori', 'dødsfall', 'sykdom_underkategori',
       'antall_komorbiditeter', 'koma_score', 'adl_pasient',
       'adl_stedfortreder', 'diabetes', 'demens', 'kreft', 'dnr_status',
       'dnr_dag', 'alder_gruppe', 'sosiooekonomisk_status',
       'overlevelses_proxy', 'nyrefunksjons_proxy'],
      dtype='object')


In [25]:
# Lag en DataFrame med pasient_id og prediksjoner
# Fjern negative prediksjoner
y_new_predictions = np.where(y_new_predictions < 0, 0, y_new_predictions)
predictions_df = pd.DataFrame({
    'pasient_id': sample_combined_df['pasient_id'], 
    'prediction': np.round(y_new_predictions)
})
# Lagre DataFrame til predictions.csv
predictions_df.to_csv('predictions.csv', index=False)

# Setting up for using flask with pickle

Måtte endre lambda funksjon siden den kan ikke være i pickle


In [26]:
import pickle
# Pickle.dump for lagring av modellen
pickle.dump(best_model_pipeline_fill_zero, open('model.pkl', 'wb'))