In [39]:
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_squared_error#, root_mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Plotting
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


In [40]:
# Hente inn datasettet
combined_df = pd.read_csv('combined.csv')

In [41]:
# Sjekker om er riktig
combined_df.drop('Unnamed: 0', axis=1, inplace=True)
combined_df.drop('dnr_status', axis=1, inplace=True)
combined_df.drop('dnr_dag', axis=1, inplace=True)
combined_df.drop('adl_pasient', axis=1, inplace=True)

combined_df.head()

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,...,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd
0,2,60.33899,female,12.0,$11-$25k,white,1,4.0,43.0,17.097656,...,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0
1,3,52.74698,female,12.0,under $11k,white,0,17.0,70.0,8.5,...,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5
2,4,42.38498,female,11.0,under $11k,white,0,3.0,75.0,9.099609,...,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5
3,6,93.01599,male,14.0,,white,1,4.0,110.0,10.398438,...,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0
4,7,62.37097,male,14.0,$25-$50k,white,0,9.0,78.0,11.699219,...,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7


# Data preprocess

In [42]:
# Preprocess the data
X = combined_df.drop(['pasient_id', 'oppholdslengde'], axis=1)
y = combined_df['oppholdslengde']

In [43]:
X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=24)
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=24)

# Grunnlinje

In [44]:
# Lager en grunnlinje modell som bruker gjennomsnittet av målte verdiene som prediksjon
def grunlinje_model(X_train, y_train, X_val):
    # Regner ut gjennomsnittet av målte verdiene
    grunnlinje_pred = np.mean(y_train)
    
    # Bruker gjennomsnittet som prediksjon for alle verdier i valideringssettet
    y_val_pred = np.full_like(y_val, grunnlinje_pred)
        
    return y_val_pred

    # Regner ut RMSE for grunnlinje modellen
grunlinje_rmse = mean_squared_error(y_val, grunlinje_model(X_train, y_train, X_val), squared=False)
print(f'Grunnlinje modell - Validerings RMSE: {grunlinje_rmse:.2f}')    

Grunnlinje modell - Validerings RMSE: 25.05



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



# Possible to make into bins of 5 for "better result"

In [45]:
"""max_y = y.max()
num_bins = int(np.ceil(max_y / 5))  # calculate the number of bins needed

bins = np.arange(0, max_y + 5, 5)  # create bin boundaries
labels = np.arange(num_bins)  # create labels for the bins

y_binned = pd.cut(y, bins=bins, labels=labels, include_lowest=True)
y = y_binned.astype(int)"""

'max_y = y.max()\nnum_bins = int(np.ceil(max_y / 5))  # calculate the number of bins needed\n\nbins = np.arange(0, max_y + 5, 5)  # create bin boundaries\nlabels = np.arange(num_bins)  # create labels for the bins\n\ny_binned = pd.cut(y, bins=bins, labels=labels, include_lowest=True)\ny = y_binned.astype(int)'

# Lage forskjellige strategier

In [46]:
# Istede for SimpleImputer. For lettere oversikt
def drop_nan(X, y):
    X_strategy = X.dropna()
    y_strategy = y.loc[X_strategy.index]
    return X_strategy, y_strategy

def drop_cols(X, y, threshold=4000):
    nan_counts = X.isnull().sum()
    X_strategy = X.drop(columns=nan_counts[nan_counts > threshold].index)
    X_strategy = X_strategy.fillna(0)
    y_strategy = y
    return X_strategy, y_strategy

def fill_median(X, y):
    X_fill_median = X.copy()
    median_cols = X_fill_median.select_dtypes(include=['int64', 'float64']).columns
    X_fill_median[median_cols] = X_fill_median[median_cols].fillna(X_fill_median[median_cols].median())
    X_strategy = X_fill_median
    y_strategy = y
    return X_strategy, y_strategy

def fill_zero(X, y):
    X_strategy = X.fillna(0)
    y_strategy = y
    return X_strategy, y_strategy

def knn_imputation(X, y):
    # One-hot encode categorical variabler
    categorical_cols = X.select_dtypes(include=['object']).columns
    encoder = OneHotEncoder()
    X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
    
    # Impute missing values using KNNImputer
    imputer = KNNImputer(n_neighbors=1)
    X_imputed = imputer.fit_transform(X_encoded)
    X_imputed = pd.DataFrame(X_imputed, columns=X_encoded.columns)
    
    return X_imputed, y

# Define models

In [47]:
# Definerer model pipelines
model_pipelines = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=24),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=24),
    'Decision Tree': DecisionTreeRegressor(random_state=24),
    'Linear Regression': LinearRegression(),
    'Polynomial Regression': make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    'Ridge Regression': Ridge(alpha=1.0)
}

def create_model_pipeline(model_name, preprocessor):
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_pipelines[model_name])
    ])
    return model_pipeline

# Define Preprocess data og strategier

In [48]:
# Definerer strategier og modeller
strategies = ['drop_nan', 'drop_cols', 'fill_median', 'knn_imputation', 'fill_zero']
models = list(model_pipelines.keys())

# Lager en dictionary for å lagre resultatene
results = {}

# Funksjon for å preprocessere data basert på strategien
def preprocess_data(strategy, X, y):
    if strategy == 'drop_nan':
        return drop_nan(X, y)
    elif strategy == 'drop_cols':
        return drop_cols(X, y)
    elif strategy == 'fill_median':
        return fill_median(X, y)
    elif strategy == 'fill_zero':
        return fill_zero(X, y)
    elif strategy == 'knn_imputation':
        return knn_imputation(X, y)
    
# Funksjon for å dele opp data i trenings-, validerings- og testsett
def split_data(X, y, test_size=0.2, val_size=0.25, random_state=24):
    X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=val_size, random_state=random_state)
    return X_train, X_val, X_test, y_train, y_val, y_test

# Definere en corr matrix for ekstra visualesering

In [49]:
# Funksjon for å lage en varmekart for korrelasjonsmatrisen
def plot_corr_matrix(X_strategy, strategy):
    numeric_cols = X_strategy.select_dtypes(include=['int64', 'float64']).columns
    corr_matrix = X_strategy[numeric_cols].corr()

    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        zmin=-1, zmax=1
    ))
    
    # Tilpasse layout
    fig.update_layout(
        title=f'{strategy} - Korrelasjonsmatrise',
        xaxis_nticks=len(corr_matrix.columns),
        yaxis_nticks=len(corr_matrix.columns),
        autosize=False,
        width=800,
        height=800
    )
    fig.show()


# Definere preprosess til modellene

In [50]:
# Funksjon for å lage preprocessing-pipeline
def create_preprocessor(X_strategy):
    categorical_cols = X_strategy.select_dtypes(exclude=['int64', 'float64']).columns
    numerical_cols = X_strategy.select_dtypes(include=['int64', 'float64']).columns

    categorical_transformer = Pipeline(steps=[
        ('converter', FunctionTransformer(lambda x: x.astype(str))),  
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )
    
    return preprocessor

# Tren og evaluer modell med logging av evaluering.

In [51]:
# Funksjon for å trene og evaluere modeller
def train_and_evaluate_models(X_train, y_train, X_val, y_val, models, preprocessor):
    best_rmse = float('inf')
    best_model_name = None
    best_model_pipeline = None

    for model_name in models:
        model_pipeline = create_model_pipeline(model_name, preprocessor)
        model_pipeline.fit(X_train, y_train)
        y_val_pred = model_pipeline.predict(X_val)
        rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
        
        print(f'{model_name}: {rmse_val:.2f}')
        
        if rmse_val < best_rmse:
            best_rmse = rmse_val
            best_model_name = model_name
            best_model_pipeline = model_pipeline

    return best_model_name, best_model_pipeline, best_rmse


# Funksjon for å logge og vise resultater
def log_results(strategy, best_model_name, best_rmse, within_10_diff_percent, rmse_full, within_10_diff_percent_full, results, best_model_pipeline):
    results[strategy] = {
        'best_model': best_model_name,
        'validation_rmse': best_rmse,
        'model_pipeline': best_model_pipeline,  # Store the actual model pipeline
        'within_10_diff_percent': within_10_diff_percent,
        'full_dataset_rmse': rmse_full,
        'within_10_diff_percent_full': within_10_diff_percent_full
    }

    print(f'{strategy} - Best Model: {best_model_name}')
    print(f'{strategy} - Validation RMSE: {best_rmse:.2f}')
    print(f'{strategy} - Within 10 diff: {within_10_diff_percent:.2f}%')
    print(f'{strategy} - Full Dataset RMSE: {rmse_full:.2f}')
    print(f'{strategy} - Within 10 diff (Full Dataset): {within_10_diff_percent_full:.2f}%')


# Evaluere og teste alle modellene

In [52]:
# Hovedsløyfe over strategier
for strategy in strategies:
    # Preprosesserer data
    X_strategy, y_strategy = preprocess_data(strategy, X, y)
    
    # Splitter data inn i trenings-, validerings- og testsett
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(X_strategy, y_strategy)
    
    # Plotter korrelasjonsmatrise
    plot_corr_matrix(X_strategy, strategy)

    # Lager preprocessing pipeline
    preprocessor = create_preprocessor(X_strategy)

    # Trener modeller og evaluerer dem
    best_model_name, best_model_pipeline, best_rmse = train_and_evaluate_models(X_train, y_train, X_val, y_val, models, preprocessor)

    # Evaluerer prediksjoner innenfor 10 enheter fra de faktiske verdiene
    y_val_pred = best_model_pipeline.predict(X_val)
    diff = np.abs(y_val - y_val_pred)
    within_10_diff_percent = np.sum(diff <= 10) / len(y_val) * 100

    # Preprosessering og prediksjon på hele datasettet
    X_strategy, y_strategy = preprocess_data('fill_median', X, y)
    if strategy == 'knn_imputation':
        X_strategy, y_strategy = preprocess_data('knn_imputation', X, y)

    y_full_pred = best_model_pipeline.predict(X_strategy)
    rmse_full = mean_squared_error(y, y_full_pred, squared=False)
    diff_full = np.abs(y - y_full_pred)
    within_10_diff_percent_full = np.sum(diff_full <= 10) / len(y) * 100
    
    # Logger og viser resultater
    log_results(strategy, best_model_name, best_rmse, within_10_diff_percent, rmse_full, within_10_diff_percent_full, results, best_model_pipeline)


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest: 18.65



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Gradient Boosting: 17.93
Decision Tree: 26.77
Linear Regression: 17.78
Polynomial Regression: 36.45
Ridge Regression: 17.67
drop_nan - Best Model: Ridge Regression
drop_nan - Validation RMSE: 17.67
drop_nan - Within 10 diff: 58.17%
drop_nan - Full Dataset RMSE: 20.75
drop_nan - Within 10 diff (Full Dataset): 61.43%



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.




'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest: 23.11



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Gradient Boosting: 22.96
Decision Tree: 29.42
Linear Regression: 23.40
Polynomial Regression: 39783906212.06
Ridge Regression: 23.40
drop_cols - Best Model: Gradient Boosting
drop_cols - Validation RMSE: 22.96
drop_cols - Within 10 diff: 65.78%
drop_cols - Full Dataset RMSE: 18.55
drop_cols - Within 10 diff (Full Dataset): 65.47%



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.




'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest: 23.34



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Gradient Boosting: 23.29
Decision Tree: 29.32



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Linear Regression: 23.41



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Polynomial Regression: 2907374986.76
Ridge Regression: 23.41
fill_median - Best Model: Gradient Boosting
fill_median - Validation RMSE: 23.29
fill_median - Within 10 diff: 64.88%
fill_median - Full Dataset RMSE: 18.39
fill_median - Within 10 diff (Full Dataset): 67.15%



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest: 23.42



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Gradient Boosting: 23.31
Decision Tree: 30.54
Linear Regression: 23.45



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Polynomial Regression: 115506436403.06
Ridge Regression: 23.44
knn_imputation - Best Model: Gradient Boosting
knn_imputation - Validation RMSE: 23.31
knn_imputation - Within 10 diff: 64.62%
knn_imputation - Full Dataset RMSE: 18.55
knn_imputation - Within 10 diff (Full Dataset): 67.05%



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.




'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Random Forest: 23.09



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Gradient Boosting: 23.18
Decision Tree: 29.18
Linear Regression: 23.41



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Polynomial Regression: 458617939001.74
Ridge Regression: 23.41
fill_zero - Best Model: Random Forest
fill_zero - Validation RMSE: 23.09
fill_zero - Within 10 diff: 63.20%
fill_zero - Full Dataset RMSE: 15.76
fill_zero - Within 10 diff (Full Dataset): 75.66%



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



# Predikere ny data med full dataset basert på beste strategi

In [53]:
import pandas as pd

# Lese inn data og se på de første radene.
samp_demographic_df = pd.read_csv('sample_data\demographic.csv')
samp_hospital_df = pd.read_csv('sample_data\hospital.csv')
samp_physiological_df = pd.read_csv('sample_data\physiological.txt', sep='\t')
samp_severity_df = pd.read_json('sample_data\severity.json')

In [54]:
# Siden severity.df er på annen måte satt opp, må fikse det med explode listene.
exp = [col for col in samp_severity_df.columns if samp_severity_df[col].apply(lambda x: isinstance(x, list)).any()]
samp_severity_df = samp_severity_df.explode(exp)

In [55]:
# Merge dataframe
sample_combined_df = pd.merge(samp_demographic_df, samp_hospital_df, on='pasient_id', how='left')
sample_combined_df = pd.merge(sample_combined_df, samp_physiological_df, on='pasient_id', how='left')
sample_combined_df = pd.merge(sample_combined_df, samp_severity_df, on='pasient_id', how='left')

# Fjerne duplicates basert på 'pasient_id' som kan skje i mergen.
sample_combined_df = sample_combined_df.drop_duplicates(subset=['pasient_id'], keep='first')
sample_combined_df.head(5)

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,blodtrykk,hvite_blodlegemer,hjertefrekvens,...,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,dnr_dag
0,1,62.84998,male,11.0,$11-$25k,other,0,97.0,6.0,69.0,...,20.0,0.262939,0.036995,0,0,metastatic,0.5,0.25,no dnr,5.0
1,22,48.70398,male,16.0,,other,0,66.0,12.5,125.0,...,35.0,0.535889,0.213989,0,0,metastatic,0.5,0.125,dnr ved innleggelse,11.0
2,25,29.36099,female,17.0,$25-$50k,white,0,96.0,10.599609,112.0,...,10.0,0.853882,0.674927,0,0,metastatic,0.5,0.1,no dnr,4.0
3,26,53.84,male,,under $11k,white,0,134.0,7.799805,106.0,...,16.0,0.939941,0.897949,0,0,no,0.9,0.6,dnr ved innleggelse,4.0
4,29,30.10799,male,,,asian,0,92.0,8.798828,112.0,...,21.0,0.823975,0.764893,0,0,no,,,no dnr,10.0


# Bruke beste modell og strategi

In [56]:
# Hent den beste modellen for fill_zero-strategien
print(results['fill_zero'])
# Bruk av pippelinen:
best_model_pipeline_fill_zero = results['fill_zero']['model_pipeline'] 

{'best_model': 'Random Forest', 'validation_rmse': 23.09145883129321, 'model_pipeline': Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['alder', 'utdanning', 'sykehusdød', 'blodtrykk', 'hvite_blodlegemer',
       'hjertefrekvens', 'respirasjonsfrekvens', 'kroppstemperatur',
       'lungefunksjon', 'serumalbumin', 'bilirubin', 'kreatinin', 'natrium',
       'blod_ph', 'glukose', 'blodurea_nitroge...
                                                  Pipeline(steps=[('converter',
                                                                   FunctionTransformer(func=<function create_preprocessor.<locals>.<lambda> at 0x00000182BA255F80>)),
                                                                  ('onehot',
           

In [57]:
#drop = ['pasient_id', 'oppholdslengde']
#for i in drop:
    #sample_combined_df.drop(i, axis=1, inplace=True)

In [58]:
# Preprosesserer det nye datasettet
X_new_processed, y_new = fill_zero(sample_combined_df, 0)

# Gjør prediksjoner på det nye datasettet med den beste fill_zero-modellen
y_new_predictions = best_model_pipeline_fill_zero.predict(X_new_processed)
# Skriv ut eller lagre prediksjonene
print("Prediksjoner for det nye datasettet:", y_new_predictions)

Prediksjoner for det nye datasettet: [14.09 18.44 12.71 ... 31.58  6.52 38.05]



Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [61]:
# Lag en DataFrame med pasient_id og prediksjoner
predictions_df = pd.DataFrame({
    'pasient_id': sample_combined_df['pasient_id'], 
    'prediction': np.round(y_new_predictions)
})
# Lagre DataFrame til predictions.csv
predictions_df.to_csv('predictions.csv', index=False)