In [35]:
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_squared_error#, root_mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Plotting
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


In [36]:
# Load the combined patient dataset from the CSV file in the working directory
combined_df = pd.read_csv(filepath_or_buffer='combined.csv')

In [37]:
# Remove the 'Unnamed: 0' column (presumably the row index saved into the CSV — confirm)
# and preview the frame to check that it loaded correctly.
# Reassigning with errors='ignore' keeps this cell safe to re-run: an in-place drop
# raises KeyError the second time because the column is already gone.
combined_df = combined_df.drop(columns='Unnamed: 0', errors='ignore')
combined_df.head()

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,...,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd,dnr_status,dnr_dag
0,2,60.33899,female,12.0,$11-$25k,white,1,4.0,43.0,17.097656,...,74.0,0.001,0.0,0,0,no,0.0,0.0,,
1,3,52.74698,female,12.0,under $11k,white,0,17.0,70.0,8.5,...,45.0,0.790894,0.664917,0,0,no,0.75,0.5,,
2,4,42.38498,female,11.0,under $11k,white,0,3.0,75.0,9.099609,...,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5,,
3,6,93.01599,male,14.0,,white,1,4.0,110.0,10.398438,...,27.0,0.284973,0.214996,0,0,no,0.0,0.0,,
4,7,62.37097,male,14.0,$25-$50k,white,0,9.0,78.0,11.699219,...,46.0,0.892944,0.820923,0,0,no,,0.7,,


# Data preprocessing

In [38]:
# Separate the regression target (length of stay) from the predictors.
# The patient id is dropped from the predictors together with the target column.
y = combined_df['oppholdslengde']
X = combined_df.drop(columns=['pasient_id', 'oppholdslengde'])

# Optional: bin the target into intervals of 5 for a possibly "better result"

In [39]:
# Disabled experiment: the bare triple-quoted string below is never executed;
# the cell only echoes the string back as its output.
# The sketch bins `y` into width-5 intervals with pd.cut and would overwrite `y`
# with the integer bin labels.
"""max_y = y.max()
num_bins = int(np.ceil(max_y / 5))  # calculate the number of bins needed

bins = np.arange(0, max_y + 5, 5)  # create bin boundaries
labels = np.arange(num_bins)  # create labels for the bins

y_binned = pd.cut(y, bins=bins, labels=labels, include_lowest=True)
y = y_binned.astype(int)"""

'max_y = y.max()\nnum_bins = int(np.ceil(max_y / 5))  # calculate the number of bins needed\n\nbins = np.arange(0, max_y + 5, 5)  # create bin boundaries\nlabels = np.arange(num_bins)  # create labels for the bins\n\ny_binned = pd.cut(y, bins=bins, labels=labels, include_lowest=True)\ny = y_binned.astype(int)'

# Lage forskjellige strategier

In [40]:
# Hand-written missing-value strategies used instead of SimpleImputer, for an easier overview
def drop_nan(X, y):
    """Keep only the rows of X that have no missing value.

    Parameters:
        X: feature DataFrame.
        y: target Series sharing X's index labels.

    Returns:
        (X_complete, y_complete): the fully observed rows of X and the
        entries of y selected by those rows' index labels.
    """
    complete_rows = X.dropna()
    # Select the matching targets by label so X and y stay aligned.
    return complete_rows, y.loc[complete_rows.index]

def drop_cols(X, y, threshold=0):
    """Drop every column of X that has more than `threshold` missing values.

    Missing values that remain in the kept columns (only possible when
    threshold > 0) are replaced with 0.

    Parameters:
        X: feature DataFrame.
        y: target Series; returned unchanged.
        threshold: maximum number of missing values a column may have and
            still be kept.

    Returns:
        (X_reduced, y)
    """
    missing_per_column = X.isnull().sum()
    too_sparse = [col for col, n_missing in missing_per_column.items() if n_missing > threshold]
    reduced = X.drop(columns=too_sparse).fillna(0)
    return reduced, y

def fill_median(X, y):
    """Fill missing values in int64/float64 columns with the column median.

    Columns of any other dtype are left as they are, including their
    missing values. X itself is not modified; a copy is returned.

    Parameters:
        X: feature DataFrame.
        y: target Series; returned unchanged.

    Returns:
        (X_filled, y)
    """
    filled = X.copy()
    numeric = filled.select_dtypes(include=['int64', 'float64']).columns
    medians = filled[numeric].median()
    filled[numeric] = filled[numeric].fillna(medians)
    return filled, y

def fill_zero(X, y):
    """Replace every missing value in X with 0.

    Parameters:
        X: feature DataFrame.
        y: target Series; returned unchanged.

    Returns:
        (X_filled, y)
    """
    return X.fillna(value=0), y

def knn_imputation(X, y):
    """Impute missing values in X with a 5-nearest-neighbours imputer.

    Object-dtype columns are first expanded into dummy columns with
    pd.get_dummies, then KNNImputer is fitted on and applied to the whole
    encoded frame.

    Parameters:
        X: feature DataFrame.
        y: target Series; returned unchanged.

    Returns:
        (X_imputed, y): X_imputed is an all-numeric DataFrame with the
        dummy-encoded columns and the same index as X.
    """
    # One-hot encode categorical variables.
    # NOTE(review): get_dummies is called without dummy_na, so a missing category
    # becomes an all-zero dummy row rather than a value the imputer fills in.
    categorical_cols = X.select_dtypes(include=['object']).columns
    X_encoded = pd.get_dummies(X, columns=categorical_cols)

    # Impute missing values using KNNImputer
    imputer = KNNImputer(n_neighbors=5)
    X_imputed = imputer.fit_transform(X_encoded)
    # Re-attach the original index as well as the column names; without it the
    # result gets a fresh 0..n-1 index that no longer matches y's labels.
    X_imputed = pd.DataFrame(X_imputed, columns=X_encoded.columns, index=X_encoded.index)

    return X_imputed, y

# Define models

In [41]:
# Define the model pipelines
# Candidate regressors keyed by display name. The insertion order is the order
# in which the models are evaluated.
model_pipelines = {
    # Tree-based models use a fixed random_state so repeated runs give the same fit.
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=24),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=24),
    'Decision Tree': DecisionTreeRegressor(random_state=24),
    'Linear Regression': LinearRegression(),
    # Degree-2 polynomial feature expansion followed by an ordinary linear regression.
    'Polynomial Regression': make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    'Ridge Regression': Ridge(alpha=1.0)
}

# Build a preprocessing + model pipeline for one named model
def create_model_pipeline(model_name, preprocessor):
    """Return a two-step Pipeline: `preprocessor` followed by the named model.

    Parameters:
        model_name: key into the module-level `model_pipelines` dict.
        preprocessor: transformer used as the first pipeline step.

    Returns:
        An unfitted-by-this-function sklearn Pipeline with steps
        'preprocessor' and 'model'.

    Raises:
        KeyError: if `model_name` is not a key of `model_pipelines`.

    Note:
        The estimator is taken from `model_pipelines` as-is, not copied, so
        every pipeline built for the same model name shares one estimator
        object.
    """
    estimator = model_pipelines[model_name]
    return Pipeline(steps=[('preprocessor', preprocessor), ('model', estimator)])

# Define the function to evaluate a model
def evaluate_model(model_pipeline, X_train, y_train, X_val, y_val):
    """Fit a pipeline on the training data and return its validation RMSE.

    Parameters:
        model_pipeline: estimator exposing fit(X, y) and predict(X); it is
            fitted in place.
        X_train, y_train: training features and targets.
        X_val, y_val: validation features and targets.

    Returns:
        Root mean squared error of the predictions on the validation set.
    """
    model_pipeline.fit(X_train, y_train)
    y_val_pred = model_pipeline.predict(X_val)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated in newer scikit-learn releases and later removed.
    rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
    return rmse_val

In [42]:
# Define the strategies and models.
# Dispatch table: strategy name -> function returning (X_strategy, y_strategy).
# An unknown name now fails with a KeyError instead of silently reusing the
# previous iteration's data.
strategy_functions = {
    'drop_nan': drop_nan,
    'drop_cols': drop_cols,
    'fill_median': fill_median,
    'fill_zero': fill_zero,
    'knn_imputation': knn_imputation,
}
strategies = list(strategy_functions)
models = list(model_pipelines.keys())

# Create a dictionary to store the results (one entry per strategy)
results = {}

for strategy in strategies:
    # NOTE(review): the strategy is applied to the full data before the split below,
    # so median / KNN statistics are computed with validation and test rows included.
    # drop_nan also removes rows, so its validation set is not the same set of
    # patients as for the other strategies; compare scores across strategies with care.
    X_strategy, y_strategy = strategy_functions[strategy](X, y)

    # Split the data into training, validation, and testing sets.
    # 20% is held out as test, then 25% of the remaining 80% becomes validation
    # (60/20/20 overall). The test split is not used in this cell.
    X_train_temp, X_test, y_train_temp, y_test = train_test_split(X_strategy, y_strategy, test_size=0.2, random_state=24)
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=24)

    # Compute the correlation matrix of the int64/float64 feature columns
    numeric_cols = X_strategy.select_dtypes(include=['int64', 'float64']).columns
    corr_matrix = X_strategy[numeric_cols].corr()

    # Create the heatmap using Plotly
    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        zmin=-1, zmax=1
    ))

    # Update layout to add a title and adjust sizing
    fig.update_layout(
        title=f'{strategy} - Correlation Matrix',
        xaxis_nticks=len(corr_matrix.columns),
        yaxis_nticks=len(corr_matrix.columns),
        autosize=False,
        width=800,  # you can adjust the size here
        height=800
    )

    # Show the heatmap
    fig.show()

    # Everything that is not int64/float64 is treated as categorical
    categorical_cols = X_strategy.select_dtypes(exclude=['int64', 'float64']).columns
    numerical_cols = [col for col in X_strategy.columns if col not in categorical_cols]

    categorical_transformer = Pipeline(steps=[
        # Cast to str first so mixed-type columns (e.g. NaN next to strings) encode uniformly
        ('converter', FunctionTransformer(lambda x: x.astype(str))),  # Convert to strings
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Track the model with the lowest validation RMSE for this strategy
    best_rmse = float('inf')
    best_model_name = None
    best_model_pipeline = None

    for model_name in models:
        model_pipeline = create_model_pipeline(model_name, preprocessor)
        model_pipeline.fit(X_train, y_train)
        y_val_pred = model_pipeline.predict(X_val)
        # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error
        # is deprecated in newer scikit-learn releases and later removed.
        rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
        print(f'{model_name}: {rmse_val}')
        if rmse_val < best_rmse:
            best_rmse = rmse_val
            best_model_name = model_name
            best_model_pipeline = model_pipeline


    # Calculate the predicted values for the validation set
    y_val_pred = best_model_pipeline.predict(X_val)

    # Calculate the absolute difference between predicted and actual values
    diff = np.abs(y_val - y_val_pred)

    # Calculate the percentage of values with 10 or less difference
    within_10_diff_percent = np.sum(diff <= 10) / len(y_val) * 100

    results[strategy] = {
        'best_model': best_model_name,
        'validation_rmse': best_rmse,
        'within_10_diff_percent': within_10_diff_percent
    }

    print(f'{strategy} - Best Model: {best_model_name}')
    print(f'{strategy} - Validation RMSE: {best_rmse:.2f}')
    print(f'{strategy} - Within 10 diff: {within_10_diff_percent:.2f}%')
    

Random Forest: 9.993976519217297
Gradient Boosting: 9.52645530985956
Decision Tree: 10.279429296739517
Linear Regression: 15.880797983165149
Polynomial Regression: 12.21768834778805
Ridge Regression: 14.035118824312129
drop_nan - Best Model: Gradient Boosting
drop_nan - Validation RMSE: 9.53
drop_nan - Within 10 diff: 77.78%


Random Forest: 23.347446713646693
Gradient Boosting: 23.11858712795693
Decision Tree: 30.45485280720314
Linear Regression: 23.42199974169499
Polynomial Regression: 155155206683.68515
Ridge Regression: 23.421100966454524
drop_cols - Best Model: Gradient Boosting
drop_cols - Validation RMSE: 23.12
drop_cols - Within 10 diff: 65.46%


Random Forest: 23.176161548041243
Gradient Boosting: 23.064143448343994
Decision Tree: 28.193851687247058
Linear Regression: 23.371542258040392
Polynomial Regression: 18992739657.46476
Ridge Regression: 23.36942771162878
fill_median - Best Model: Gradient Boosting
fill_median - Validation RMSE: 23.06
fill_median - Within 10 diff: 66.11%


Random Forest: 23.02021721545717
Gradient Boosting: 23.09768121270451
Decision Tree: 29.070307778004196
Linear Regression: 23.375993580974523
Polynomial Regression: 33431983522.500435
Ridge Regression: 23.374052870318682
fill_zero - Best Model: Random Forest
fill_zero - Validation RMSE: 23.02
fill_zero - Within 10 diff: 64.88%


Random Forest: 23.07190121229297
Gradient Boosting: 23.01735531831297
Decision Tree: 29.919923314164514
Linear Regression: 23.373661428878542
Polynomial Regression: 7655386479759.703
Ridge Regression: 23.37065010442737
knn_imputation - Best Model: Gradient Boosting
knn_imputation - Validation RMSE: 23.02
knn_imputation - Within 10 diff: 65.65%


# Sammenligne med full dataset