In [24]:
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_squared_error#, root_mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures

# Plotting
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


In [25]:
# Hente inn datasettet
combined_df = pd.read_csv('combined.csv')

In [26]:
# Sjekker om er riktig
combined_df.drop('Unnamed: 0', axis=1, inplace=True)
combined_df.drop('dnr_status', axis=1, inplace=True)
combined_df.drop('dnr_dag', axis=1, inplace=True)
combined_df.drop('adl_pasient', axis=1, inplace=True)
combined_df.head()

Unnamed: 0,pasient_id,alder,kjønn,utdanning,inntekt,etnisitet,sykehusdød,oppholdslengde,blodtrykk,hvite_blodlegemer,...,adl_stedfortreder,fysiologisk_score,apache_fysiologisk_score,overlevelsesestimat_2mnd,overlevelsesestimat_6mnd,diabetes,demens,kreft,lege_overlevelsesestimat_2mnd,lege_overlevelsesestimat_6mnd
0,2,60.33899,female,12.0,$11-$25k,white,1,4.0,43.0,17.097656,...,1.0,52.695312,74.0,0.001,0.0,0,0,no,0.0,0.0
1,3,52.74698,female,12.0,under $11k,white,0,17.0,70.0,8.5,...,0.0,20.5,45.0,0.790894,0.664917,0,0,no,0.75,0.5
2,4,42.38498,female,11.0,under $11k,white,0,3.0,75.0,9.099609,...,0.0,20.097656,19.0,0.698975,0.411987,0,0,metastatic,0.9,0.5
3,6,93.01599,male,14.0,,white,1,4.0,110.0,10.398438,...,1.0,19.398438,27.0,0.284973,0.214996,0,0,no,0.0,0.0
4,7,62.37097,male,14.0,$25-$50k,white,0,9.0,78.0,11.699219,...,1.0,17.296875,46.0,0.892944,0.820923,0,0,no,,0.7


# Data preprocess

In [27]:
# Preprocess the data
X = combined_df.drop(['pasient_id', 'oppholdslengde'], axis=1)
y = combined_df['oppholdslengde']

# Lage forskjellige strategier

In [28]:
def drop_nan(X, y):
    X_strategy = X.dropna()
    y_strategy = y.loc[X_strategy.index]
    return X_strategy, y_strategy

def drop_cols(X, y, threshold=0):
    nan_counts = X.isnull().sum()
    X_strategy = X.drop(columns=nan_counts[nan_counts > threshold].index)
    X_strategy = X_strategy.fillna(0)
    y_strategy = y
    return X_strategy, y_strategy

def fill_median(X, y):
    X_fill_median = X.copy()
    median_cols = X_fill_median.select_dtypes(include=['int64', 'float64']).columns
    X_fill_median[median_cols] = X_fill_median[median_cols].fillna(X_fill_median[median_cols].median())
    X_strategy = X_fill_median
    y_strategy = y
    return X_strategy, y_strategy

def fill_zero(X, y):
    X_strategy = X.fillna(0)
    y_strategy = y
    return X_strategy, y_strategy

def knn_imputation(X, y):
    # One-hot encode categorical variables
    categorical_cols = X.select_dtypes(include=['object']).columns
    encoder = OneHotEncoder()
    X_encoded = pd.get_dummies(X, columns=categorical_cols)
    
    # Impute missing values using KNNImputer
    imputer = KNNImputer(n_neighbors=5)
    X_imputed = imputer.fit_transform(X_encoded)
    X_imputed = pd.DataFrame(X_imputed, columns=X_encoded.columns)
    
    return X_imputed, y

# Define models

In [29]:
# Define the model pipelines
model_pipelines = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=24),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=24),
    'Decision Tree': DecisionTreeRegressor(random_state=24),
    'Linear Regression': LinearRegression(),
    'Polynomial Regression': make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    'Ridge Regression': Ridge(alpha=1.0)
}

# Define the function to create a model pipeline
def create_model_pipeline(model_name, preprocessor):
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_pipelines[model_name])
    ])
    return model_pipeline

# Define the function to evaluate a model
def evaluate_model(model_pipeline, X_train, y_train, X_val, y_val):
    model_pipeline.fit(X_train, y_train)
    y_val_pred = model_pipeline.predict(X_val)
    rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
    return rmse_val

In [30]:
# Define the strategies and models
strategies = ['drop_nan', 'drop_cols', 'fill_median', 'fill_zero', 'knn_imputation']
models = list(model_pipelines.keys())

# Create a dictionary to store the results
results = {}

for strategy in strategies:
    if strategy == 'drop_nan':
        X_strategy, y_strategy = drop_nan(X, y)
    elif strategy == 'drop_cols':
        X_strategy, y_strategy = drop_cols(X, y)
    elif strategy == 'fill_median':
        X_strategy, y_strategy = fill_median(X, y)
    elif strategy == 'fill_zero':
        X_strategy, y_strategy = fill_zero(X, y)
    elif strategy == 'knn_imputation':
        X_strategy, y_strategy = knn_imputation(X, y)

    # Split the data into training, validation, and testing sets
    X_train_temp, X_test, y_train_temp, y_test = train_test_split(X_strategy, y_strategy, test_size=0.2, random_state=24)
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.25, random_state=24)

    # Compute the correlation matrix
    numeric_cols = X_strategy.select_dtypes(include=['int64', 'float64']).columns
    corr_matrix = X_strategy[numeric_cols].corr()

    # Create the heatmap using Plotly
    fig = go.Figure(data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        zmin=-1, zmax=1
    ))

    # Update layout to add a title and adjust sizing
    fig.update_layout(
        title=f'{strategy} - Correlation Matrix',
        xaxis_nticks=len(corr_matrix.columns),
        yaxis_nticks=len(corr_matrix.columns),
        autosize=False,
        width=800,  # you can adjust the size here
        height=800
    )

    # Show the heatmap
    fig.show()

    categorical_cols = X_strategy.select_dtypes(exclude=['int64', 'float64']).columns
    numerical_cols = [ col for col in X_strategy.columns if col not in categorical_cols]

    categorical_transformer = Pipeline(steps=[
        ('converter', FunctionTransformer(lambda x: x.astype(str))),  # Convert to strings
        ('onehot', OneHotEncoder())
    ])
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    best_rmse = float('inf')
    best_model_name = None
    best_model_pipeline = None

    for model_name in models:
        model_pipeline = create_model_pipeline(model_name, preprocessor)
        rmse_val = evaluate_model(model_pipeline, X_train, y_train, X_val, y_val)
        print(f'{model_name}: {rmse_val}')
        if rmse_val < best_rmse:
            best_rmse = rmse_val
            best_model_name = model_name
            best_model_pipeline = model_pipeline

    results[strategy] = {
        'best_model': best_model_pipeline,
        'validation_rmse': best_rmse
    }

    print(f'{strategy} - Best Model: {best_model_name}')
    print(f'{strategy} - Validation RMSE: {best_rmse:.2f}')

Random Forest: 18.645658591579046
Gradient Boosting: 17.925548829015245
Decision Tree: 26.7683111843114
Linear Regression: 17.78456449898177
Polynomial Regression: 36.44640473779979
Ridge Regression: 17.671256632649296
drop_nan - Best Model: Ridge Regression
drop_nan - Validation RMSE: 17.67


Random Forest: 23.347446713646693
Gradient Boosting: 23.11858712795693
Decision Tree: 30.45485280720314
Linear Regression: 23.42199974169499
Polynomial Regression: 155155206683.68515
Ridge Regression: 23.421100966454524
drop_cols - Best Model: Gradient Boosting
drop_cols - Validation RMSE: 23.12


Random Forest: 23.341117518483195
Gradient Boosting: 23.28905867073958
Decision Tree: 29.321113286277196
Linear Regression: 23.407745824695382
Polynomial Regression: 27182145484.761013
Ridge Regression: 23.40631490452461
fill_median - Best Model: Gradient Boosting
fill_median - Validation RMSE: 23.29


Random Forest: 23.09145883129321
Gradient Boosting: 23.178276025093645
Decision Tree: 29.17970816669355
Linear Regression: 23.4111549364483
Polynomial Regression: 1581677443460.283
Ridge Regression: 23.410204896844206
fill_zero - Best Model: Random Forest
fill_zero - Validation RMSE: 23.09


Random Forest: 23.22422169665251
Gradient Boosting: 23.137600696099426
Decision Tree: 29.354890105259294
Linear Regression: 23.415712301353796
Polynomial Regression: 7202087745919.706
Ridge Regression: 23.41332569802514
knn_imputation - Best Model: Gradient Boosting
knn_imputation - Validation RMSE: 23.14


# Sammenligne med full dataset

In [31]:
# Finne % av data med differanse mindre enn 10
for strategy in strategies:
    best_model_pipeline = best_models[strategy]
    best_model_pipeline.fit(X_train, y_train)
    y_test_pred = best_model_pipeline.predict(X_test)

    X_test_filled = X_test.copy()
    X_test_filled['predicted_value'] = y_test_pred
    diffs = (X_test_filled['predicted_value']) - y_test
    diff_counts = {}
    for diff in diffs:
        diff = round(diff)  # Round til heltall
        if diff in diff_counts:
            diff_counts[diff] += 1
        else:
            diff_counts[diff] = 1

    total_less_than_10 = sum(count for diff, count in diff_counts.items() if abs(diff) < 10)
    total_diffs = len(diffs)
    percentage_less_than_10 = (total_less_than_10 / total_diffs) * 100

    print(f'{strategy} - Prosent av data med forskjell mindre enn 10 på full dataset (Comibined_df): {percentage_less_than_10:.2f}%')

NameError: name 'best_models' is not defined