In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from permetrics.regression import Metrics
from sklearn import tree

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet

from tensorflow import keras
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

from sklearn.pipeline import Pipeline

import statsmodels.api as sm
from dython import nominal
import graphviz


## Combine all dataframes

In [None]:
# Build `features_df`: load the treatment, procedure and diagnosis exports, join them
# per treatment, derive the grouping features and the preparation-time target `Tid`.
DATA_DIR = Path('../20210324/with_name')

# Load behandling (treatments) and select relevant columns
behandling = pd.read_csv(DATA_DIR / 'behandling_optillfälle.csv', sep=';')
behandling = behandling[['Der_Behandling_PK',
                         'Der_Opkort_FK',
                         'Der_Anestesikort_FK',
                         'Der_Vårdform_FK',
                         'Der_Prioritet_FK',
                         'BehandlingsStatus',
                         'ASAklass',
                         'ForberedelsetidStartTidpunkt',
                         'ForberedelsetidSlutTidpunkt',
                         'PatientÅlderVidOp',
                         'Veckodag',
                         'Starttimme',
                         'BMI',
                         'Kroppslängd',
                         'Kroppsvikt',
                         'OperationstidStart',
                         'AnestesitidStart'
                        ]]
# Keep completed operations only; aborted ('abrutna') ones do not contain all relevant data
behandling = behandling[behandling['BehandlingsStatus'] == 'Opererad']
print("Behandling length: {}".format(len(behandling)))

# Load ingrepp (procedures) and select relevant columns
ingrepp = pd.read_csv(DATA_DIR / 'op_ingrepp_namn.csv')
ingrepp = ingrepp[['Der_Behandling_PK',
                   'Ingreppkod',
                   'Primär_Sekundär',
                   'Sida',
                  ]]
# Only primary procedures are used; secondary ones might be worth including in a more complicated model
ingrepp = ingrepp[ingrepp['Primär_Sekundär'] == 'Primär']
print("Ingrepp length: {}".format(len(ingrepp)))

# Load diagnos (diagnoses) and select relevant columns
diagnos = pd.read_csv(DATA_DIR / 'op_diagnos_namn.csv')
diagnos = diagnos[['Der_Behandling_PK',
                   'Diagnoskod',
                   'Primär_Sekundär',
                  ]]
# Only primary diagnoses are used; secondary ones might be worth including in a more complicated model
diagnos = diagnos[diagnos['Primär_Sekundär'] == 'Primär']
print("Diagnos length: {}".format(len(diagnos)))

# Combine the data frames on the treatment key and drop every row that has a missing value
combined_df = behandling.merge(diagnos, on='Der_Behandling_PK').merge(ingrepp, on='Der_Behandling_PK')
combined_df = combined_df.dropna()
print("Combined length: {}".format(len(combined_df)))

# Group codes: first two characters of the procedure code, first character of the diagnosis code.
# Vectorised slicing replaces the row loops, which also rebound `ingrepp`/`diagnos` from
# DataFrames to plain strings.
combined_df['IngreppsGrupp'] = combined_df['Ingreppkod'].str[0:2]
combined_df['DiagnosGrupp'] = combined_df['Diagnoskod'].str[0]

# Convert each timestamp string to minutes since midnight: the hour is read from characters
# [-12:-10] and the minute from [-9:-7] (presumably the strings end in HH:MM:SS.fff — TODO confirm).
time_columns = ['ForberedelsetidStartTidpunkt',
                'ForberedelsetidSlutTidpunkt',
                'AnestesitidStart',
                'OperationstidStart']
minutes_of_day = pd.DataFrame({
    column: combined_df[column].str[-12:-10].astype(int) * 60
            + combined_df[column].str[-9:-7].astype(int)
    for column in time_columns
})

# Target: preparation time + anaesthesia preparation time, in minutes.
# NOTE(review): only the clock time is used, so an interval that crosses midnight comes out negative.
# NOTE(review): the second term is AnestesitidStart - OperationstidStart; confirm this sign is intended.
combined_df['Tid'] = (
    (minutes_of_day['ForberedelsetidSlutTidpunkt'] - minutes_of_day['ForberedelsetidStartTidpunkt'])
    + (minutes_of_day['AnestesitidStart'] - minutes_of_day['OperationstidStart'])
)

# Remove all features we don't want (keys, raw timestamps, raw codes and the merge-suffixed filter columns)
features_df = combined_df.drop(columns=["Der_Behandling_PK",
                                        "Der_Opkort_FK",
                                        "Der_Anestesikort_FK",
                                        "BehandlingsStatus",
                                        "ForberedelsetidStartTidpunkt",
                                        "ForberedelsetidSlutTidpunkt",
                                        "Primär_Sekundär_x",
                                        "Primär_Sekundär_y",
                                        "AnestesitidStart",
                                        "OperationstidStart",
                                        "Diagnoskod",
                                        "Ingreppkod",
                                       ])
# Restrict the data to the six procedure groups listed here
features_df = features_df[features_df['IngreppsGrupp'].isin(['NC', 'NH', 'NB', 'NG', 'NF', 'ND'])]

# Instantiate Metrics so we can use MAAPE later
metrics = Metrics()

In [None]:
# Preview the first five rows of the prepared feature table
features_df.head(n=5)

### Handle NaN (ONLY DO ONE OF THESE)

**Remove rows with NaN** (Good)

In [None]:
# Discard incomplete rows, then separate the predictors from the target column `Tid`
features_df = features_df.dropna()
X = features_df.drop(columns="Tid")
y = features_df["Tid"]

### Encoding (ONLY DO ONE OF THESE)

**Use one-hot encoding for the categorical columns (e.g. "Sida", "IngreppsGrupp" and "DiagnosGrupp")** (This seems to be the better alternative)

In [None]:
# One-hot encode the categorical columns, hold out 20% as validation data,
# then hold out 20% of the remainder as test data (same seed for both splits).
X = pd.get_dummies(X)
split_options = {'test_size': 0.2, 'random_state': 66}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, **split_options)

### Trying models

In [None]:
# Try DummyRegressor as a base case; anything worse than this is really terrible
regr = DummyRegressor()
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Single decision tree; random_state is fixed so repeated runs give the same tree
regr = DecisionTreeRegressor(random_state=66)
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Random forest; random_state is fixed so the bootstrap samples are reproducible
forest_regr = RandomForestRegressor(max_depth=22, random_state=66)
forest_regr.fit(X_train, y_train)
pred = forest_regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Gradient boosting with shallow trees; random_state is fixed for reproducible runs
boost_regr = GradientBoostingRegressor(max_depth=3, random_state=66)
boost_regr.fit(X_train, y_train)
pred = boost_regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Multi-layer perceptron with logistic activations.
# NOTE(review): scikit-learn only uses learning_rate='adaptive' when solver='sgd'; with the
# default solver it has no effect.
regr = MLPRegressor(random_state=1, activation='logistic', learning_rate='adaptive', )
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Ordinary least-squares linear regression
regr = LinearRegression()
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

## Keras

In [None]:
def baseline_model(input_dim=31):
    """Build and compile the fully connected Keras regression network.

    Parameters
    ----------
    input_dim : int, default 31
        Number of input features. The default keeps the previously hard-coded
        width; pass the real column count of the training matrix so the network
        still fits when the one-hot encoding yields a different number of columns.

    Returns
    -------
    A compiled ``Sequential`` model (mean-squared-error loss, Adam optimizer).
    """
    # create model
    model = Sequential()
    model.add(Dense(31, input_dim=input_dim, kernel_initializer='normal', activation='relu'))  # first hidden layer, fed by the inputs
    model.add(Dense(10, kernel_initializer='normal', activation='relu'))  # second hidden layer
    # NOTE(review): a ReLU output cannot predict negative values, while the target is a sum of
    # differences that can be negative; a linear output may be intended — confirm.
    model.add(Dense(1, kernel_initializer='normal', activation='relu'))  # output layer
    # Compile model
    opt = keras.optimizers.Adam(clipnorm=1, learning_rate=0.001)
    model.compile(loss='mean_squared_error', optimizer=opt)
    return model


# `input_dim` is forwarded by the wrapper to `baseline_model`, so the input width follows the data
estimator = KerasRegressor(build_fn=baseline_model, input_dim=X_train.shape[1], epochs=1000, batch_size=32, verbose=1)
kfold = KFold(n_splits=10)
#results = cross_val_score(estimator, X_train, y_train, cv=kfold)


In [None]:
# `results` was never assigned on a fresh kernel (its only assignment is commented out), so the
# cross-validation is run here. NOTE: this trains the network once per fold and is slow.
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
avrg = sum(results)/len(results)
print(f'avrg mean_error_sqr: {avrg}')

In [None]:
# Fit the Keras regressor on the training split and predict the test split
estimator.fit(X_train, y_train)
pred = estimator.predict(X_test)

In [None]:
# Feature importance: the next cell computes permutation importances for the last fitted `regr`

In [None]:
# Permutation importance of every feature for the most recently fitted `regr`.
# `X` is a DataFrame here and has no `.toarray()`; only densify when `X` is a sparse matrix.
X_dense = X.toarray() if hasattr(X, 'toarray') else X
result = permutation_importance(regr, X_dense, y, n_repeats=10, random_state=0)
print(result.importances_mean)

## Correlation matrix

In [None]:
# Preview the first five rows of the feature table before plotting associations
features_df.head(n=5)

In [None]:
# Plot the association matrix on a renamed copy. `features_df` itself keeps its original
# column names, because later cells still read the target column as "Tid".
corr_df = features_df.rename(columns={'Der_Vårdform_FK':'Vårdform', 'Der_Prioritet_FK':'Prioritet', 'Tid':'Förberedeletid'})
fig, ax = plt.subplots(figsize=(15, 15))
nominal.associations(corr_df, nominal_columns=['IngreppsGrupp', 'Sida', 'DiagnosGrupp'], ax=ax, theil_u=True)
fig.savefig('corr-matrix')

In [None]:
def forward_regression(X, y,
                       threshold_in,
                       verbose=True):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from an empty selection, every column not yet selected is tried in
    turn: an OLS model of ``y`` on a constant, the selected columns and the
    candidate is fitted, and the candidate's p-value is recorded. The candidate
    with the smallest p-value is added if that p-value is below ``threshold_in``;
    the search stops as soon as no candidate qualifies.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; the columns used in a fit are cast to float64.
    y : array-like
        Response passed to ``sm.OLS``.
    threshold_in : float
        A feature is added only while its p-value is strictly below this value.
    verbose : bool, default True
        Print each added feature together with its p-value.

    Returns
    -------
    list
        Labels of the selected columns, in the order they were added.
    """
    included = []
    while True:
        # Keep X's column order (instead of a set difference) so ties are broken deterministically
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            # Every column is already selected (or X has no columns)
            break
        # Explicit float dtype: without it pandas >= 2.0 creates an object Series, which breaks idxmin()
        new_pval = pd.Series(index=excluded, dtype='float64')
        for new_column in excluded:
            design = sm.add_constant(pd.DataFrame(X[included + [new_column]].astype('float64')))
            model = sm.OLS(y, design).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # Written as "not <" so that a NaN p-value also ends the search
        if not best_pval < threshold_in:
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

    return included

In [None]:
# Keep the returned list; previously the selection result was computed and thrown away
selected_features = forward_regression(X, y, 0.05)
print(f'Selected features: {selected_features}')

# Rebuild predictors and target from the feature table
features_df = features_df.dropna()
X = features_df.drop(columns="Tid")
y = features_df["Tid"]
### Try with only p<0.05

### Try without Vårdform, prioritet, asaklass, patientålder, kroppsvikt

In [None]:
# Build `features_df`: load the treatment, procedure and diagnosis exports, join them
# per treatment, derive the grouping features and the preparation-time target `Tid`.
DATA_DIR = Path('../20210324/with_name')

# Load behandling (treatments) and select relevant columns
behandling = pd.read_csv(DATA_DIR / 'behandling_optillfälle.csv', sep=';')
behandling = behandling[['Der_Behandling_PK',
                         'Der_Opkort_FK',
                         'Der_Anestesikort_FK',
                         'Der_Vårdform_FK',
                         'Der_Prioritet_FK',
                         'BehandlingsStatus',
                         'ASAklass',
                         'ForberedelsetidStartTidpunkt',
                         'ForberedelsetidSlutTidpunkt',
                         'PatientÅlderVidOp',
                         'Veckodag',
                         'Starttimme',
                         'BMI',
                         'Kroppslängd',
                         'Kroppsvikt',
                         'OperationstidStart',
                         'AnestesitidStart'
                        ]]
# Keep completed operations only; aborted ('abrutna') ones do not contain all relevant data
behandling = behandling[behandling['BehandlingsStatus'] == 'Opererad']
print("Behandling length: {}".format(len(behandling)))

# Load ingrepp (procedures) and select relevant columns
ingrepp = pd.read_csv(DATA_DIR / 'op_ingrepp_namn.csv')
ingrepp = ingrepp[['Der_Behandling_PK',
                   'Ingreppkod',
                   'Primär_Sekundär',
                   'Sida',
                  ]]
# Only primary procedures are used; secondary ones might be worth including in a more complicated model
ingrepp = ingrepp[ingrepp['Primär_Sekundär'] == 'Primär']
print("Ingrepp length: {}".format(len(ingrepp)))

# Load diagnos (diagnoses) and select relevant columns
diagnos = pd.read_csv(DATA_DIR / 'op_diagnos_namn.csv')
diagnos = diagnos[['Der_Behandling_PK',
                   'Diagnoskod',
                   'Primär_Sekundär',
                  ]]
# Only primary diagnoses are used; secondary ones might be worth including in a more complicated model
diagnos = diagnos[diagnos['Primär_Sekundär'] == 'Primär']
print("Diagnos length: {}".format(len(diagnos)))

# Combine the data frames on the treatment key and drop every row that has a missing value
combined_df = behandling.merge(diagnos, on='Der_Behandling_PK').merge(ingrepp, on='Der_Behandling_PK')
combined_df = combined_df.dropna()
print("Combined length: {}".format(len(combined_df)))

# Group codes: first two characters of the procedure code, first character of the diagnosis code.
# Vectorised slicing replaces the row loops, which also rebound `ingrepp`/`diagnos` from
# DataFrames to plain strings.
combined_df['IngreppsGrupp'] = combined_df['Ingreppkod'].str[0:2]
combined_df['DiagnosGrupp'] = combined_df['Diagnoskod'].str[0]

# Convert each timestamp string to minutes since midnight: the hour is read from characters
# [-12:-10] and the minute from [-9:-7] (presumably the strings end in HH:MM:SS.fff — TODO confirm).
time_columns = ['ForberedelsetidStartTidpunkt',
                'ForberedelsetidSlutTidpunkt',
                'AnestesitidStart',
                'OperationstidStart']
minutes_of_day = pd.DataFrame({
    column: combined_df[column].str[-12:-10].astype(int) * 60
            + combined_df[column].str[-9:-7].astype(int)
    for column in time_columns
})

# Target: preparation time + anaesthesia preparation time, in minutes.
# NOTE(review): only the clock time is used, so an interval that crosses midnight comes out negative.
# NOTE(review): the second term is AnestesitidStart - OperationstidStart; confirm this sign is intended.
combined_df['Tid'] = (
    (minutes_of_day['ForberedelsetidSlutTidpunkt'] - minutes_of_day['ForberedelsetidStartTidpunkt'])
    + (minutes_of_day['AnestesitidStart'] - minutes_of_day['OperationstidStart'])
)

# Remove all features we don't want (keys, raw timestamps, raw codes and the merge-suffixed filter columns)
features_df = combined_df.drop(columns=["Der_Behandling_PK",
                                        "Der_Opkort_FK",
                                        "Der_Anestesikort_FK",
                                        "BehandlingsStatus",
                                        "ForberedelsetidStartTidpunkt",
                                        "ForberedelsetidSlutTidpunkt",
                                        "Primär_Sekundär_x",
                                        "Primär_Sekundär_y",
                                        "AnestesitidStart",
                                        "OperationstidStart",
                                        "Diagnoskod",
                                        "Ingreppkod",
                                       ])
# Restrict the data to the six procedure groups listed here
features_df = features_df[features_df['IngreppsGrupp'].isin(['NC', 'NH', 'NB', 'NG', 'NF', 'ND'])]

# Instantiate Metrics so we can use MAAPE later
metrics = Metrics()

In [None]:
# Drop the features named in this section's heading (Vårdform, prioritet, ASA-klass,
# patient age, body weight). The previous list repeated 'PatientÅlderVidOp' and left
# three of those five columns in the data.
dropped_features = ['Der_Vårdform_FK',
                    'Der_Prioritet_FK',
                    'ASAklass',
                    'PatientÅlderVidOp',
                    'Kroppsvikt']
features_df = features_df.drop(columns=dropped_features)
features_df = features_df.dropna()
y = features_df["Tid"]
X = features_df.drop("Tid", axis='columns')

# One-hot encode the remaining categorical columns, then split off validation and test data
X = pd.get_dummies(X)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=66)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state=66)

In [None]:
# Association matrix for the reduced feature set, saved next to the notebook
nominal_features = ['IngreppsGrupp', 'Sida', 'DiagnosGrupp']
fig, ax = plt.subplots(figsize=(15, 15))
nominal.associations(
    features_df,
    nominal_columns=nominal_features,
    ax=ax,
    theil_u=True,
)
fig.savefig('corr-matri_reduced')

In [None]:
# Try DummyRegressor as a base case; anything worse than this is really terrible
regr = DummyRegressor()
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Single decision tree; random_state is fixed so repeated runs give the same tree
regr = DecisionTreeRegressor(random_state=66)
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Random forest; random_state is fixed so the bootstrap samples are reproducible
forest_regr = RandomForestRegressor(max_depth=22, random_state=66)
forest_regr.fit(X_train, y_train)
pred = forest_regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Gradient boosting with shallow trees; random_state is fixed for reproducible runs
boost_regr = GradientBoostingRegressor(max_depth=3, random_state=66)
boost_regr.fit(X_train, y_train)
pred = boost_regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Multi-layer perceptron with logistic activations.
# NOTE(review): scikit-learn only uses learning_rate='adaptive' when solver='sgd'; with the
# default solver it has no effect.
regr = MLPRegressor(random_state=1, activation='logistic', learning_rate='adaptive', )
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Ordinary least-squares linear regression
regr = LinearRegression()
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

## Ordinal Encoding

In [None]:
# Build `features_df`: load the treatment, procedure and diagnosis exports, join them
# per treatment, derive the grouping features and the preparation-time target `Tid`.
DATA_DIR = Path('../20210324/with_name')

# Load behandling (treatments) and select relevant columns
behandling = pd.read_csv(DATA_DIR / 'behandling_optillfälle.csv', sep=';')
behandling = behandling[['Der_Behandling_PK',
                         'Der_Opkort_FK',
                         'Der_Anestesikort_FK',
                         'Der_Vårdform_FK',
                         'Der_Prioritet_FK',
                         'BehandlingsStatus',
                         'ASAklass',
                         'ForberedelsetidStartTidpunkt',
                         'ForberedelsetidSlutTidpunkt',
                         'PatientÅlderVidOp',
                         'Veckodag',
                         'Starttimme',
                         'BMI',
                         'Kroppslängd',
                         'Kroppsvikt',
                         'OperationstidStart',
                         'AnestesitidStart'
                        ]]
# Keep completed operations only; aborted ('abrutna') ones do not contain all relevant data
behandling = behandling[behandling['BehandlingsStatus'] == 'Opererad']
print("Behandling length: {}".format(len(behandling)))

# Load ingrepp (procedures) and select relevant columns
ingrepp = pd.read_csv(DATA_DIR / 'op_ingrepp_namn.csv')
ingrepp = ingrepp[['Der_Behandling_PK',
                   'Ingreppkod',
                   'Primär_Sekundär',
                   'Sida',
                  ]]
# Only primary procedures are used; secondary ones might be worth including in a more complicated model
ingrepp = ingrepp[ingrepp['Primär_Sekundär'] == 'Primär']
print("Ingrepp length: {}".format(len(ingrepp)))

# Load diagnos (diagnoses) and select relevant columns
diagnos = pd.read_csv(DATA_DIR / 'op_diagnos_namn.csv')
diagnos = diagnos[['Der_Behandling_PK',
                   'Diagnoskod',
                   'Primär_Sekundär',
                  ]]
# Only primary diagnoses are used; secondary ones might be worth including in a more complicated model
diagnos = diagnos[diagnos['Primär_Sekundär'] == 'Primär']
print("Diagnos length: {}".format(len(diagnos)))

# Combine the data frames on the treatment key and drop every row that has a missing value
combined_df = behandling.merge(diagnos, on='Der_Behandling_PK').merge(ingrepp, on='Der_Behandling_PK')
combined_df = combined_df.dropna()
print("Combined length: {}".format(len(combined_df)))

# Group codes: first two characters of the procedure code, first character of the diagnosis code.
# Vectorised slicing replaces the row loops, which also rebound `ingrepp`/`diagnos` from
# DataFrames to plain strings.
combined_df['IngreppsGrupp'] = combined_df['Ingreppkod'].str[0:2]
combined_df['DiagnosGrupp'] = combined_df['Diagnoskod'].str[0]

# Convert each timestamp string to minutes since midnight: the hour is read from characters
# [-12:-10] and the minute from [-9:-7] (presumably the strings end in HH:MM:SS.fff — TODO confirm).
time_columns = ['ForberedelsetidStartTidpunkt',
                'ForberedelsetidSlutTidpunkt',
                'AnestesitidStart',
                'OperationstidStart']
minutes_of_day = pd.DataFrame({
    column: combined_df[column].str[-12:-10].astype(int) * 60
            + combined_df[column].str[-9:-7].astype(int)
    for column in time_columns
})

# Target: preparation time + anaesthesia preparation time, in minutes.
# NOTE(review): only the clock time is used, so an interval that crosses midnight comes out negative.
# NOTE(review): the second term is AnestesitidStart - OperationstidStart; confirm this sign is intended.
combined_df['Tid'] = (
    (minutes_of_day['ForberedelsetidSlutTidpunkt'] - minutes_of_day['ForberedelsetidStartTidpunkt'])
    + (minutes_of_day['AnestesitidStart'] - minutes_of_day['OperationstidStart'])
)

# Remove all features we don't want (keys, raw timestamps, raw codes and the merge-suffixed filter columns)
features_df = combined_df.drop(columns=["Der_Behandling_PK",
                                        "Der_Opkort_FK",
                                        "Der_Anestesikort_FK",
                                        "BehandlingsStatus",
                                        "ForberedelsetidStartTidpunkt",
                                        "ForberedelsetidSlutTidpunkt",
                                        "Primär_Sekundär_x",
                                        "Primär_Sekundär_y",
                                        "AnestesitidStart",
                                        "OperationstidStart",
                                        "Diagnoskod",
                                        "Ingreppkod",
                                       ])
# Restrict the data to the six procedure groups listed here
features_df = features_df[features_df['IngreppsGrupp'].isin(['NC', 'NH', 'NB', 'NG', 'NF', 'ND'])]

# Instantiate Metrics so we can use MAAPE later
metrics = Metrics()

In [None]:
# Discard incomplete rows, then separate the predictors from the target column `Tid`
features_df = features_df.dropna()
X = features_df.drop(columns="Tid")
y = features_df["Tid"]

In [None]:
# Ordinal-encode only the non-numeric columns. Encoding every column would also replace the
# numeric features by the rank of their values and discard the column names.
enc = OrdinalEncoder()
categorical_columns = X.select_dtypes(exclude='number').columns
X = X.copy()
if len(categorical_columns) > 0:
    X[categorical_columns] = enc.fit_transform(X[categorical_columns])

# Hold out 20% as validation data, then 20% of the remainder as test data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=66)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state=66)

In [None]:
# Try DummyRegressor as a base case; anything worse than this is really terrible
regr = DummyRegressor()
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Single decision tree; random_state is fixed so repeated runs give the same tree
regr = DecisionTreeRegressor(random_state=66)
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Random forest; random_state is fixed so the bootstrap samples are reproducible
forest_regr = RandomForestRegressor(max_depth=22, random_state=66)
forest_regr.fit(X_train, y_train)
pred = forest_regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Gradient boosting with shallow trees; random_state is fixed for reproducible runs
boost_regr = GradientBoostingRegressor(max_depth=3, random_state=66)
boost_regr.fit(X_train, y_train)
pred = boost_regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Multi-layer perceptron with logistic activations.
# NOTE(review): scikit-learn only uses learning_rate='adaptive' when solver='sgd'; with the
# default solver it has no effect.
regr = MLPRegressor(random_state=1, activation='logistic', learning_rate='adaptive', )
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')

In [None]:
# Ordinary least-squares linear regression
regr = LinearRegression()
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
# RMSE via np.sqrt: the `squared=False` flag was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: this value is printed under the label "abs error" although it is a root-mean-squared error.
abs_error = np.sqrt(mean_squared_error(y_test, pred))
percentage_error = metrics.mean_arctangent_absolute_percentage_error(clean=True, y_pred=np.array(pred), y_true=np.array(y_test))
r2 = r2_score(y_test, pred)
print(f'abs error:{abs_error} \n% error: {percentage_error} \nR-squared: {r2}')