In [75]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [76]:
xcel = "SMOTERegressionCases.xlsx"

# One worksheet per resampled dataset. The sheet names appear to encode a seed,
# a threshold and a percentage -- confirm against whatever produced the workbook.
sheet_names = [
    'Seed291_thr0.77_perc0.2',
    'Seed375_thr0.77_perc0.2',
    'Seed847_thr0.77_perc0.2',
    'Seed317_thr0.77_perc0.2',
    'Seed831_thr0.77_perc0.2',
]

# Passing a list of sheet names parses the workbook once and returns a
# {sheet_name: DataFrame} dict, instead of re-opening the file for every sheet.
sheets = pd.read_excel(xcel, sheet_name=sheet_names)
df1, df2, df3, df4, df5 = (sheets[name] for name in sheet_names)

In [77]:
# Preview the first loaded sheet; the output below shows the values as read from the workbook.
df1

Unnamed: 0,X,id,Elavil,age,sex,height,weight,smoking_history,previous_er_visit_within_14_days,admission_disposition,...,Respiractin,Immunity.Advance,Vesicare,Zaxine,Quinine.Sulfate,Desvenlafaxine,Glucosamine,Turmeric,Cogentin.Tab,hospital_length_of_stay
0,205.0,213.0,0,45.0,1.0,166.911029,80.265922,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,6.0
1,142.0,148.0,0,73.0,1.0,177.800000,88.500000,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,3.0
2,116.0,119.0,0,89.0,1.0,166.911029,80.265922,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1.0
3,317.0,328.0,0,89.0,1.0,166.911029,80.265922,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,7.0
4,292.0,302.0,0,95.0,1.0,166.911029,60.000000,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,170.0,177.0,0,68.0,1.0,177.800000,113.400000,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,59.0
514,300.0,310.0,0,46.0,0.0,155.000000,71.500000,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,60.0
515,338.0,350.0,0,59.0,1.0,152.400000,70.400000,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,69.0
516,23.0,25.0,0,64.0,1.0,159.000000,97.000000,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,72.0


### Scaling Age, Height and Weight

In [64]:
from sklearn.preprocessing import StandardScaler

# Continuous columns that get rescaled to zero mean / unit variance.
columns_to_standardize = ['age', 'height', 'weight']

# A single scaler object is re-fit from scratch on each dataset in turn, so every
# frame is standardized with its own statistics; afterwards `scaler` holds the
# parameters fitted on df5.
# NOTE(review): the fit uses all rows of each frame, i.e. it happens before the
# train/test split done in test_experiment, so held-out rows contribute to the
# scaling statistics.
scaler = StandardScaler()

for frame in (df1, df2, df3, df4, df5):
    # Column assignment writes the scaled values back into the same frame object.
    frame[columns_to_standardize] = scaler.fit_transform(frame[columns_to_standardize])

### Dropping unwanted columns

In [65]:
# Inspect df1 after standardization; the 'X' and 'id' columns are still present at this point.
df1

Unnamed: 0,X,id,Elavil,age,sex,height,weight,smoking_history,previous_er_visit_within_14_days,admission_disposition,...,Respiractin,Immunity.Advance,Vesicare,Zaxine,Quinine.Sulfate,Desvenlafaxine,Glucosamine,Turmeric,Cogentin.Tab,hospital_length_of_stay
0,205.0,213.0,0,-2.148374,1.0,0.162167,0.389943,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,6.0
1,142.0,148.0,0,0.114194,1.0,1.539684,1.001662,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,3.0
2,116.0,119.0,0,1.407090,1.0,0.162167,0.389943,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,1.0
3,317.0,328.0,0,1.407090,1.0,0.162167,0.389943,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,7.0
4,292.0,302.0,0,1.891926,1.0,0.162167,-1.115635,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,170.0,177.0,0,-0.289836,1.0,1.539684,2.851512,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,59.0
514,300.0,310.0,0,-2.067568,0.0,-1.344646,-0.261287,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,60.0
515,338.0,350.0,0,-1.017090,1.0,-1.673560,-0.343007,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,69.0
516,23.0,25.0,0,-0.613060,1.0,-0.838623,1.633137,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,72.0


In [66]:
# Strip the 'X' and 'id' columns from every dataset so they are not used as
# features (presumably row identifiers -- confirm against the data dictionary).
df1, df2, df3, df4, df5 = [
    frame.drop(columns=['X', 'id'])
    for frame in (df1, df2, df3, df4, df5)
]

In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

def test_experiment(df, target='hospital_length_of_stay', test_size=0.2, random_state=42):
    """Fit a fixed panel of regressors on one dataset and report train/test MSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the target column plus the feature columns. Every
        column other than `target` is used as a feature.
    target : str, default 'hospital_length_of_stay'
        Name of the column to predict.
    test_size : float or int, default 0.2
        Size of the held-out split; passed straight to `train_test_split`.
    random_state : int or None, default 42
        Seed used for the train/test split and for the stochastic models
        (RandomForest and MLP), so that repeated runs on the same data
        produce the same scores.

    Returns
    -------
    dict
        ``{model name: {'Train MSE': ..., 'Test MSE': ...}}`` with one entry
        per model in the panel.

    Raises
    ------
    KeyError
        If `target` is not a column of `df`.
    """
    # Split the frame into feature matrix and target vector.
    X = df.drop(columns=target)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Model panel. RandomForest and MLP are randomized estimators; without an
    # explicit seed their MSEs change from run to run, which would make the
    # cross-dataset mean/std computed from these results non-reproducible.
    models = {
        'Lasso alpha 0.1': Lasso(alpha=0.1),
        'Lasso alpha 10': Lasso(alpha=10),
        'Ridge alpha 0.1': Ridge(alpha=0.1),
        'Ridge alpha 10': Ridge(alpha=10),
        'RandomForest': RandomForestRegressor(random_state=random_state),
        'MLP': MLPRegressor(max_iter=500, random_state=random_state),  # Increase max_iter if necessary
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)

        # Score on both splits so over-fitting shows up as a train/test gap.
        results[name] = {
            'Train MSE': mean_squared_error(y_train, model.predict(X_train)),
            'Test MSE': mean_squared_error(y_test, model.predict(X_test)),
        }

    return results

# Run the experiment on every dataset. The per-dataset names are kept so the
# individual result dicts can still be inspected in later cells.
all_results = [test_experiment(frame) for frame in (df1, df2, df3, df4, df5)]
results_df1, results_df2, results_df3, results_df4, results_df5 = all_results

# Collect, per model, the list of train/test MSEs across the datasets:
# {model name: {'Train MSE': [...], 'Test MSE': [...]}}
aggregated_results = {}
for results in all_results:
    for model_name, metrics in results.items():
        bucket = aggregated_results.setdefault(
            model_name, {'Train MSE': [], 'Test MSE': []}
        )
        for metric_name in bucket:
            bucket[metric_name].append(metrics[metric_name])

# Mean and standard deviation of each metric per model. The outer keys are
# created up front so the summary columns keep this order.
final_results = {
    'Mean Train MSE': {},
    'Std Train MSE': {},
    'Mean Test MSE': {},
    'Std Test MSE': {}
}

for model_name, mse_values in aggregated_results.items():
    for split in ('Train', 'Test'):
        scores = pd.Series(mse_values[f'{split} MSE'], dtype=float)
        final_results[f'Mean {split} MSE'][model_name] = scores.mean()
        # ddof=0: population standard deviation (divide by N, not N - 1).
        final_results[f'Std {split} MSE'][model_name] = scores.std(ddof=0)

# Summary table: one row per model, one column per statistic.
results_df = pd.DataFrame(final_results)



In [74]:
# Write the per-model summary table (model names as the index column) to disk.
output_path = 'results.csv'
results_df.to_csv(output_path)

In [None]:
# Display the summary table. The original cell contained only the undefined
# name `re`, which raises NameError when the notebook is run top to bottom.
results_df