### Ridge Regression

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import plotly.express as px
from sklearn.linear_model import Ridge

In [14]:
# Load the scaled dataset with the added column.
# Alternative input (currently disabled): "Datasets/preprocessed_dataset.csv".
df = pd.read_csv(
    filepath_or_buffer="Datasets/new_dataset_added_column_scaled.csv",
)

In [15]:
# Remove the "Unnamed: 0" column (presumably a saved index written into the
# CSV -- confirm) and the "id" column so that neither ends up among the
# feature columns built from `df` later on.
#
# errors="ignore" keeps this cell safe to re-run: `df` is reassigned from
# itself, so on a second execution the columns are already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=["Unnamed: 0", "id"], errors="ignore")
df

Unnamed: 0,hospital_length_of_stay,age,sex,height,weight,smoking_history,previous_er_visit_within_14_days,admission_disposition,Hypertension,Chronic cardiac disease (not hypertension),...,pao2,pao2_fio2,ph,esr,inr,ferritin,d_dimer,crp,admission_disposition_encoded,intubated_encoded
0,21,0.486777,1,3.549903e-15,0.000000,0.0,0,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,-1.116803e-01,0.0,0.000000,2.887190e+00,1,0
1,5,-0.306970,0,-1.500191e+00,-0.364575,0.0,0,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,-2.998049e-01,0.0,1.595746,-3.430231e-01,1,0
2,7,-0.490142,0,3.549903e-15,0.000000,0.0,0,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,4.177205e-16,0.0,0.000000,-3.603147e-16,1,0
3,9,1.707926,1,1.984550e+00,-0.769298,0.0,0,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,4.177205e-16,0.0,0.000000,-3.603147e-16,1,0
4,9,1.524754,1,3.549903e-15,0.000000,1.0,0,1,1,1,...,0.000000,0.0,8.243049e-14,2.340288e-15,4.177205e-16,0.0,0.000000,-3.603147e-16,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,19,0.914179,1,-1.112997e+00,-0.834928,0.0,1,1,1,0,...,-0.468484,0.0,2.861595e+00,2.340288e-15,4.177205e-16,0.0,4.202592,-3.603147e-16,1,0
504,9,0.486777,1,3.549903e-15,0.078433,0.0,1,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,6.408181e-01,0.0,0.000000,-3.603147e-16,1,0
505,7,-1.100717,0,1.634827e+00,0.822248,0.0,1,1,0,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,-2.998049e-01,0.0,-1.454530,-3.603147e-16,1,0
506,5,0.425720,1,3.549903e-15,0.000000,0.0,1,1,0,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,-1.116803e-01,0.0,0.376269,-3.603147e-16,1,0


### Splitting X and y

In [16]:
# Separate the target column (hospital_length_of_stay) from the features.
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here.
X = df.drop(columns="hospital_length_of_stay")
y = df["hospital_length_of_stay"]

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (355, 782)
X_test shape: (153, 782)
y_train shape: (355,)
y_test shape: (153,)


### Implementing Ridge regression on the original dataset with a small alpha (1)

In [22]:
# Fit a Ridge model with alpha=1 on the training split.
ridge = Ridge(alpha=1).fit(X_train, y_train)

# Predict on both splits; the plotting cell further down reads these names.
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

# Mean squared error on each split (train first, then test).
mse_y_train = mean_squared_error(y_train, y_pred_train)
mse_y_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error train:", mse_y_train)
print("Mean Squared Error test:", mse_y_test)

Mean Squared Error train: 21.99629426869509
Mean Squared Error test: 212.6573860491631


In [23]:
# Pair each training target with the model's prediction for it.
df_train = pd.DataFrame({'y_train': y_train, 'y_pred_train': y_pred_train})

# Actual vs. predicted scatter for the training split.
fig = px.scatter(
    df_train,
    x='y_train',
    y='y_pred_train',
    title='y_train vs y_pred_train',
).update_layout(xaxis_title='y_train', yaxis_title='y_pred_train')
fig.show()

# Same pairing for the held-out test split.
df_test = pd.DataFrame({'y_test': y_test, 'y_pred_test': y_pred_test})

# Actual vs. predicted scatter for the test split, drawn in green.
fig = px.scatter(
    df_test,
    x='y_test',
    y='y_pred_test',
    color_discrete_sequence=['green'],
    title='y_test vs y_pred_test',
).update_layout(xaxis_title='y_test', yaxis_title='y_pred_test')
fig.show()

### Implementing Ridge regression on the original dataset with a large alpha (1000)

In [24]:
# Fit a Ridge model with alpha=1000 on the training split.
ridge = Ridge(alpha=1000).fit(X_train, y_train)

# Predict on both splits; the plotting cell further down reads these names.
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

# Mean squared error on each split (train first, then test).
mse_y_train = mean_squared_error(y_train, y_pred_train)
mse_y_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error train:", mse_y_train)
print("Mean Squared Error test:", mse_y_test)

Mean Squared Error train: 119.62973303575951
Mean Squared Error test: 155.36409507106535


In [25]:
# Pair each training target with the model's prediction for it.
df_train = pd.DataFrame({'y_train': y_train, 'y_pred_train': y_pred_train})

# Actual vs. predicted scatter for the training split.
fig = px.scatter(
    df_train,
    x='y_train',
    y='y_pred_train',
    title='y_train vs y_pred_train',
).update_layout(xaxis_title='y_train', yaxis_title='y_pred_train')
fig.show()

# Same pairing for the held-out test split.
df_test = pd.DataFrame({'y_test': y_test, 'y_pred_test': y_pred_test})

# Actual vs. predicted scatter for the test split, drawn in green.
fig = px.scatter(
    df_test,
    x='y_test',
    y='y_pred_test',
    color_discrete_sequence=['green'],
    title='y_test vs y_pred_test',
).update_layout(xaxis_title='y_test', yaxis_title='y_pred_test')
fig.show()

### Limiting the dataset

In [26]:
# Keep only the rows whose hospital_length_of_stay is strictly below 20.
short_stay_mask = df['hospital_length_of_stay'].lt(20)
df_cap = df.loc[short_stay_mask]
df_cap

Unnamed: 0,hospital_length_of_stay,age,sex,height,weight,smoking_history,previous_er_visit_within_14_days,admission_disposition,Hypertension,Chronic cardiac disease (not hypertension),...,pao2,pao2_fio2,ph,esr,inr,ferritin,d_dimer,crp,admission_disposition_encoded,intubated_encoded
1,5,-0.306970,0,-1.500191e+00,-0.364575,0.0,0,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,-2.998049e-01,0.0,1.595746,-3.430231e-01,1,0
2,7,-0.490142,0,3.549903e-15,0.000000,0.0,0,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,4.177205e-16,0.0,0.000000,-3.603147e-16,1,0
3,9,1.707926,1,1.984550e+00,-0.769298,0.0,0,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,4.177205e-16,0.0,0.000000,-3.603147e-16,1,0
4,9,1.524754,1,3.549903e-15,0.000000,1.0,0,1,1,1,...,0.000000,0.0,8.243049e-14,2.340288e-15,4.177205e-16,0.0,0.000000,-3.603147e-16,1,0
6,4,-0.978602,1,2.634036e+00,0.592541,0.0,0,1,0,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,4.177205e-16,0.0,-1.704776,-1.511884e+00,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,19,0.914179,1,-1.112997e+00,-0.834928,0.0,1,1,1,0,...,-0.468484,0.0,2.861595e+00,2.340288e-15,4.177205e-16,0.0,4.202592,-3.603147e-16,1,0
504,9,0.486777,1,3.549903e-15,0.078433,0.0,1,1,1,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,6.408181e-01,0.0,0.000000,-3.603147e-16,1,0
505,7,-1.100717,0,1.634827e+00,0.822248,0.0,1,1,0,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,-2.998049e-01,0.0,-1.454530,-3.603147e-16,1,0
506,5,0.425720,1,3.549903e-15,0.000000,0.0,1,1,0,0,...,0.000000,0.0,8.243049e-14,2.340288e-15,-1.116803e-01,0.0,0.376269,-3.603147e-16,1,0


In [27]:
# Rebuild features and target from the capped frame (df_cap). This overwrites
# X, y and the train/test splits produced from the full dataset earlier.
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here.
X = df_cap.drop(columns="hospital_length_of_stay")
y = df_cap["hospital_length_of_stay"]

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (292, 782)
X_test shape: (126, 782)
y_train shape: (292,)
y_test shape: (126,)


In [28]:
# Fit a Ridge model with alpha=100 on the current training split.
ridge = Ridge(alpha=100).fit(X_train, y_train)

# Predict on both splits; the plotting cell further down reads these names.
y_pred_train = ridge.predict(X_train)
y_pred_test = ridge.predict(X_test)

# Mean squared error on each split (train first, then test).
mse_y_train = mean_squared_error(y_train, y_pred_train)
mse_y_test = mean_squared_error(y_test, y_pred_test)
print("Mean Squared Error train:", mse_y_train)
print("Mean Squared Error test:", mse_y_test)

Mean Squared Error train: 13.396382568302053
Mean Squared Error test: 22.023713828196254


In [29]:
# Pair each training target with the model's prediction for it.
df_train = pd.DataFrame({'y_train': y_train, 'y_pred_train': y_pred_train})

# Actual vs. predicted scatter for the training split.
fig = px.scatter(
    df_train,
    x='y_train',
    y='y_pred_train',
    title='y_train vs y_pred_train',
).update_layout(xaxis_title='y_train', yaxis_title='y_pred_train')
fig.show()

# Same pairing for the held-out test split.
df_test = pd.DataFrame({'y_test': y_test, 'y_pred_test': y_pred_test})

# Actual vs. predicted scatter for the test split, drawn in green.
fig = px.scatter(
    df_test,
    x='y_test',
    y='y_pred_test',
    color_discrete_sequence=['green'],
    title='y_test vs y_pred_test',
).update_layout(xaxis_title='y_test', yaxis_title='y_pred_test')
fig.show()