### Import data

In [None]:
import pandas as pd

# Read the cohort extract from the CSV that sits next to this notebook.
cohort = pd.read_csv(filepath_or_buffer='./eicu_cohort.csv')

### Check missing data

In [None]:
# Number of missing entries in each column (shown as the cell output).
cohort.isna().sum()

### Convert "actualhospitalmortality" to categorical data

In [None]:
# Store the outcome column with pandas' categorical dtype so it can be
# integer-encoded through the `.cat` accessor afterwards.
cohort['actualhospitalmortality'] = cohort['actualhospitalmortality'].astype('category')

### Encode "actualhospitalmortality"

In [None]:
# Integer code of each outcome category, stored in a new "_enc" column.
# NOTE(review): pandas assigns code -1 to missing values -- confirm the
# outcome column has no gaps before treating this as a binary target.
mortality_codes = cohort['actualhospitalmortality'].cat.codes
cohort['actualhospitalmortality_enc'] = mortality_codes

### Encode "gender"

In [None]:
# Same two-step encoding as the outcome: categorical dtype, then integer codes.
# NOTE(review): a missing gender would be encoded as -1 rather than NaN, so the
# median imputation further down would not touch it -- verify against the data.
cohort['gender'] = cohort['gender'].astype('category')
cohort['gender_enc'] = cohort['gender'].cat.codes

### Drop the original "actualhospitalmortality" and "gender"

In [None]:
# Keep only the encoded versions; `cohort` itself is left untouched.
cohort_enc = cohort.drop(columns=['actualhospitalmortality', 'gender'])

### Partition data

In [None]:
from sklearn.model_selection import train_test_split

# Features are every encoded column except the target; the target is the
# encoded mortality outcome.
x = cohort_enc.drop(columns=['actualhospitalmortality_enc'])
y = cohort['actualhospitalmortality_enc']

# 70 % of the rows go to training, the rest to testing; the seed is fixed so
# the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=42,
)

### Impute missing data

In [None]:
# Column medians are taken from the training split only and reused for the
# test split, so no information from the test rows enters the imputation.
train_medians = x_train.median()
x_train = x_train.fillna(train_medians)
x_test = x_test.fillna(train_medians)

### Normalisation using MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split and rescale it in one
# call, then apply the very same scaling to the test split.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### Linear regression with gradient descent - code implementation

Try changing the learning rate to a larger value. 

In [None]:
import numpy as np

# Toy dataset: five (X, y) pairs used to demonstrate gradient descent for the
# straight-line model y = m * X + c.
X = np.array([1, 2, 3, 4, 5])
y = np.array([2, 3, 5, 7, 11])

# Initialise the parameters and hyper-parameters
m = 0              # slope
c = 0              # intercept
alpha = 0.01       # learning rate (step size of each update)
iterations = 1000  # number of gradient steps
n = len(y)

# Gradient descent on the mean squared error.
# The counter is named `step` instead of `_`: `_` conventionally marks an
# unused value, and in IPython/Jupyter it also holds the last displayed
# output, which assigning to it would shadow.
for step in range(iterations):
    # Predictions with the current parameters
    y_pred = m * X + c

    # Mean squared error of those predictions
    loss = (1/n) * np.sum((y - y_pred)**2)

    # Partial derivatives of the loss with respect to m and c
    D_m = (-2/n) * np.sum((y - y_pred) * X)
    D_c = (-2/n) * np.sum(y - y_pred)

    # Move the parameters against the gradient
    m = m - alpha * D_m
    c = c - alpha * D_c

    # Report progress every 100 iterations; `loss` was measured before this
    # iteration's update, `m` and `c` are the values after it.
    if step % 100 == 0:
        print(f"Iteration {step}, Loss: {loss}, m: {m}, c: {c}")

print(f"Final parameters: m: {m}, c: {c}")

### Fit data into a Linear Regression model only using "apachescore" and "actualhospitalmortality"

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Single-feature design matrix: the apachescore column as an (n_rows, 1) array.
x_apache = cohort_enc[['apachescore']].to_numpy()
y = cohort['actualhospitalmortality_enc']

# Same 70/30 split and seed as the rest of the notebook.
x_apache_train, x_apache_test, y_train, y_test = train_test_split(
    x_apache,
    y,
    train_size=.7,
    random_state=42,
)

# Scale with statistics learnt from the training split only.
scaler = MinMaxScaler()
x_apache_train = scaler.fit_transform(x_apache_train)
x_apache_test = scaler.transform(x_apache_test)

# Ordinary least squares on the scaled score; predictions for the test split.
model = LinearRegression().fit(x_apache_train, y_train)
y_pred = model.predict(x_apache_test)

### Plot the fitted linear regression line

In [None]:
import matplotlib.pyplot as plt

# Draw on the current axes: observed outcomes as points, model output as a line.
ax = plt.gca()
ax.scatter(x_apache_test, y_test, color='blue', label='Data points')
ax.plot(x_apache_test, y_pred, color='red', label='Fitted line')

ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Linear Regression Fit')
ax.legend()

plt.show()

### Apply logistic function to y_pred

In [None]:
import numpy as np

# Squash the linear-regression outputs into the (0, 1) range with the
# logistic (sigmoid) function 1 / (1 + e^(-z)).
y_pred_logistic = 1 / (1 + np.exp(-y_pred))

### Plot y_pred_logistic against x_apache_test

In [None]:
import matplotlib.pyplot as plt

# A line plot joins points in the order given. The test rows are shuffled, so
# order them by the feature value first; otherwise the curve is drawn as a
# tangle of chords instead of a single left-to-right line.
order = x_apache_test.ravel().argsort()

plt.scatter(x_apache_test, y_test, color='blue', label='Data points')
plt.plot(x_apache_test[order], y_pred_logistic[order], color='red', label='Fitted line')

plt.xlabel('X')
plt.ylabel('y')
plt.title('Logistic Regression Fit')
plt.legend()

plt.show()

### Fit data into LogisticRegression()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

# Single-feature design matrix: apachescore reshaped to (n_rows, 1).
x_apache = cohort_enc['apachescore'].values.reshape(-1, 1)
y = cohort['actualhospitalmortality_enc']

# Same 70/30 split and seed as the linear-regression cell.
x_apache_train, x_apache_test, y_train, y_test = train_test_split(x_apache, y, train_size=.7, random_state=42)

# Scale with statistics learnt from the training split only.
scaler = MinMaxScaler()
scaler.fit(x_apache_train)
x_apache_train = scaler.transform(x_apache_train)
x_apache_test = scaler.transform(x_apache_test)

# Fit the classifier and predict a class label for every test row.
model = LogisticRegression()
model = model.fit(x_apache_train, y_train)
y_pred_logistic2 = model.predict(x_apache_test)

# Show the predictions produced by THIS model (previously the stale
# `y_pred_logistic` from the manual sigmoid cell was printed by mistake).
print(y_pred_logistic2)

### Plot the fitted logistic regression line

In [None]:
import matplotlib.pyplot as plt

# The predictions are 0/1 class labels. Joining them in the shuffled test
# order would zig-zag between the two levels, so sort by the feature first to
# get a single step from one class to the other.
order = x_apache_test.ravel().argsort()

plt.scatter(x_apache_test, y_test, color='blue', label='Data points')
plt.plot(x_apache_test[order], y_pred_logistic2[order], color='red', label='Fitted line')

plt.xlabel('X')
plt.ylabel('y')
# This figure shows the logistic model, not the earlier linear one.
plt.title('Logistic Regression Fit')
plt.legend()

plt.show()

### Plot the logistic regression predictions in scatter plot

In [None]:
import matplotlib.pyplot as plt

# Observed outcomes (blue) against the class predicted by the logistic model (red).
plt.scatter(x_apache_test, y_test, color='blue', label='Data points')
plt.scatter(x_apache_test, y_pred_logistic2, color='red', label='Predicted points')

plt.xlabel('X')
plt.ylabel('y')
# Title names the model actually being plotted (was "Linear Regression Fit").
plt.title('Logistic Regression Predictions')
plt.legend()

plt.show()

### Create a confusion matrix for the logistic regression model

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Rows are the true class, columns the predicted class, in the order given by
# `labels` (0 first, then 1).
cm = confusion_matrix(y_test, y_pred_logistic2, labels=[0, 1])

# NOTE(review): assumes code 0 is the "ALIVE" category and 1 is "EXPIRED"
# (categorical codes follow the sorted category order) -- confirm against
# cohort['actualhospitalmortality'].cat.categories.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["ALIVE", "EXPIRED"])

disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()

### Calculate accuracy, precision, recall, specificity, and F1 score

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Summary metrics for the logistic-regression predictions on the test split.
accuracy = accuracy_score(y_test, y_pred_logistic2)
precision = precision_score(y_test, y_pred_logistic2)
recall = recall_score(y_test, y_pred_logistic2)
# Specificity is the recall of the negative class (label 0): TN / (TN + FP).
specificity = recall_score(y_test, y_pred_logistic2, pos_label=0)
f1 = f1_score(y_test, y_pred_logistic2)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'Specificity: {specificity:.4f}')
print(f'F1 Score: {f1:.4f}')

### Plot the ROC curve and report the AUC

In [None]:
# `metrics` was never imported anywhere above, so this cell raised a NameError
# on a fresh kernel; import it here.
from sklearn import metrics

# ROC curve of the fitted classifier on the test split.
metrics.RocCurveDisplay.from_estimator(model, x_apache_test, y_test)

### Fit our data into linear regression with all 13 features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Features: every encoded column except the target; target: encoded mortality.
x = cohort_enc.drop('actualhospitalmortality_enc', axis=1)
y = cohort_enc['actualhospitalmortality_enc']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.7, random_state=42)

# Impute both splits with medians computed from the training split only.
train_medians = x_train.median()
x_train = x_train.fillna(train_medians)
x_test = x_test.fillna(train_medians)

# Scale with training statistics and keep the result in x_train / x_test.
# Previously the scaled arrays were written to x_apache_train / x_apache_test,
# which overwrote the single-feature data and left the model below training
# on the unscaled frames.
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Ordinary least squares on all features; predictions for the test split.
model = LinearRegression()
model = model.fit(x_train, y_train)
y_pred = model.predict(x_test)