# Data

In [12]:
import pandas as pd

data = pd.read_csv('medical_cost_data.csv')

y_colname = 'charges'
y = data[y_colname]
X = data.drop([y_colname], axis=1)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()
cols_to_transform = ['sex', 'region']


# Function to one-hot-encode categorical features
def ohe_transform_cat_feats(ohe, X):
    cols_to_transform = ['sex', 'region']
    # Use the OneHotEncoder to create a new matrix
    X_gen_trans = ohe.transform(X[cols_to_transform]).toarray()
    X_gen_trans = X_gen_trans.T

    # Add the matrix rows as columns to the X DataFrame
    new_col_names = ohe.get_feature_names_out()
    for i, col in enumerate(new_col_names):
        X[col] = X_gen_trans[i]

    # Remove the unnecessary column and adjust the data types
    # Explain why it is important to keep your dataset lean! (computational cost can explode quickly when scaling)
    X = X.drop(cols_to_transform, axis=1)
    X.loc[:, new_col_names] = X.loc[:, new_col_names].astype('int')
    return X


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
ohe.fit(X_train[cols_to_transform])

X_train = ohe_transform_cat_feats(ohe, X_train)
X_test = ohe_transform_cat_feats(ohe, X_test)
X_train = X_train.dropna()
X_test = X_test.dropna()
X_test

Unnamed: 0.1,Unnamed: 0,age,height,weight,bmi,caloric_intake,mean_heart_rate,glucose_levels,children,work_hours,income,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest
764,764,45,181,82.475818,25.175,2814,83.612513,4.373338,2,8.473943,1000.509934,1.0,0.0,1.0,0.0,0.0,0.0
887,887,36,194,112.983272,30.020,2833,66.371988,4.652292,0,8.642828,1162.168417,1.0,0.0,0.0,1.0,0.0,0.0
890,890,64,170,77.697650,26.885,2933,92.959231,4.117142,0,10.276023,1014.796519,1.0,0.0,0.0,1.0,0.0,0.0
1293,1293,46,188,90.993128,25.745,1763,59.467169,4.609396,3,8.742813,1001.131353,0.0,1.0,0.0,1.0,0.0,0.0
259,259,19,162,83.770848,31.920,1940,83.358230,5.063206,0,9.766670,1001.728314,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,644,43,179,113.136771,35.310,2995,96.382310,4.285086,2,6.660268,1004.907353,0.0,1.0,0.0,0.0,1.0,0.0
602,602,56,173,75.720370,25.300,2277,52.323727,5.252695,0,8.462220,1003.221429,1.0,0.0,0.0,0.0,0.0,1.0
731,731,53,165,58.261500,21.400,1821,59.682528,5.076635,1,9.615741,1220.491675,0.0,1.0,0.0,0.0,0.0,1.0
321,321,26,177,92.859156,29.640,2473,85.658888,4.583662,4,8.954053,1000.017595,1.0,0.0,1.0,0.0,0.0,0.0


# First Neural Network

In [13]:
# Import pytorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Set random seed for reproducibility
torch.manual_seed(42)

<torch._C.Generator at 0x140d3ce30>

In [14]:
# PyTorch (and TensorFlow) work with 'Tensors'!
# Convert 'X' to a tensor
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

# Convert 'y' to a tensor
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

In [15]:
n_hidden = X_train.shape[1] * 2


# Define the neural network architecture
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(X_train.shape[1], n_hidden)
        self.linear2 = nn.Linear(n_hidden, 4)
        self.linear3 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        return x

In [16]:
# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=50, shuffle=False)

# Train the network

In [17]:
def train_network(model, train_loader, num_epochs=200, verbose=True):
    # Define the loss function and optimizer
    loss_fn = nn.MSELoss()  # Choose the loss function carefully, why aren't we using the R2 score?
    # optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.6)
    optimizer = optim.Adam(model.parameters(), lr=0.1)

    # Train the neural network
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            # Zero your gradients for every batch!
            optimizer.zero_grad()

            # Make predictions for this batch
            outputs = model(inputs)

            # Compute the loss and its gradients
            loss = loss_fn(outputs, labels)
            loss.backward()

            # Adjust learning weights
            optimizer.step()
            running_loss += loss.item()
        if epoch % 10 == 0 and verbose:
            print(f"Epoch {epoch + 1} loss: {running_loss / len(train_loader)}")

    return model


# Create the neural network
model = NeuralNetwork()

# Use the above function to train the network
model = train_network(model, train_loader)

Epoch 1 loss: 218731299.55555555
Epoch 11 loss: 127670828.8888889
Epoch 21 loss: 76452212.0
Epoch 31 loss: 56564997.0
Epoch 41 loss: 55245453.55555555
Epoch 51 loss: 59290923.222222224
Epoch 61 loss: 57000869.333333336
Epoch 71 loss: 55635067.11111111
Epoch 81 loss: 55306570.777777776
Epoch 91 loss: 54935325.0
Epoch 101 loss: 54166257.55555555
Epoch 111 loss: 53291012.55555555
Epoch 121 loss: 53241483.11111111
Epoch 131 loss: 52733422.666666664
Epoch 141 loss: 52272643.333333336
Epoch 151 loss: 53618543.333333336
Epoch 161 loss: 51979720.44444445
Epoch 171 loss: 51103327.44444445
Epoch 181 loss: 51267744.44444445
Epoch 191 loss: 51225273.0


## Neural Network Playground

https://playground.tensorflow.org

# Evaluate

In [19]:
# Evaluate the neural network on test data
loss_fn = nn.MSELoss()
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_loss = loss_fn(test_outputs, y_test_tensor)
print("Test loss (MSE):", test_loss.item())

Test loss (MSE): 56646748.0


# Evaluate using the R2 score

In [20]:
from sklearn.metrics import r2_score
import numpy as np


# Define a function for model scoring
def score_model(model, X_test_tensor, y_test):
    # Make predictions using the model
    with torch.no_grad():
        y_pred = model(X_test_tensor)
    y_pred = y_pred.numpy()
    y_test_np = np.array(y_test.values)
    # Compute the R2 score
    score = r2_score(y_test_np, y_pred)
    return score


score_model(model, X_test_tensor, y_test)

0.6134506251022499

# Compare to other models

In [21]:
from sklearn.neural_network import \
    MLPRegressor  # https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

regr = MLPRegressor(random_state=42, max_iter=5000, learning_rate_init=0.1)
regr.fit(X_train, y_train)

y_pred = regr.predict(X_test)

score = r2_score(y_test, y_pred)
score

0.5404269965780658

In [22]:
from sklearn.ensemble import \
    GradientBoostingRegressor  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

score = r2_score(y_test, y_pred)
score

0.7570735169219105

# Assignment: add one or more Dropout layers

In [24]:
n_hidden = X_train.shape[1] * 2


# Define the neural network architecture
class NeuralNetworkWithDropout(nn.Module):
    def __init__(self):
        super(NeuralNetworkWithDropout, self).__init__()
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(X_train.shape[1], n_hidden)
        self.dropout1 = nn.Dropout(p=0.5)  # Adding the first dropout layer with a dropout probability of 0.5
        self.linear2 = nn.Linear(n_hidden, 4)
        self.dropout2 = nn.Dropout(p=0.5)  # Adding the second dropout layer with a dropout probability of 0.5
        self.linear3 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.dropout1(x)  # Applying dropout after the first linear layer
        x = self.linear2(x)
        x = F.relu(x)
        x = self.dropout2(x)  # Applying dropout after the second linear layer
        x = self.linear3(x)
        return x


# Create the neural network
model = NeuralNetworkWithDropout()

# Use the above function to train the network
model = train_network(model, train_loader)

# Score the model performance
score_model(model, X_test_tensor, y_test)

Epoch 1 loss: 248531732.8888889
Epoch 11 loss: 208150588.44444445
Epoch 21 loss: 214490024.0
Epoch 31 loss: 215228393.7777778
Epoch 41 loss: 219806314.66666666
Epoch 51 loss: 185647411.1111111
Epoch 61 loss: 182193380.8888889
Epoch 71 loss: 187527616.44444445
Epoch 81 loss: 206875252.8888889
Epoch 91 loss: 211799203.1111111
Epoch 101 loss: 186291504.0
Epoch 111 loss: 199123481.7777778
Epoch 121 loss: 210563665.33333334
Epoch 131 loss: 186837834.66666666
Epoch 141 loss: 210233454.2222222
Epoch 151 loss: 211570598.2222222
Epoch 161 loss: 202407586.2222222
Epoch 171 loss: 182315229.33333334
Epoch 181 loss: 192921482.2222222
Epoch 191 loss: 190037414.66666666


-0.4485606451966506

# Assignment: Add one or more Batch normalization layers

In [25]:
n_hidden = X_train.shape[1] * 2


# Define the neural network architecture
class NeuralNetworkWithBN(nn.Module):
    def __init__(self):
        super(NeuralNetworkWithBN, self).__init__()
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(X_train.shape[1], n_hidden)
        self.bn1 = nn.BatchNorm1d(n_hidden)  # Adding batch normalization after the first linear layer
        self.linear2 = nn.Linear(n_hidden, 4)
        self.bn2 = nn.BatchNorm1d(4)  # Adding batch normalization after the second linear layer
        self.linear3 = nn.Linear(4, 1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.bn1(x)  # Applying batch normalization after the first linear layer
        x = F.relu(x)
        x = self.linear2(x)
        x = self.bn2(x)  # Applying batch normalization after the second linear layer
        x = F.relu(x)
        x = self.linear3(x)
        return x


# Create the neural network
model = NeuralNetworkWithBN()

# Use the above function to train the network
model = train_network(model, train_loader, verbose=False)

# Score the model performance
score_model(model, X_test_tensor, y_test)

0.6377187514133751

# Tuning NN hyperparameters

In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error


# First, define a wrapper class for the PyTorch model
class NeuralNetwork(nn.Module, BaseEstimator, RegressorMixin):
    def __init__(self, input_size=10, hidden_size=64, pre_final_hidden_size=4,
                 output_size=1, lr=0.01, dropout_prob=0.01):
        super(NeuralNetwork, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.pre_final_hidden_size = pre_final_hidden_size
        self.output_size = output_size
        self.lr = lr
        self.dropout_prob = dropout_prob
        self.num_epochs = 100
        self.criterion = nn.MSELoss()

        self.flatten = nn.Flatten()
        self.bn0 = nn.BatchNorm1d(X_train.shape[1])
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(self.dropout_prob)
        self.linear2 = nn.Linear(hidden_size, pre_final_hidden_size)
        self.linear3 = nn.Linear(pre_final_hidden_size, output_size)

    def forward(self, x):
        x = self.flatten(x)
        x = self.bn0(x)
        x = self.linear1(x)
        x = self.bn1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        return x

    def fit(self, X_train, y_train):
        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)
        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

        self.train()
        for epoch in range(self.num_epochs):
            for inputs, targets in train_loader:
                self.optimizer.zero_grad()
                outputs = self(inputs)
                loss = self.criterion(outputs, targets)
                loss.backward()
                self.optimizer.step()

        return self

    def predict(self, X_test):
        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        self.eval()
        with torch.no_grad():
            predictions = self(X_test_tensor).numpy()
        return predictions


# Second, create a parameter grid specifying the hyperparameters to search over
param_grid = {
    'hidden_size': [64, 128, 256],
    'lr': [0.001, 0.01, 0.1],
    'dropout_prob': [0.001, 0.01]
}

# Third, create an instance of GridSearchCV
model = NeuralNetwork(input_size=X_train.shape[1], output_size=1)
scorer = make_scorer(mean_squared_error, greater_is_better=False)
grid_search = GridSearchCV(model, param_grid, scoring=scorer, cv=3, verbose=2)

# Then, fit GridSearchCV on your training data
grid_search.fit(X_train, y_train)

# Finally, access the best hyperparameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)
print("Best Model:", best_model)


Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END .......dropout_prob=0.001, hidden_size=64, lr=0.001; total time=   0.9s
[CV] END .......dropout_prob=0.001, hidden_size=64, lr=0.001; total time=   0.9s
[CV] END .......dropout_prob=0.001, hidden_size=64, lr=0.001; total time=   0.9s
[CV] END ........dropout_prob=0.001, hidden_size=64, lr=0.01; total time=   0.9s
[CV] END ........dropout_prob=0.001, hidden_size=64, lr=0.01; total time=   0.9s
[CV] END ........dropout_prob=0.001, hidden_size=64, lr=0.01; total time=   0.9s
[CV] END .........dropout_prob=0.001, hidden_size=64, lr=0.1; total time=   0.9s
[CV] END .........dropout_prob=0.001, hidden_size=64, lr=0.1; total time=   0.9s
[CV] END .........dropout_prob=0.001, hidden_size=64, lr=0.1; total time=   0.9s
[CV] END ......dropout_prob=0.001, hidden_size=128, lr=0.001; total time=   0.9s
[CV] END ......dropout_prob=0.001, hidden_size=128, lr=0.001; total time=   0.9s
[CV] END ......dropout_prob=0.001, hidden_size=1

In [27]:
y_pred = best_model.predict(X_test)

score = r2_score(y_test, y_pred)
score

0.6362522453219757

# Assignment:
**Change the hyperparameters of the Neural Network such that the prediction score is closer to the one of the Gradient Boosting model**

# New challenge: Classification

# Data

Predictive maintenance: Equipment failure and downtime can be costly for energy infrastructure, such as power plants and transmission systems. Machine learning algorithms can analyze sensor data, historical maintenance records, and other relevant data to predict equipment failures and identify maintenance requirements proactively. This enables energy companies to schedule maintenance activities more effectively, reduce downtime, and optimize maintenance costs.

In [28]:
new_df = pd.read_csv('sensor_data_maintenance.csv')
new_df.head()

Unnamed: 0.1,Unnamed: 0,Sensor_1,Sensor_2,Sensor_3,Sensor_4,Sensor_5,Sensor_6,Sensor_7,Sensor_8,Sensor_9,...,Sensor_12,Sensor_13,Sensor_14,Sensor_15,Sensor_16,Sensor_17,Sensor_18,Sensor_19,Sensor_20,Needs_maintenance
0,0,-1.583773,-1.244331,-0.113223,-1.087246,-1.18668,-0.690355,-1.038473,0.345244,-1.32552,...,-0.492003,0.990395,0.932278,0.216999,1.020084,-0.026296,0.521321,-0.364502,-1.308543,1
1,1,-0.039139,0.980477,1.379957,0.497047,-1.360191,1.321492,0.62937,0.18228,-1.181035,...,-0.299087,0.911204,0.424911,0.154348,1.792933,0.547381,1.930945,0.781602,0.863269,1
2,2,0.15244,-2.348545,-1.728102,-0.077548,0.554057,-0.225785,0.225522,-0.390812,0.518928,...,-0.020434,0.003841,0.055826,0.339131,-1.764101,-0.06905,-0.282374,-1.052295,-0.299364,0
3,3,1.275449,0.372947,-1.161504,2.362207,-0.299641,0.18473,0.399616,1.297538,0.591856,...,-0.301737,0.323409,0.702024,0.252096,0.733678,-0.533636,1.070523,1.030613,1.093862,1
4,4,-0.24856,-1.078426,-0.057282,-0.123365,-0.088669,-1.155219,2.042568,0.542607,-0.347265,...,-0.319895,1.562805,0.184635,-1.101577,-0.599507,1.657374,-0.113839,-1.590337,-1.940206,0


In [29]:
y = new_df['Needs_maintenance']
X = new_df.drop(['Needs_maintenance'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Train a classification network

In [30]:
class ClassificationNeuralNetwork(nn.Module, BaseEstimator, RegressorMixin):
    def __init__(self, input_size=20, hidden_size=64, pre_final_hidden_size=16,
                 output_size=2, lr=0.001, dropout_prob=0.02):
        super(ClassificationNeuralNetwork, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.pre_final_hidden_size = pre_final_hidden_size
        self.output_size = output_size
        self.lr = lr
        self.dropout_prob = dropout_prob
        self.num_epochs = 30
        self.criterion = nn.CrossEntropyLoss()

        self.flatten = nn.Flatten()
        self.bn0 = nn.BatchNorm1d(X_train.shape[1])
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(self.dropout_prob)
        self.linear2 = nn.Linear(hidden_size, pre_final_hidden_size)
        self.linear3 = nn.Linear(pre_final_hidden_size, output_size)

        self.softmax = nn.Softmax(dim=1)  # The last layer should be a softmax or sigmoid

    def forward(self, x):
        x = self.flatten(x)
        x = self.bn0(x)
        x = self.linear1(x)
        x = self.bn1(x)
        x = self.dropout1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        x = self.softmax(x)
        return x

    def fit(self, X_train, y_train):
        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)
        X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

        self.train()
        for epoch in range(self.num_epochs):
            for inputs, targets in train_loader:
                # forward
                outputs = self(inputs)
                loss = self.criterion(outputs, targets)

                # backward
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # print the loss every 10 epochs
            if (epoch + 1) % 10 == 0:
                print(f"Epoch [{epoch + 1}/{self.num_epochs}], Loss: {loss.item():.4f}")

        return self

    def predict(self, X_test):
        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        self.eval()
        with torch.no_grad():
            predictions = self(X_test_tensor).numpy()
        return predictions.argmax(1).astype(int)

In [31]:
best_params['input_size'] = X_train.shape[1]
best_params['output_size'] = len(y_train.unique())

In [32]:
model_cl = ClassificationNeuralNetwork(input_size=X_train.shape[1])
model_cl.fit(X_train, y_train)

Epoch [10/30], Loss: 0.5820
Epoch [20/30], Loss: 0.5398
Epoch [30/30], Loss: 0.5641


# Evaluate

In [33]:
from sklearn.metrics import classification_report

y_pred = model_cl.predict(X_test)

scores = classification_report(y_test.values, y_pred)
print(scores)

              precision    recall  f1-score   support

           0       0.76      0.78      0.77      1647
           1       0.77      0.76      0.77      1653

    accuracy                           0.77      3300
   macro avg       0.77      0.77      0.77      3300
weighted avg       0.77      0.77      0.77      3300



# Compare to other models

In [34]:
from sklearn.ensemble import \
    GradientBoostingClassifier  # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)

y_pred = gbc.predict(X_test)

scores = classification_report(y_test, y_pred)
print(scores)

              precision    recall  f1-score   support

           0       0.82      0.81      0.82      1647
           1       0.81      0.83      0.82      1653

    accuracy                           0.82      3300
   macro avg       0.82      0.82      0.82      3300
weighted avg       0.82      0.82      0.82      3300



# Assignment:
**Change the hyperparameters of the Neural Network such that the prediction score is closer to the one of the Gradient Boosting model**

In [35]:
# Find Gradient Boosting Model prediction score
# TODO

# Change the hyperparameters of the NN to improve the prediction score
hyperparameters = {
    'input_size': X_train.shape[1],
    # FIXME check RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x21 and 20x64)
    'hidden_size': 64,
    'pre_final_hidden_size': 16,
    'output_size': len(y_train.unique()),
    'lr': 0.001,
    'dropout_prob': 0.02
}

# PRINT ORIGINAL SCORES
print(f"CL score: {0.78}")
print(f"GBC score: {0.80}")

model_cl_ht = model_cl
model_cl_ht.set_params(**hyperparameters)

model_cl_ht.fit(X_train, y_train)

# FIXME RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x21 and 20x64)

CL score: 0.78
GBC score: 0.8
Epoch [10/30], Loss: 0.4815
Epoch [20/30], Loss: 0.6380
Epoch [30/30], Loss: 0.5109
