# <center>**Modelling**</center>

In [7]:
import os
from dotenv import load_dotenv
# Resolve an error that I had with Cuda
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from tqdm import tqdm

## plt.style.use('dark_background')


In [8]:
# Lift the column cap so wide frames (e.g. from .head()) render every column
pd.options.display.max_columns = None

In [9]:
# Load in the dotenv variables
load_dotenv()

# Get the path variable from dotenv; fail loudly when it is missing instead of
# hitting an opaque "'NoneType' object is not subscriptable" TypeError on the slice
raw_project_path = os.getenv('Project_Path')
if raw_project_path is None:
    raise RuntimeError(
        "Environment variable 'Project_Path' is not set; define it in the .env file "
        "or in the environment before running this notebook."
    )

# NOTE(review): the fixed [2:78] slice presumably strips wrapper characters around
# the stored value and only fits one specific path length -- confirm and replace
# with an explicit strip if the stored path ever changes
project_path = raw_project_path[2:78]

# Make the project directory the working directory so that the relative
# data paths used by the following cells resolve
os.chdir(project_path)

In [10]:
# Read the processed training set; the path is relative to the current working directory
data = pd.read_csv('./data/processed/train.csv')

In [11]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU
gpu_ready = torch.cuda.is_available()
device = torch.device("cuda" if gpu_ready else "cpu")
# Report which device was selected
print("GPU is available" if gpu_ready else "GPU is not available, using CPU")

GPU is available


In [12]:
# Construct a torch.device for "cuda" and show it as the cell output.
# The result is not assigned, so this does not change the `device` variable
# selected by the availability check.
torch.device("cuda")

device(type='cuda')

In [13]:
# Create a fit function that takes the X-train, y-train, epochs and Batch Size and fits the model
def train_model(X_train, y_train, epochs, batch_sizes, net):
    """Build ``net`` for the given features and fit it with mini-batch Adam on an MSE loss.

    Args:
        X_train: pandas DataFrame of features; its column count is passed to ``net``.
        y_train: DataFrame-like object whose ``'Binary'`` column holds the targets.
        epochs: number of passes over the training data.
        batch_sizes: number of rows per mini-batch.
        net: callable (e.g. an ``nn.Module`` subclass) that takes the input
            dimension and returns the model to train.

    Returns:
        The trained model, left on the device that was used for training.

    Side effects:
        Rebinds the module-level ``losses`` list; after the call it holds the
        loss of the last mini-batch of every epoch. Prints a progress line per
        mini-batch.
    """
    # Exposed as a global so the per-epoch losses can be inspected after training
    global losses

    y_train = np.array(y_train['Binary'])

    # Create an instance of the model with the correct input_dim
    model = net(X_train.shape[1])

    # Check if the GPU is available and save it to device. If not use the cpu
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Move the model over to the device that is available
    model.to(device)

    # Define the loss function
    criterion = nn.MSELoss()

    # Define the optimizer
    optimizer = optim.Adam(model.parameters())

    X_train_tensor = torch.Tensor(X_train.values).float().to(device)
    y_train_tensor = torch.Tensor(y_train).float().to(device)

    # Training loop
    num_epochs = epochs
    batch_size = batch_sizes
    n_samples = len(X_train_tensor)
    losses = []
    for epoch in range(num_epochs):
        # Shuffle the data at the start of each epoch
        indices = np.random.permutation(n_samples)
        shuffled_X = X_train_tensor[indices]
        shuffled_y = y_train_tensor[indices]

        # Set the model to training mode
        model.train()

        # Mini-batch training
        for i in range(0, n_samples, batch_size):

            batch_X = shuffled_X[i:i+batch_size]
            batch_y = shuffled_y[i:i+batch_size]

            # Forward pass
            outputs = model(batch_X)
            # Give the targets the same shape as the outputs. Comparing a (B,)
            # target with a (B, 1) output would make MSELoss broadcast to
            # (B, B) and silently compute the wrong loss.
            targets = batch_y.reshape(outputs.shape)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Batch accuracy: outputs are thresholded at 0.5, which assumes the
            # model emits probabilities and the targets are 0/1 labels
            with torch.no_grad():
                accuracy = ((outputs >= 0.5).float() == targets).float().mean()

            # Print loss and accuracy
            print(f'\rEpoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(X_train_tensor)}], Loss: {loss.item():.4f}, Accuracy: {accuracy.item():.4f}', end = '\r')

        # Record the loss of the epoch's last mini-batch
        losses.append(loss.item()) # type: ignore

    return model

In [5]:
# Disabled helper kept as a bare string literal: the code inside uses `plt`,
# and the matplotlib import at the top of the notebook is commented out.
# NOTE(review): presumably to be restored together with that import -- confirm.
'''
# Create a function that takes in the loss from the train_model as an argument
def loss_plot(loss, param_k = False, ticks_k = False):
    if ticks_k == False:
        # Create a plot of the the epics and the losses for each epoch
        plt.plot(range(1,len(loss) + 1), loss);
        # Match the ticks to the epochs
        plt.xticks(np.arange(1, len(loss) + 1));
    else:
        # Create a plot of the the epics and the losses for each epoch
        plt.plot(range(ticks_k[0], ticks_k[1]), loss); # type: ignore
        # Match the ticks to the epochs
        plt.xticks(np.arange(ticks_k[0], ticks_k[1] + 1)); # type: ignore
    if param_k == False:
        # Label the x-axis of the graph
        plt.xlabel("Epoch #");
    else:
        # Label the x-axis of the graph
        plt.xlabel("K - Value");
    # Label the y-axis of the graph
    plt.ylabel("Cost");
    # Title the graph
    plt.title("Model Cost");
'''    

In [None]:
# Create a function to evaluate the model
def eval_model(X_test, y_test, model, dig):
    """Predict on the test set, compute the RMSE and plot predictions against targets.

    Args:
        X_test: pandas DataFrame of test features.
        y_test: test targets; either an array-like with one value per row or a
            DataFrame-like object with a ``'Binary'`` column (the layout that
            ``train_model`` reads).
        model: trained ``nn.Module``; it is switched to eval mode and moved to the CPU.
        dig: unused; kept so existing calls keep working.

    Returns:
        The root mean squared error between predictions and targets as a float.

    Raises:
        ValueError: if the number of predictions and targets differ.

    Side effects:
        Rebinds the module-level ``preds`` to the predictions as a numpy array.
        Draws both series when ``plt`` (matplotlib.pyplot) is available in the
        notebook namespace; plotting is skipped otherwise.
    """
    # Set the model to evaluation mode
    model.eval()

    # Move the model and data back to the CPU
    model.to("cpu")
    X_test_tensor = torch.Tensor(X_test.values).float()

    # Make Preds a global variable for the roc graph
    global preds
    # Perform predictions on the test set
    with torch.no_grad():
        preds = model(X_test_tensor)

    # Convert the predictions tensor to a numpy array
    preds = preds.numpy()

    # Use the 'Binary' column when the targets come in the DataFrame layout
    if hasattr(y_test, "columns") and "Binary" in y_test.columns:
        y_test = y_test["Binary"]

    # Flatten both series so a (N, 1) prediction lines up with (N,) targets
    flat_preds = preds.reshape(-1)
    targets = np.asarray(y_test, dtype=float).reshape(-1)
    if flat_preds.shape != targets.shape:
        raise ValueError(
            f"Got {flat_preds.shape[0]} predictions for {targets.shape[0]} targets"
        )

    # Calculate the root mean square error
    rmse = float(np.sqrt(np.mean((flat_preds - targets) ** 2)))

    # The matplotlib import is commented out at the top of the notebook, so only
    # plot when `plt` actually exists instead of failing with a NameError
    try:
        plotter = plt
    except NameError:
        plotter = None

    if plotter is not None:
        # One x position per sample; plot() takes x and y positionally
        positions = np.arange(1, len(flat_preds) + 1)
        plotter.plot(positions, flat_preds)
        plotter.plot(positions, targets)

    return rmse


In [None]:
# Feed-forward network built on nn.Module
class Net(nn.Module):
    """Six linear layers (input_dim -> 12 -> 12 -> 12 -> 12 -> 8 -> 1).

    The first five layers are followed by ReLU and the last one by a sigmoid,
    so the network returns one value in [0, 1] per input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # (in_features, out_features) of fc1 .. fc6, created in this order
        layer_shapes = [
            (input_dim, 12),
            (12, 12),
            (12, 12),
            (12, 12),
            (12, 8),
            (8, 1),
        ]
        for position, (fan_in, fan_out) in enumerate(layer_shapes, start=1):
            # Registers the layer under the attribute names fc1 .. fc6
            setattr(self, f"fc{position}", nn.Linear(fan_in, fan_out))
        self.activation = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the forward pass and return the sigmoid of the last layer."""
        hidden = x
        # ReLU after each of the first five layers
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            hidden = self.activation(layer(hidden))
        # The output layer is squashed to [0, 1]
        return self.sigmoid(self.fc6(hidden))

In [None]:
# Initialize a random forest model (n_jobs = 4 is given to the forest itself)
rf = RandomForestClassifier(n_jobs = 4)
# Create a set of n_estimators values to search
param_grid = {'n_estimators': [5, 50, 100, 150, 200, 300, 600]}  
# Create the grid search object with 2 cross-validation folds (cv = 2)
grid_search = GridSearchCV(rf, param_grid, cv = 2, verbose = 1)  
# Fit the data to the grid search
# NOTE(review): X_train and y_train are not defined in any cell visible here;
# they must be created (e.g. by a train/test split of `data`) before this cell runs
grid_search.fit(X_train, y_train)
# Save the best parameters  
best_params = grid_search.best_params_
# Display the best parameters as the cell output
best_params