In [None]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from fancyimpute import IterativeImputer

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer


from sklearn.model_selection import train_test_split
print(torch.__version__)

In [None]:
df = pd.read_csv('../data/wombi_employees.csv')
df['most_recent_income']= df['most_recent_income'].replace(',','', regex=True).apply(pd.to_numeric)

In [None]:
# df_new = df[['most_recent_income','problem_solving_skill']]
# df_new1 = df[['technology_skill','total_jobs','english_skill']]

# imputer = IterativeImputer()
# imputed_df = pd.DataFrame(imputer.fit_transform(df_new))
# imputed_df1 = pd.DataFrame(imputer.fit_transform(df_new1))

# imputed_df = imputed_df.rename({0:'most_recent_income',1:'problem_solving_skill'}, axis=1)
# imputed_df1 = imputed_df1.rename({0:'technology_skill',1:'total_jobs',2:'english_skill'}, axis=1)

# df['most_recent_income'] = imputed_df['most_recent_income']
# df['problem_solving_skill'] = imputed_df['problem_solving_skill']
# df['technology_skill'] = imputed_df1['technology_skill']
# df['total_jobs'] = imputed_df1['total_jobs']
# df['english_skill'] = imputed_df1['english_skill']

In [None]:
# df = df[['problem_solving_skill','technology_skill','english_skill','age','score']]

In [None]:
df = df.dropna()

In [None]:
X = df.drop('score',axis=1)
y = df['score']

# print(X.select_dtypes(include=['number']).columns)
X_num = X[['wombus_id', 'age', 'college_degree', 'problem_solving_skill',
       'technology_skill', 'english_skill', 'most_recent_income', 'total_jobs']]
# print(X.select_dtypes(include=['object']).columns)
X_ohe_cat = X[['birth_continent', 'gender', 'shirt_color_preference',
              'remote_work_preference','industry_preference']]
X_ord_cat = X[['customer_exp_preference', 'work_env_preference',
              'personal_growth_preference', 'honest_communication_preference',
              'community_service_preference']]

# Create the numerical pipeline
num_pipeline = make_column_transformer(
       (SimpleImputer(strategy="most_frequent"),['age']),
       (SimpleImputer(strategy="mean"),['problem_solving_skill', 'technology_skill','english_skill']),
       (SimpleImputer(strategy="most_frequent"),['total_jobs']),
       (SimpleImputer(strategy="mean"),['most_recent_income']),
       remainder='drop')

# Create the categorical pipeline for both one-hot encoding and ordinal encoding
cat_ohe_pipeline = Pipeline([
                    ('imputer',SimpleImputer(strategy='most_frequent'))
                    ,('ohe',OneHotEncoder())])
cat_ord_pipeline = Pipeline([
                    ('imputer',SimpleImputer(strategy='most_frequent')),
                    ('ordinal',OrdinalEncoder())])
# Add standardscaler to the numeric pipeline
num_pipeline1 = Pipeline([('imp_num_pipeline',num_pipeline),
                        ('scaler',StandardScaler())])

# Use ColumnTransformer to combine pipelines and create a workflow for the ML process

num_attribs = list(X_num)
cat_ohe_attribs = list(X_ohe_cat)
cat_ord_attribs = list(X_ord_cat)

full_pipeline = ColumnTransformer([
            ('num', num_pipeline1, num_attribs),
            ('cat_ohe', cat_ohe_pipeline,cat_ohe_attribs),
            ('cat_ord',cat_ord_pipeline,cat_ord_attribs)])

# fit transform your X
# X_prepared = num_pipeline1.fit_transform(X)
X_prepared = full_pipeline.fit_transform(X)

What our model does:
* It starts with random values (weights and bias)
* Looks at the training data and adjust the random values to better represent (or get closer to) the ideal value of the weights and bias

<br>and how it does so is by two main algorithm:
* Gradiant Descent (reason why requires_grad = True)... keeps track of the weights and bias parameter and update them in combination of gradient descent and backpropagation.
* Backpropagation

<br>
The whole idea for training is for a model to move from unknown parameter to some known parameters (from a poor representation of data to a better representation of data <br>
One way to measure how poor or how wrong our model predictions are is to use a loss function or cost function. (a function to measure how wrong our model predictions are to true value)<br>
Optimizers = takes into account the loss of a model and adjusts the model's parameters (weights and bias)



#### Training Loop
0. Loop through data
1. Forward pass (this involves data moving through our model's forward())
2. Calculate the loss (compare forward pass predictions to groudn truth labels)
3. Optimizer zero grad
4. Loss backward - moves backward through the network to calculate the gradients of each parameters with respect to the loss (backpropagation)
5. Optimizer Step - use the optimzier to adjust our model's parameters to try and improve the loss (gradient descent)

#### PyTorch Model Building Essentials
* torch.nn - contains all the building for computational graphs (a nearal network can be considered a computational graph)
* torch.nn.Parameter - what parameters should our model try and learn, often a PyTorch layer from torch.nn will set these for us
* torch.nn.Module - The base class for all neural network modules, if you subclass it, you should overwrite forward()
* torch.optim - this is where the optimizer in PyTorch live, they will help with the gradient descent
* def forward() - All nn.Module subclasses require you to overwrite forward(), this method defines what happens in the forward computation

In [None]:
torch.manual_seed(42)

In [None]:
X = torch.from_numpy(X_prepared.astype(np.float32))
y = torch.from_numpy(y.values.astype(np.float32)).view(y.shape[0],1)

In [None]:
input_dim=X.shape[1]
output_dim=1
param_grid = {'hidden_dim':[2,4,6,8,10,12,14,16,18,20], 'dropout_rate':[0.1,0.2,0.3]}
hidden_dim_range = [2,4,6,8,10,12,14,16,18,20]
dropout_rate_range = [0.1, 0.2, 0.3]
best_score = float('inf')

In [None]:
input_dim

In [None]:
train_split = int(0.8*len(X))
X_train, y_train = X[:train_split],y[:train_split]
X_test, y_test = X[train_split:],y[train_split:]

len(X_train), len(y_train), len(X_test), len(y_test)

In [None]:
class MultivariateLinearRegression(nn.Module):
    def __init__(self, input_dim, hidden_layers, output_dim, dropout_rate):
        super(MultivariateLinearRegression, self).__init__()
        self.linear1 = nn.Linear(input_dim,hidden_layers)
        self.linear2 = nn.Linear(hidden_layers,hidden_layers)
        self.linear3 = nn.Linear(hidden_layers,output_dim)
        self.dropout = nn.Dropout(p=dropout_rate)
    def forward(self,x):
        x = torch.relu(self.linear1(x))
        x = self.dropout(x)
        x = torch.relu(self.linear2(x))
        x = self.dropout(x)
        x = torch.relu(self.linear3(x))
        return x

In [None]:
criterion = nn.MSELoss()
# Set up loss fucntion
loss_fn = nn.L1Loss() # minimize the distance between predictions and true value... keeps increase weights until weights reduce loss.. decrease the bias and finds loss increase.. increase bias
#set up optimizer(stocastic gradient descent)

In [15]:
from sklearn.model_selection import ParameterGrid
for params in ParameterGrid(param_grid):
    model = MultivariateLinearRegression(input_dim, output_dim, params['hidden_dim'], params['dropout_rate'])
    num_epochs = 20001
    epoch_count = []
    loss_values = []
    test_loss_values = []
    optimizer = torch.optim.Adam(params=model.parameters(),lr=0.01)
    for epoch in range(num_epochs):
        # inputs, labels = enumerate(zip(X_train, y_train))
        model.train()
        y_pred = model(X_train)
        loss = criterion(y_pred, y_train)
        # print(f"Loss: {loss}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        #testing
        model.eval()
        with torch.inference_mode():
        # with torch.no_grad()
            test_pred = model(X_test)      
            test_loss = criterion(test_pred, y_test)
        if epoch % 1000 == 0:
            epoch_count.append(epoch)
            loss_values.append(loss)
            test_loss_values.append(test_loss)
            print(f"Epoch: {epoch} | Loss: {loss} | Test loss: {test_loss}")
        

Epoch: 1000 | Loss: 1074.91015625 | Test loss: 735.7568969726562
Epoch: 2000 | Loss: 949.537353515625 | Test loss: 684.03759765625
Epoch: 3000 | Loss: 853.565673828125 | Test loss: 664.6493530273438
Epoch: 4000 | Loss: 768.2911376953125 | Test loss: 648.0682983398438
Epoch: 5000 | Loss: 704.399169921875 | Test loss: 635.626708984375
Epoch: 6000 | Loss: 662.5659790039062 | Test loss: 627.2640991210938
Epoch: 7000 | Loss: 637.9906005859375 | Test loss: 622.3877563476562
Epoch: 8000 | Loss: 627.1609497070312 | Test loss: 620.39501953125
Epoch: 9000 | Loss: 624.265625 | Test loss: 620.2694702148438
Epoch: 10000 | Loss: 624.2367553710938 | Test loss: 620.6326904296875
Epoch: 11000 | Loss: 624.3193359375 | Test loss: 620.5252685546875
Epoch: 12000 | Loss: 624.1719360351562 | Test loss: 620.5164794921875
Epoch: 13000 | Loss: 624.2649536132812 | Test loss: 620.5256958007812
Epoch: 14000 | Loss: 624.2413330078125 | Test loss: 620.5756225585938
Epoch: 15000 | Loss: 624.16064453125 | Test loss: 6

In [None]:
model = MultivariateLinearRegression(input_dim, output_dim)

In [None]:
criterion = nn.MSELoss()
# Set up loss fucntion
loss_fn = nn.L1Loss() # minimize the distance between predictions and true value... keeps increase weights until weights reduce loss.. decrease the bias and finds loss increase.. increase bias
#set up optimizer(stocastic gradient descent)
optimizer = torch.optim.Adam(params=model.parameters(),lr=0.01)

In [None]:
num_epochs = 20001
epoch_count = []
loss_values = []
test_loss_values = []

for epoch in range(num_epochs):
    # inputs, labels = enumerate(zip(X_train, y_train))
    model.train()
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)
    # print(f"Loss: {loss}")

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    #testing
    model.eval()
    with torch.inference_mode():
    # with torch.no_grad()
        test_pred = model(X_test)      
        test_loss = criterion(test_pred, y_test)
    if epoch % 1000 == 0:
        epoch_count.append(epoch)
        loss_values.append(loss)
        test_loss_values.append(test_loss)
        print(f"Epoch: {epoch} | Loss: {loss} | Test loss: {test_loss}")
        # print(model.state_dict())
# print(model.linear.weight)
# print(model.linear.bias)

In [None]:
plt.plot(epoch_count, np.array(torch.tensor(loss_values).numpy()), label='Train loss')
plt.plot(epoch_count, test_loss_values, label="Test loss")
plt.title("Training and Test Loss Curves")
plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.legend()
plt.show()

In [None]:
print("Final loss: ", loss.item())

In [None]:
model.state_dict()

In [None]:
model(X_test)

In [None]:
y_test