In [57]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [58]:

# Read the dataset and split it into the feature frame X and the target series y.
data = pd.read_csv('processed_data_final.csv')
target_column = 'JobSatisfaction_O'
y = data[target_column]
# drop() returns a new frame, so later edits to X do not write back into `data`.
X = data.drop(columns=[target_column])

In [59]:
# Identify columns by category (name suffix + dtype)
def _columns_matching(suffix, dtype_name):
    """Return the columns of X whose name ends with `suffix` and whose dtype equals `dtype_name`."""
    return [
        name for name in X.columns
        if name.endswith(suffix) and X[name].dtype == dtype_name
    ]


# Going by the variable names, '_F' marks fixed and '_A' actionable features;
# float64 columns are treated as numerical and int64 columns as binary.
# NOTE(review): an int64 column is assumed to be 0/1 here — nothing checks that.
numerical_fixed_columns = _columns_matching('_F', 'float64')
numerical_actionable_columns = _columns_matching('_A', 'float64')
binary_fixed_columns = _columns_matching('_F', 'int64')
binary_actionable_columns = _columns_matching('_A', 'int64')

In [60]:
# Bucket the binary actionable columns by their shared name prefix.
binary_actionable_groups = {}
for column_name in binary_actionable_columns:
    # The prefix is the column name minus its last two '_'-separated tokens
    # (the trailing 'A' marker and the token right before it).
    name_parts = column_name.split('_')
    group_key = '_'.join(name_parts[:-2])
    binary_actionable_groups.setdefault(group_key, []).append(column_name)

In [61]:
# Min-max scale every numerical column (fixed first, then actionable) in place in X.
# The fitted scaler is kept so the same transform can be reused later.
# NOTE(review): the scaler is fitted on all rows before the train/val/test split,
# so validation/test minima and maxima leak into preprocessing — consider fitting
# on the training rows only.
numerical_columns = numerical_fixed_columns + numerical_actionable_columns
scaler = MinMaxScaler()
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

In [62]:
# Bundle the column categorisation and the fitted scaler into one dict.
# Binary actionable columns are stored grouped by prefix, not as a flat list.
preprocessing_info = dict(
    numerical_fixed_columns=numerical_fixed_columns,
    numerical_actionable_columns=numerical_actionable_columns,
    binary_fixed_columns=binary_fixed_columns,
    binary_actionable_groups=binary_actionable_groups,
    scaler=scaler,
)

In [69]:
# Two-stage split: hold out 10% of all rows as the test set, then take 11.1%
# of the remaining 90% (about 10% of all rows) as the validation set.
split_seed = 8  # same seed for both stages
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.1, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.111, random_state=split_seed
)

In [70]:
# Convert the data to PyTorch tensors
def _to_float_tensor(frame_or_series):
    """Build a float32 tensor from the `.values` array of a DataFrame or Series."""
    return torch.tensor(frame_or_series.values, dtype=torch.float32)


# Targets are reshaped into a single column so they line up with the model's
# one-unit output layer. The split names are rebound from pandas objects to
# tensors, so this cell cannot be re-run without re-running the split first.
X_train = _to_float_tensor(X_train)
y_train = _to_float_tensor(y_train).view(-1, 1)
X_val = _to_float_tensor(X_val)
y_val = _to_float_tensor(y_val).view(-1, 1)
X_test = _to_float_tensor(X_test)
y_test = _to_float_tensor(y_test).view(-1, 1)

In [71]:
# Report the row count of every split; each X/y pair should match.
split_report = [
    ("Number of entries in X_train:", X_train),
    ("Number of entries in X_val:", X_val),
    ("Number of entries in X_test:", X_test),
    ("Number of entries in y_train:", y_train),
    ("Number of entries in y_val:", y_val),
    ("Number of entries in y_test:", y_test),
]
for label, split_tensor in split_report:
    print(label, split_tensor.shape[0])


Number of entries in X_train: 13374
Number of entries in X_val: 1670
Number of entries in X_test: 1672
Number of entries in y_train: 13374
Number of entries in y_val: 1670
Number of entries in y_test: 1672


In [72]:
# Define the neural network model with dropout layers
class JobSatisfactionNN(nn.Module):
    """Feed-forward network: input_dim -> 128 -> 64 -> 1.

    Each hidden layer is followed by ReLU and dropout (p=0.5); the output
    layer is linear, so the model returns one unbounded value per sample.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for checkpoints saved from this model to load again.
        self.fc1 = nn.Linear(input_dim, 128)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a batch of feature rows to a single-column batch of predictions.

        Dropout is only active in training mode (`model.train()`).
        """
        hidden = self.dropout1(torch.relu(self.fc1(x)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.fc3(hidden)

In [73]:
# Training configuration
# -- model / optimisation hyperparameters
input_dim = X.shape[1]   # one input unit per feature column of X
learning_rate = 0.0005
num_epochs = 500         # upper bound; early stopping may end training sooner
batch_size = 32          # NOTE(review): not referenced by the training loop in this notebook, which feeds the whole training set per step

# -- early-stopping bookkeeping
patience = 20            # epochs without validation improvement before stopping
best_val_loss, best_epoch = float('inf'), 0
epochs_no_improve = 0


In [74]:
# Initialize the model, loss function, and optimizer
# Seed PyTorch's global RNG first so the weight initialisation here and the
# dropout masks drawn during training are repeatable on Restart & Run All.
# 8 matches the random_state already used for the data splits.
torch.manual_seed(8)
model = JobSatisfactionNN(input_dim)
criterion = nn.MSELoss()  # mean squared error between predictions and targets
# Adam with L2 regularisation via weight_decay.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0001)

In [75]:
# Train with early stopping on the validation loss.
# Each epoch is one full-batch gradient step: the whole training tensor goes
# through the model at once (no mini-batching).
for epoch in range(num_epochs):
    model.train()  # training mode: dropout active
    optimizer.zero_grad()
    outputs = model(X_train)
    train_loss = criterion(outputs, y_train)
    train_loss.backward()
    optimizer.step()

    # Validation phase: eval mode disables dropout, no_grad skips autograd bookkeeping
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)

    # Early stopping logic based on validation loss
    if val_loss < best_val_loss:
        # New best: remember the epoch (0-based) and checkpoint the weights to disk.
        best_val_loss = val_loss
        best_epoch = epoch
        torch.save(model.state_dict(), 'best_model.pth')
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1

    # Stop once `patience` consecutive epochs have failed to improve.
    if epochs_no_improve == patience:
        print(f'Early stopping at epoch {epoch+1}')
        break

# Print final training and validation error
# These are the losses of the last executed epoch, not of the saved best
# checkpoint; train_loss was computed with dropout active, val_loss without.
print(f'Final Training Loss: {train_loss.item():.4f}')
print(f'Final Validation Loss: {val_loss.item():.4f}')


Early stopping at epoch 108
Final Training Loss: 0.6257
Final Validation Loss: 0.8106


In [76]:
# Merge the training and validation tensors (row-wise) for a final fitting pass.
X_train_val = torch.cat((X_train, X_val), 0)
y_train_val = torch.cat((y_train, y_val), 0)

# Restart from the checkpoint with the lowest validation loss, then take
# best_epoch + 1 further full-batch steps on train + validation data.
# NOTE(review): `optimizer` is reused from the earlier training loop, so its Adam
# state carries over, and these steps come on top of the ones that already
# produced the checkpoint — confirm this is the intended refit procedure.
model.load_state_dict(torch.load('best_model.pth'))  
for epoch in range(best_epoch + 1):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_val)
    loss = criterion(outputs, y_train_val)
    loss.backward()
    optimizer.step()

# Step 4: Test the model
# Evaluates the refitted model (not the saved checkpoint) on the held-out test set.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test)
print(f'Test Loss: {test_loss.item():.4f}')

Test Loss: 0.8644


In [77]:
# Save the best model as the final model
# This copies the early-stopping checkpoint's state_dict to a new file.
# NOTE(review): the test loss printed above belongs to the model refitted on
# train + validation, whose weights are NOT what is written here — confirm
# which set of weights is meant to be shipped.
best_state_dict = torch.load('best_model.pth')
torch.save(best_state_dict, 'final_model.pth')

