In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt



In [14]:

data = pd.read_csv('processed_data_final.csv')
X = data.drop(columns=['JobSatisfaction_O'])
y = data['JobSatisfaction_O']

In [15]:
# Identify columns by category
numerical_fixed_columns = [col for col in X.columns if col.endswith('_F') and X[col].dtype == 'float64']
numerical_actionable_columns = [col for col in X.columns if col.endswith('_A') and X[col].dtype == 'float64']
binary_fixed_columns = [col for col in X.columns if col.endswith('_F') and X[col].dtype == 'int64']
binary_actionable_columns = [col for col in X.columns if col.endswith('_A') and X[col].dtype == 'int64']

In [16]:
binary_actionable_groups = {}
for col in binary_actionable_columns:
    question_prefix = '_'.join(col.split('_')[:-2])
    if question_prefix not in binary_actionable_groups:
        binary_actionable_groups[question_prefix] = []
    binary_actionable_groups[question_prefix].append(col)

In [17]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X[numerical_fixed_columns + numerical_actionable_columns] = scaler.fit_transform(X[numerical_fixed_columns + numerical_actionable_columns])

In [18]:
preprocessing_info = {
    'numerical_fixed_columns': numerical_fixed_columns,
    'numerical_actionable_columns': numerical_actionable_columns,
    'binary_fixed_columns': binary_fixed_columns,
    'binary_actionable_groups': binary_actionable_groups,
    'scaler': scaler
}

In [19]:
# Convert the data to PyTorch tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)


In [20]:
class JobSatisfactionNN(nn.Module):
    def __init__(self, input_dim, dropout_rate=0.5):
        super(JobSatisfactionNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(128, 64)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.fc3 = nn.Linear(64, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

In [21]:
# Training configuration
input_dim = X.shape[1]
num_epochs = 50
batch_size = 32
learning_rate = 0.0005
k_folds = 10

In [22]:
# Initialize the model, loss function, and optimizer
model = JobSatisfactionNN(input_dim, dropout_rate=0.3)
criterion = nn.MSELoss()
# Initialize the optimizer with L2 regularization
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0001)  

In [23]:
# Set up cross-validation
kf = KFold(n_splits=k_folds, shuffle=True, random_state=5)
fold_train_losses = []
fold_val_losses = []

In [24]:
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'Fold {fold+1}/{k_folds}')
    
    # Split data
    X_train, X_val = X_tensor[train_idx], X_tensor[val_idx]
    y_train, y_val = y_tensor[train_idx], y_tensor[val_idx]
    
    # Create DataLoader
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    # Train the model
    for epoch in range(num_epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
        
        if (epoch+1) % 10 == 0:
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_val)
                val_loss = criterion(val_outputs, y_val)
            print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}')
    
    # Final validation loss for the fold
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val)
    print(f'Fold {fold+1} Validation Loss: {val_loss.item():.4f}\n')

Fold 1/10
Epoch 10/50, Loss: 0.1377, Val Loss: 0.9222
Epoch 20/50, Loss: 0.8377, Val Loss: 0.9650
Epoch 30/50, Loss: 0.0110, Val Loss: 1.0240
Epoch 40/50, Loss: 0.5212, Val Loss: 1.0579
Epoch 50/50, Loss: 0.0011, Val Loss: 1.0158
Fold 1 Validation Loss: 1.0158

Fold 2/10
Epoch 10/50, Loss: 0.1695, Val Loss: 0.1639
Epoch 20/50, Loss: 0.1088, Val Loss: 0.2561
Epoch 30/50, Loss: 0.0702, Val Loss: 0.3267
Epoch 40/50, Loss: 0.2359, Val Loss: 0.3884
Epoch 50/50, Loss: 0.0003, Val Loss: 0.4178
Fold 2 Validation Loss: 0.4178

Fold 3/10
Epoch 10/50, Loss: 0.0934, Val Loss: 0.1029
Epoch 20/50, Loss: 0.4788, Val Loss: 0.1630
Epoch 30/50, Loss: 0.2874, Val Loss: 0.2198
Epoch 40/50, Loss: 0.8649, Val Loss: 0.2651
Epoch 50/50, Loss: 0.3196, Val Loss: 0.2864
Fold 3 Validation Loss: 0.2864

Fold 4/10
Epoch 10/50, Loss: 0.1364, Val Loss: 0.0878
Epoch 20/50, Loss: 0.0955, Val Loss: 0.1412
Epoch 30/50, Loss: 0.0277, Val Loss: 0.1672
Epoch 40/50, Loss: 0.0000, Val Loss: 0.2241
Epoch 50/50, Loss: 0.0002, V