# New section

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

# Width of the feature vector accepted by SalaryPredictor's first Linear layer.
# NOTE(review): presumably equals the number of feature columns that remain
# after preprocessing and removal of the target column - confirm against the CSV.
input_size = 6

def data_preprocessing(task_1a_dataframe):
    """Return a label-encoded copy of the raw dataframe.

    The 'Age' and 'ExperienceInCurrentDomain' columns are removed, and each of
    'City', 'Education', 'PaymentTier', 'Gender' and 'EverBenched' is replaced
    by the integer codes produced by a freshly fitted ``LabelEncoder``.
    The dataframe passed in is not modified.

    Args:
        task_1a_dataframe: pandas DataFrame containing at least the seven
            columns named above.

    Returns:
        A new DataFrame without the two dropped columns and with the five
        categorical columns integer-encoded. All other columns are unchanged.

    Raises:
        KeyError: if any of the columns named above is missing.
    """
    columns_to_drop = ['Age', 'ExperienceInCurrentDomain']
    categorical_columns = ['City', 'Education', 'PaymentTier', 'Gender', 'EverBenched']

    # DataFrame.drop (without inplace) already returns a new frame, so the
    # caller's dataframe is never mutated.
    encoded_dataframe = task_1a_dataframe.drop(columns=columns_to_drop)

    # One encoder per column: the encoders are not needed after encoding, so
    # they are not retained.
    for col in categorical_columns:
        encoded_dataframe[col] = LabelEncoder().fit_transform(encoded_dataframe[col])

    return encoded_dataframe

def identify_features_and_targets(encoded_dataframe):
    """Separate the 'LeaveOrNot' target column from the remaining features.

    Args:
        encoded_dataframe: DataFrame that contains a 'LeaveOrNot' column.

    Returns:
        A two-element list ``[features, target]`` where ``features`` is the
        dataframe without 'LeaveOrNot' and ``target`` is that column as a
        Series.
    """
    target_column = 'LeaveOrNot'
    features = encoded_dataframe.drop(columns=[target_column])
    labels = encoded_dataframe[target_column]
    return [features, labels]

def load_as_tensors(features_and_targets):
    """Split, standardize and tensorize the data, and build a training loader.

    Args:
        features_and_targets: two-element sequence ``[features, target]``;
            ``target`` must expose ``.values`` (e.g. a pandas Series).

    Returns:
        A five-element list
        ``[X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor, data_loader]``.
        All tensors are float32. ``data_loader`` iterates over the training
        tensors only, in shuffled batches of 64.
    """
    features, labels = features_and_targets

    # Fixed 80/20 split; random_state pins the partition.
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Statistics come from the training portion only and are then applied to
    # both portions.
    standardizer = StandardScaler().fit(train_X)
    train_X = standardizer.transform(train_X)
    test_X = standardizer.transform(test_X)

    def as_float_tensor(values):
        return torch.tensor(values, dtype=torch.float32)

    X_train_tensor = as_float_tensor(train_X)
    X_test_tensor = as_float_tensor(test_X)
    y_train_tensor = as_float_tensor(train_y.values)
    y_test_tensor = as_float_tensor(test_y.values)

    data_loader = DataLoader(
        TensorDataset(X_train_tensor, y_train_tensor),
        batch_size=64,
        shuffle=True,
    )

    return [X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor, data_loader]

class SalaryPredictor(nn.Module):
    """Feed-forward network with one hidden layer and a sigmoid output.

    Layers: Linear(input_size -> 16), ReLU, Linear(16 -> 1), Sigmoid.
    ``input_size`` is read from the module-level constant when the model is
    constructed. The output is one value in (0, 1) per input row.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 16)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(16, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a feature tensor to sigmoid-activated outputs."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

def model_loss_function():
    """Return a new binary cross-entropy loss (``nn.BCELoss``) instance.

    BCELoss expects probabilities as input, which is what a sigmoid output
    layer produces.
    """
    return nn.BCELoss()

def model_optimizer(model):
    """Return an Adam optimizer over ``model``'s parameters.

    Args:
        model: object exposing ``parameters()`` (e.g. an ``nn.Module``).

    Returns:
        ``optim.Adam`` configured with a learning rate of 0.001.
    """
    learning_rate = 0.001
    return optim.Adam(params=model.parameters(), lr=learning_rate)

def model_number_of_epochs():
    """Return the number of training epochs (currently 100)."""
    return 100

def training_function(model, number_of_epochs, tensors_and_iterable_training_data, loss_function, optimizer):
    """Train ``model`` in place with mini-batches and a step learning-rate decay.

    Args:
        model: network to train; called as ``model(batch_features)``.
        number_of_epochs: how many full passes over the loader to run.
        tensors_and_iterable_training_data: five-element sequence whose last
            item is the iterable of ``(features, labels)`` batches; the other
            four items are not used here.
        loss_function: callable ``loss_function(predictions, targets)``.
        optimizer: optimizer already bound to ``model``'s parameters.

    Returns:
        The same ``model`` object after training.
    """
    _, _, _, _, batches = tensors_and_iterable_training_data

    # The learning rate is multiplied by 0.1 after every 10 epochs.
    lr_schedule = StepLR(optimizer, step_size=10, gamma=0.1)

    for _ in range(number_of_epochs):
        model.train()
        for features, labels in batches:
            optimizer.zero_grad()
            # Labels arrive flat; reshape to one column to match the model output.
            targets = labels.view(-1, 1)
            outputs = model(features)
            batch_loss = loss_function(outputs, targets)
            batch_loss.backward()
            optimizer.step()

        # The schedule advances once per epoch, not once per batch.
        lr_schedule.step()

    return model

def validation_function(trained_model, tensors_and_iterable_training_data):
    """Compute classification accuracy of ``trained_model`` on the held-out test tensors.

    The model is switched to eval mode and run on the test features without
    gradient tracking. Outputs of 0.5 or more count as class 1. The training
    DataLoader stored in the last position of the input is deliberately
    ignored, so the score reflects unseen data only.

    Args:
        trained_model: model called as ``trained_model(features)`` that
            returns one probability per row.
        tensors_and_iterable_training_data: five-element sequence
            ``[X_train, X_test, y_train, y_test, data_loader]``; only
            ``X_test`` and ``y_test`` are used.

    Returns:
        Fraction of test samples predicted correctly, as a float in [0, 1].
        Returns 0.0 when the test set is empty.
    """
    _, X_test_data, _, y_test_data, _ = tensors_and_iterable_training_data

    trained_model.eval()

    total_samples = X_test_data.size(0)
    if total_samples == 0:
        # Avoid a ZeroDivisionError when there is nothing to evaluate.
        return 0.0

    with torch.no_grad():
        predictions = trained_model(X_test_data)
        # Flatten both sides so an (N, 1) output compares element-wise with
        # (N,) labels instead of broadcasting to (N, N).
        predicted_labels = (predictions >= 0.5).float().reshape(-1)
        true_labels = y_test_data.reshape(-1)
        correct_predictions = (predicted_labels == true_labels).sum().item()

    model_accuracy = correct_predictions / total_samples

    return model_accuracy

def tune_random_forest_hyperparameters(X_train, y_train):
    """Grid-search a RandomForestClassifier and return the best estimator.

    The search covers 3 values for each of ``n_estimators``, ``max_depth``,
    ``min_samples_split`` and ``min_samples_leaf`` (81 combinations), scored
    by accuracy under 5-fold cross-validation, using all available cores.

    Args:
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The ``best_estimator_`` found by the search.
    """
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )

    # A fixed random_state makes each candidate forest reproducible.
    forest = RandomForestClassifier(random_state=42)
    searcher = GridSearchCV(
        forest,
        search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    return searcher.best_estimator_

if __name__ == "__main__":
    # End-to-end experiment: load and encode the dataset, tune a Random Forest
    # and train the neural network on the same training subset, compare both on
    # a shared validation subset, export the network, and finally score the
    # better model on the held-out test set.
    task_1a_dataframe = pd.read_csv('/content/task_1a_dataset.csv')
    encoded_dataframe = data_preprocessing(task_1a_dataframe)
    features_and_targets = identify_features_and_targets(encoded_dataframe)
    tensors_and_iterable_training_data = load_as_tensors(features_and_targets)
    # The loader returned by load_as_tensors spans the whole training split,
    # including the rows held out for validation below, so it is not used here.
    X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor, _ = tensors_and_iterable_training_data

    # Split the training data into training and validation sets.
    # load_as_tensors has already standardized the features, so no second
    # scaler is applied: both models and every split see the same scaling.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_tensor.numpy(), y_train_tensor.numpy(), test_size=0.2, random_state=42
    )

    # Loaders for the neural network. The training loader holds only the
    # training subset so validation rows never influence the weights.
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32),
    )
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_dataset = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32),
    )
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

    # Tune hyperparameters of the Random Forest classifier. GridSearchCV refits
    # the best configuration on all of X_train by default, so the returned
    # estimator is already trained and needs no extra fit() call.
    best_rf = tune_random_forest_hyperparameters(X_train, y_train)

    # Evaluate the Random Forest model on the validation set
    val_predictions = best_rf.predict(X_val)
    val_accuracy_rf = accuracy_score(y_val, val_predictions)

    print(f"Validation Accuracy (Random Forest): {val_accuracy_rf:.4f}")

    # Neural network training
    model = SalaryPredictor()
    loss_function = model_loss_function()
    optimizer = model_optimizer(model)
    number_of_epochs = model_number_of_epochs()

    # Defined up front so the model comparison below works even with 0 epochs.
    val_accuracy_nn = 0.0

    # Training loop
    for epoch in range(number_of_epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            predictions = model(batch_X)
            batch_y = batch_y.view(-1, 1)
            loss = loss_function(predictions, batch_y)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        correct_predictions = 0
        total_samples = 0
        with torch.no_grad():
            for batch_X, batch_y in val_loader:
                batch_y = batch_y.view(-1, 1)
                predictions = model(batch_X)
                predicted_labels = (predictions >= 0.5).float()
                correct_predictions += (predicted_labels == batch_y).sum().item()
                total_samples += batch_y.size(0)
        val_accuracy_nn = correct_predictions / total_samples
        print(f"Epoch {epoch+1}/{number_of_epochs}, Validation Accuracy (NN): {val_accuracy_nn:.4f}")

    print("Training complete.")

    # Export the trained network as TorchScript, traced with one training row.
    # torch.jit.save returns None, so its result is not kept.
    model.eval()
    x = torch.tensor(X_train[0], dtype=torch.float32)
    torch.jit.save(torch.jit.trace(model, (x)), "task_1a_trained_model.pth")

    # Choose the best model based on validation accuracy and evaluate it on the
    # test set. X_test_tensor is already standardized by load_as_tensors.
    # The two models expose different inference APIs, so each gets its own path.
    if val_accuracy_rf >= val_accuracy_nn:
        test_predictions = best_rf.predict(X_test_tensor.numpy())
    else:
        with torch.no_grad():
            test_predictions = (model(X_test_tensor) >= 0.5).float().view(-1).numpy()
    test_accuracy = accuracy_score(y_test_tensor.numpy(), test_predictions)

    print(f"Accuracy on the test set: {test_accuracy:.4f}")

  val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32))


Validation Accuracy (Random Forest): 0.8342
Epoch 1/100, Validation Accuracy (NN): 0.6536
Epoch 2/100, Validation Accuracy (NN): 0.6995
Epoch 3/100, Validation Accuracy (NN): 0.7116
Epoch 4/100, Validation Accuracy (NN): 0.7224
Epoch 5/100, Validation Accuracy (NN): 0.7332
Epoch 6/100, Validation Accuracy (NN): 0.7358
Epoch 7/100, Validation Accuracy (NN): 0.7318
Epoch 8/100, Validation Accuracy (NN): 0.7399
Epoch 9/100, Validation Accuracy (NN): 0.7385
Epoch 10/100, Validation Accuracy (NN): 0.7399
Epoch 11/100, Validation Accuracy (NN): 0.7439
Epoch 12/100, Validation Accuracy (NN): 0.7439
Epoch 13/100, Validation Accuracy (NN): 0.7803
Epoch 14/100, Validation Accuracy (NN): 0.7803
Epoch 15/100, Validation Accuracy (NN): 0.7803
Epoch 16/100, Validation Accuracy (NN): 0.7817
Epoch 17/100, Validation Accuracy (NN): 0.7803
Epoch 18/100, Validation Accuracy (NN): 0.7803
Epoch 19/100, Validation Accuracy (NN): 0.7803
Epoch 20/100, Validation Accuracy (NN): 0.7803
Epoch 21/100, Validation 