Predict test dataset's Saved value on Ranknet model

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

class RankNetDataset(Dataset):
    def __init__(self, df, features):
        self.features = df[features].values
        self.labels = df['Saved'].values
        self.response_ids = df['ResponseID'].values

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        features = torch.tensor(self.features[idx], dtype=torch.float32)
        labels = torch.tensor(self.labels[idx], dtype=torch.float32)
        response_id = self.response_ids[idx]
        return features, labels, response_id

class RankNet(nn.Module):
    def __init__(self, input_dim):
        super(RankNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

class RankNetModel:
    def __init__(self, features, model_path='ranknet_model.pth'):
        self.features = features
        self.model_path = model_path
        self.model = RankNet(len(features))

    def train(self, train_df, val_df, batch_size=32, epochs=50, patience=5, learning_rate=0.0005):
        train_dataset = RankNetDataset(train_df, self.features)
        val_dataset = RankNetDataset(val_df, self.features)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate, weight_decay=1e-4)

        best_val_loss = float('inf')
        patience_counter = 0

        for epoch in range(epochs):
            self.model.train()
            train_loss = 0.0
            for features, labels, _ in train_loader:
                labels = labels.view(-1, 1)
                optimizer.zero_grad()
                outputs = self.model(features)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss / len(train_loader):.4f}")

            self.model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for features, labels, _ in val_loader:
                    labels = labels.view(-1, 1)
                    outputs = self.model(features)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()
            val_loss /= len(val_loader)
            print(f"Validation Loss: {val_loss:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                torch.save(self.model.state_dict(), self.model_path)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

    def load_model(self):
        self.model.load_state_dict(torch.load(self.model_path))
        self.model.eval()

    def predict(self, test_df, batch_size=32):
        test_dataset = RankNetDataset(test_df, self.features)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        predictions = []
        with torch.no_grad():
            for features, labels, response_ids in test_loader:
                outputs = torch.sigmoid(self.model(features)).view(-1)
                for i in range(len(response_ids)):
                    predictions.append((response_ids[i], outputs[i].item(), labels[i].item()))

        predictions_df = pd.DataFrame(predictions, columns=['ResponseID', 'Predicted Probability', 'True Label'])

        # Update the logic to group by ResponseID and assign Predicted Saved
        final_predictions = []
        for response_id, group in predictions_df.groupby('ResponseID'):
            group = group.sort_values(by='Predicted Probability', ascending=False)
            group['Predicted Saved'] = [1 if idx == 0 else 0 for idx in range(len(group))]
            final_predictions.append(group)

        return pd.concat(final_predictions)

    def evaluate(self, predictions_df):
        y_true = predictions_df['True Label']
        y_pred = predictions_df['Predicted Saved']

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        roc_auc = roc_auc_score(y_true, predictions_df['Predicted Probability'])

        print(f"\nModel Performance Metrics:\nAccuracy: {accuracy:.4f}\nPrecision: {precision:.4f}\nRecall: {recall:.4f}\nF1 Score: {f1:.4f}\nROC AUC: {roc_auc:.4f}")

        conf_matrix = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Saved', 'Saved'], yticklabels=['Not Saved', 'Saved'])
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.title('Confusion Matrix')
        plt.show()

In [2]:
import pandas as pd
test = pd.read_csv('1230_tets.csv')
tt = ['if_more_vulnerable', 'if_more_children', 'if_more_senior',
       'if_more_criminal', 'if_more_homeless', 'if_more_profession',
       'if_more_large', 'if_more_animal']
test[tt]=0
test.shape

(40, 14)

In [3]:
# predict on test data set
features = ['if_cross_by_rule', 'if_more_people',
       'if_more_male', 'if_more_female', 
       'if_more_vulnerable', 'if_more_children', 'if_more_senior',
       'if_more_criminal', 'if_more_homeless', 'if_more_profession',
       'if_more_large', 'if_more_animal']

ranknet_model = RankNetModel(features=features, model_path='ranknet_model.pth')

ranknet_model.load_model()

batch_size = 5
predictions_list = []

for start_idx in range(0, len(test), batch_size):
    end_idx = start_idx + batch_size
    batch = test.iloc[start_idx:end_idx]

    predicted_batch = ranknet_model.predict(batch)
    predictions_list.append(predicted_batch)

predicted_df = pd.concat(predictions_list, ignore_index=True)

print(predicted_df.head())

  ResponseID  Predicted Probability  True Label  Predicted Saved
0  tensor(1)               0.807092        -1.0                1
1  tensor(1)               0.233863        -2.0                0
2  tensor(2)               0.556622        -2.0                1
3  tensor(2)               0.466010        -1.0                0
4  tensor(3)               0.486835        -1.0                1


In [4]:
predicted_df

Unnamed: 0,ResponseID,Predicted Probability,True Label,Predicted Saved
0,tensor(1),0.807092,-1.0,1
1,tensor(1),0.233863,-2.0,0
2,tensor(2),0.556622,-2.0,1
3,tensor(2),0.46601,-1.0,0
4,tensor(3),0.486835,-1.0,1
5,tensor(3),0.497218,-2.0,1
6,tensor(4),0.842381,-1.0,1
7,tensor(4),0.169891,-2.0,0
8,tensor(5),0.809281,-1.0,1
9,tensor(5),0.223628,-2.0,0
