In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_hdf('train_embeddings.h5')
test = pd.read_hdf('test_embeddings.h5')
val = pd.read_hdf('val_embeddings.h5')

## Data Prep for Model

In [3]:
train.head()

Unnamed: 0.1,Unnamed: 0,text,source,prompt_id,text_length,word_count,bert_embeddings,gpt2_embeddings
0,57594,The food is always hot and made fresh. I prefe...,Flan-T5-XL,0,169,34,"[[0.1458053, 0.018536663, 0.25950676, 0.172973...","[[0.07363588, 0.18456551, -0.7668689, -0.40527..."
1,343858,Seriously the slowest service you could ever h...,Human,0,331,63,"[[0.20390975, 0.0711168, 0.18746778, 0.0818906...","[[-0.017117647, 0.13311762, -0.49339196, -0.29..."
2,462221,This reaction is favored at low pressures but ...,Human,0,610,98,"[[-0.46987852, 0.124872394, 0.20436251, -0.054...","[[-0.2966847, -0.14409573, -0.4495244, -0.1983..."
3,100762,"Justin had owned his car for over five years, ...",GPT-3.5,0,550,109,"[[-0.087581664, 0.021991476, 0.24508698, 0.040...","[[0.23361564, -0.12555604, -0.33635634, -0.224..."
4,639192,I got this. One I think you are mistaken it is...,OPT-2.7B,0,193,36,"[[0.10058918, 0.0625017, 0.18879642, 0.1949409...","[[-0.03273765, 0.09610998, -0.46160632, -0.116..."


In [4]:
train.columns

Index(['Unnamed: 0', 'text', 'source', 'prompt_id', 'text_length',
       'word_count', 'bert_embeddings', 'gpt2_embeddings'],
      dtype='object')

In [20]:
from sklearn.preprocessing import StandardScaler

def data_prep(df):
    # Assuming bert_embeddings are already in a compatible format
    bert_embeddings_array = np.array([np.ravel(embedding) for embedding in df['bert_embeddings'].tolist()])

    # Additional features with scaling
    scaler = StandardScaler()
    additional_features = scaler.fit_transform(df[['text_length', 'word_count']].values)

    # Combine into a single feature array
    X = np.column_stack((bert_embeddings_array, additional_features))
    
    # Labels (consider encoding if they are categorical/non-numeric)
    y = df['source'].values
    return X, y 


In [21]:
X_train, y_train = data_prep(train)
X_val, y_val = data_prep(val)
X_test, y_test = data_prep(test)

In [18]:
! pip install xgboost

[33mDEPRECATION: Loading egg at /home/sadibha2/.conda/envs/localization/lib/python3.12/site-packages/MultiScaleDeformableAttention-1.0-py3.12-linux-x86_64.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [22]:
# Create the DMatrix with training data

from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the labels to numeric categories
y_train_encoded = label_encoder.fit_transform(y_train)

# Optionally, transform y_val and other label sets if necessary
y_val_encoded = label_encoder.transform(y_val)



In [20]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train_encoded)



# Define the XGBoost parameters to utilize the GPU
# Use 'gpu_hist' for the 'tree_method' to use GPU accelerated algorithms.
params = {
    'objective': 'multi:softmax',
    'num_class': len(set(y_train)),  # Specify the number of classes if it's a multi-class classification
    'tree_method': 'gpu_hist',       # Use GPU acceleration
    'eval_metric': 'mlogloss',       # Multiclass logloss for evaluation
    'learning_rate': 0.1,            # Learning rate
    'max_depth': 6,                  # Depth of the trees
    'min_child_weight': 1,           # Minimum sum of instance weight (hessian) needed in a child
    'subsample': 0.8,                # Subsample ratio of the training instances
    'colsample_bytree': 0.8,         # Subsample ratio of columns when constructing each tree
    'num_round': 100                 # Number of boosting rounds
}

# Define evaluation set
eval_set = [(dtrain, 'train'), (xgb.DMatrix(X_val, label=y_val_encoded), 'eval')]

# Train the model with evals to watch performance
gpu_model = xgb.train(params, dtrain, num_boost_round=params['num_round'], evals=eval_set)

# Save the model
gpu_model.save_model('xgb_model_gpu.model')




    E.g. tree_method = "hist", device = "cuda"

Parameters: { "num_round" } are not used.



[0]	train-mlogloss:3.06740	eval-mlogloss:3.08083
[1]	train-mlogloss:2.87145	eval-mlogloss:2.89406
[2]	train-mlogloss:2.72593	eval-mlogloss:2.75680
[3]	train-mlogloss:2.60797	eval-mlogloss:2.64629
[4]	train-mlogloss:2.51206	eval-mlogloss:2.55709
[5]	train-mlogloss:2.42938	eval-mlogloss:2.48107
[6]	train-mlogloss:2.35779	eval-mlogloss:2.41603
[7]	train-mlogloss:2.29354	eval-mlogloss:2.35758
[8]	train-mlogloss:2.23553	eval-mlogloss:2.30561
[9]	train-mlogloss:2.18326	eval-mlogloss:2.25893
[10]	train-mlogloss:2.13571	eval-mlogloss:2.21695
[11]	train-mlogloss:2.09207	eval-mlogloss:2.17846
[12]	train-mlogloss:2.05235	eval-mlogloss:2.14444
[13]	train-mlogloss:2.01520	eval-mlogloss:2.11251
[14]	train-mlogloss:1.98110	eval-mlogloss:2.08338
[15]	train-mlogloss:1.94874	eval-mlogloss:2.05581
[16]	train-mlogloss:1.91904	eval-mlogloss:2.03099
[17]	train-mlogloss:1.89089	eval-mlogloss:2.00765
[18]	train-mlogloss:1.86453	eval-mlogloss:1.98618
[19]	train-mlogloss:1.83953	eval-mlogloss:1.96597
[20]	train


    E.g. tree_method = "hist", device = "cuda"



In [21]:
# Create DMatrix for both training and validation sets
dtrain_eval = xgb.DMatrix(X_train, label=y_train_encoded)
dval_eval = xgb.DMatrix(X_val, label=y_val_encoded)

# Predict using the GPU model
y_train_pred = gpu_model.predict(dtrain_eval)
y_val_pred = gpu_model.predict(dval_eval)

# Since the model's objective is 'multi:softmax', it outputs the predicted class directly
# If using 'multi:softprob', you would use `np.argmax()` to get predictions from probabilities


In [22]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
val_accuracy = accuracy_score(y_val_encoded, y_val_pred)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")


Training Accuracy: 67.54%
Validation Accuracy: 53.85%


In [23]:
from sklearn.metrics import classification_report

# Print classification report for both training and validation
print("Training Classification Report:")
print(classification_report(y_train_encoded, y_train_pred))

print("Validation Classification Report:")
print(classification_report(y_val_encoded, y_val_pred))


Training Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.38      0.50      3701
           1       1.00      1.00      1.00       722
           2       0.93      0.98      0.95      1033
           3       0.63      0.41      0.49      5877
           4       0.73      0.34      0.46      5857
           5       0.53      0.49      0.51      5850
           6       0.66      0.47      0.55      5743
           7       0.76      0.61      0.68      5586
           8       0.98      0.32      0.49      2057
           9       0.73      0.70      0.71     12974
          10       0.92      0.19      0.32      2269
          11       0.83      0.65      0.73      4031
          12       0.64      0.98      0.77    102576
          13       0.89      0.32      0.47      2710
          14       0.90      0.26      0.41      3108
          15       0.87      0.27      0.41      3130
          16       0.91      0.38      0.54      

## LSTM Model on BERT Embeddings

In [16]:
import torch

# Setting up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [23]:
from torch.utils.data import DataLoader, TensorDataset

# Assuming X_train includes embeddings concatenated with two additional features
# and is already appropriately scaled/normalized

# Convert data to PyTorch tensors and move to the appropriate device
train_data = TensorDataset(
    torch.tensor(X_train).float().to(device),  # ensure X_train is properly formatted and normalized
    torch.tensor(y_train_encoded).long().to(device)
)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Similar for validation data
val_data = TensorDataset(
    torch.tensor(X_val).float().to(device), 
    torch.tensor(y_val_encoded).long().to(device)
)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)


TextClassifier(
  (lstm): LSTM(768, 256, batch_first=True, bidirectional=True)
  (attention): Attention(
    (softmax): Softmax(dim=1)
  )
  (fc1): Linear(in_features=514, out_features=512, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=512, out_features=33, bias=True)
)

In [28]:
def evaluate(model, val_loader):
    model.eval()  # Set the model to evaluation mode
    total, correct = 0, 0
    with torch.no_grad():
        for embeddings, labels in val_loader:
            outputs = model(embeddings)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    model.train()  # Set the model back to training mode
    return correct / total


In [29]:
for embeddings, labels in train_loader:
    print("Embeddings shape:", embeddings.shape)
    outputs = model(embeddings)


Embeddings shape: torch.Size([32, 770])
Embeddings shape: torch.Size([32, 768])
Additional features shape: torch.Size([32, 2])
LSTM output shape: torch.Size([32, 512])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [13]:
import torch.optim as optim
from tqdm import tqdm

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training and validation loop
num_epochs = 100
for epoch in tqdm(range(num_epochs)):
    model.train()  # Ensure the model is in training mode
    running_loss = 0.0
    for i, (embeddings, labels) in enumerate(train_loader, 1):
        optimizer.zero_grad()
        outputs = model(embeddings)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        if i % 100 == 0:  # Print every 100 mini-batches
            print(f'Epoch {epoch + 1}, Batch {i}, Loss: {running_loss / 100:.4f}')
            running_loss = 0.0

    # Validation accuracy
    val_accuracy = evaluate(model, val_loader)
    print(f'Validation Accuracy after Epoch {epoch + 1}: {val_accuracy:.4f}')


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
  0%|                                                                                                                    | 0/100 [00:01<?, ?it/s]


RuntimeError: shape '[788480, 1]' is invalid for input of size 786432

In [1]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import torch._dynamo
torch._dynamo.disable()


# Assuming the dataset and labels are loaded correctly and y_train_encoded is available
X_train_tensor = torch.tensor(X_train).float()  # Ensure X_train is a numpy array or similar
y_train_tensor = torch.tensor(y_train_encoded).long()

# If using a GPU, move data to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
X_train_tensor, y_train_tensor = X_train_tensor.to(device), y_train_tensor.to(device)

# DataLoader setup
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the model
model = TextClassifier(embedding_dim=768, hidden_dim=256, num_classes=len(set(y_train_encoded)))
model.to(device)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop
model.train()
for epoch in tqdm(range(100)):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


NameError: name 'X_train' is not defined