In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import r2_score
import pandas as pd

In [None]:
# Load the modelling table. Later cells read its `lemmatized_text` and
# `like_count` columns and treat every column except the last two as features.
Bert_df = pd.read_csv(filepath_or_buffer="model_data.csv")

In [None]:
class CustomTweetDataset(Dataset):
    """Map-style dataset pairing each tweet's tokens with its extra features and target.

    Args:
        tweets: Indexable collection of tweet strings.
        features: Indexable collection of per-tweet feature rows (same length as ``tweets``).
        labels: Indexable collection of per-tweet regression targets.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_length: Length every tokenised tweet is padded/truncated to.
    """

    def __init__(self, tweets, features, labels, tokenizer, max_length):
        self.tweets, self.features, self.labels = tweets, features, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        # One sample per tweet.
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of the raw tweet and model-ready tensors."""
        text, feature_row, target = self.tweets[idx], self.features[idx], self.labels[idx]

        # Tokenise to a fixed length; the tokenizer returns tensors with a leading
        # batch dimension, which is flattened away below.
        tokenizer_kwargs = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_kwargs)

        sample = {'tweet': text}
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['features'] = torch.tensor(feature_row, dtype=torch.float)
        sample['likes'] = torch.tensor(target, dtype=torch.float)
        return sample


def create_data_loader(df, tokenizer, max_length, batch_size, shuffle=False):
    """Build a ``DataLoader`` of ``CustomTweetDataset`` samples from a DataFrame.

    Args:
        df: DataFrame with a ``lemmatized_text`` column (model input text) and a
            ``like_count`` column (regression target). Every column except the
            last two is used as an extra feature, so the frame is assumed to
            keep those two columns last -- NOTE(review): confirm against the CSV.
        tokenizer: Tokenizer handed to ``CustomTweetDataset``.
        max_length: Padded/truncated token length per tweet.
        batch_size: Number of samples per batch.
        shuffle: If True, reshuffle the samples at every epoch (use for
            training). Defaults to False, i.e. sequential order.

    Returns:
        A ``torch.utils.data.DataLoader`` over the constructed dataset.
    """
    dataset = CustomTweetDataset(
        tweets=df.lemmatized_text.to_numpy(),
        # Positional slice: all columns but the trailing two.
        features=df.iloc[:, 0:-2].to_numpy(),
        labels=df.like_count.to_numpy(),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
    )


class BertweetRegressor(torch.nn.Module):
    """Regression head combining a transformer encoder with extra numeric features.

    The encoder's pooled output is concatenated with a ReLU-activated linear
    projection of the extra features, passed through dropout, and mapped to
    ``num_labels`` outputs by a final linear layer.

    Args:
        bertweet_model: Encoder module; must expose ``config.hidden_size`` and
            return an object with a ``pooler_output`` attribute.
        num_features: Width of the extra feature vector per sample.
        num_labels: Number of regression outputs per sample (default 1).
    """

    def __init__(self, bertweet_model, num_features, num_labels=1):
        super().__init__()
        self.bertweet = bertweet_model
        self.features_fc = torch.nn.Linear(num_features, num_features)
        self.dropout = torch.nn.Dropout(0.1)
        # Final layer sees the pooled encoder vector followed by the projected features.
        self.classifier = torch.nn.Linear(bertweet_model.config.hidden_size + num_features, num_labels)

    def forward(self, input_ids, attention_mask, features):
        """Predict targets for a batch.

        Returns:
            Tensor of shape ``(batch,)`` when ``num_labels == 1``, otherwise
            ``(batch, num_labels)``.
        """
        bertweet_output = self.bertweet(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bertweet_output.pooler_output

        features_output = self.features_fc(features)
        features_output = torch.relu(features_output)

        concat_output = torch.cat((pooled_output, features_output), 1)
        concat_output = self.dropout(concat_output)

        logits = self.classifier(concat_output)
        # Drop only the trailing label dimension. A bare squeeze() would also
        # remove the batch dimension for a single-sample batch, yielding a 0-d
        # tensor that no longer matches the (1,)-shaped targets.
        return logits.squeeze(-1)

In [None]:
# Pre-trained BERTweet checkpoint: the encoder weights and the tokenizer that
# goes with them are both fetched under the same "vinai/bertweet-base" name.
bertweet_model = AutoModel.from_pretrained("vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")



In [None]:
# Training configuration.
seed = 42  # same value as the random_state used for the train/validation split
# Seed torch so the regression head's initial weights, dropout masks and any
# DataLoader shuffling are repeatable from one run to the next.
torch.manual_seed(seed)

max_length = 128  # tokens per tweet after padding/truncation
batch_size = 16
# Number of additional features: create_data_loader feeds every column except
# the last two (the text and target columns) to the model as features.
num_features = len(Bert_df.columns) - 2
num_epochs = 10
learning_rate = 2e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Wrap the pre-trained encoder in the regression head, then move the whole
# module onto the selected device.
model = BertweetRegressor(
    bertweet_model=bertweet_model,
    num_features=num_features,
)
model.to(device)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# partition identical across runs.
train_df, val_df = train_test_split(
    Bert_df,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Training batches must be reshuffled every epoch. As called here,
# create_data_loader yields a sequential loader, so its dataset is re-wrapped
# in a DataLoader with shuffle=True.
train_data_loader = DataLoader(
    create_data_loader(train_df, tokenizer, max_length, batch_size).dataset,
    batch_size=batch_size,
    shuffle=True,
)
# Validation order does not affect the metrics, so the sequential loader is kept.
val_data_loader = create_data_loader(val_df, tokenizer, max_length, batch_size)

In [None]:
# Objective: mean squared error between predicted and actual like counts.
loss_fn = torch.nn.MSELoss().to(device)
# AdamW over every parameter of `model`.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.1 when the monitored value (the validation
# loss passed to scheduler.step) stops decreasing.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=1,
    verbose=True,
)



In [None]:
# Fine-tune for `num_epochs` epochs. Each epoch runs one optimisation pass over
# the training loader, then evaluates on the validation loader and feeds the
# validation loss to the plateau scheduler.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_losses = []
    for batch in train_data_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        features = batch["features"].to(device)
        likes = batch["likes"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, features=features)
        # Force a 1-D (batch,) shape so predictions always line up with `likes`,
        # even when the final batch holds a single sample.
        outputs = outputs.reshape(-1)
        loss = loss_fn(outputs, likes)
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()

    train_loss = np.mean(train_losses)
    print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {train_loss}")

    # ---- validation pass ----
    model.eval()
    val_losses = []
    # Reset every epoch, so after the loop these hold the final epoch's values.
    val_predictions = []
    val_actuals = []

    with torch.no_grad():
        for batch in val_data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            features = batch["features"].to(device)
            likes = batch["likes"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, features=features)
            # Same 1-D guarantee as in training; it also keeps the extend()
            # below from failing on a 0-d array.
            outputs = outputs.reshape(-1)
            # Gradients are already disabled, so the loss can be computed on
            # the tensors directly without a CPU/NumPy round-trip.
            loss = loss_fn(outputs, likes)
            val_losses.append(loss.item())

            val_predictions.extend(outputs.cpu().numpy())
            val_actuals.extend(likes.cpu().numpy())

    val_loss = np.mean(val_losses)
    print(f"Epoch {epoch + 1}/{num_epochs} | Validation Loss: {val_loss}")

    # The scheduler lowers the learning rate when the validation loss plateaus.
    scheduler.step(val_loss)

In [1]:
# Coefficient of determination (R2) on the validation predictions collected
# during the final epoch of the training loop.
val_actuals, val_predictions = np.array(val_actuals), np.array(val_predictions)
r2 = r2_score(y_true=val_actuals, y_pred=val_predictions)
print(f"R2 Score: {r2}")

R2 Score: 0.24
