In [121]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from datasets import load_dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm_notebook as tqdm

In [122]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [123]:
EPOCHS = 3
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
MAX_SEQ_LENGTH = 128

In [124]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

In [125]:
df = pd.read_csv("/kaggle/input/adobetraindata/behaviour_simulation_train.csv")

In [126]:
bins = [0, 10, 100, 1000, 10000, 100000000]
likes_binned_labels = [f'{bins[i]}-{bins[i + 1]}' for i in range(len(bins) - 2)]
likes_binned_labels.append('10000+')

# Create binned column
df['likes_binned'] = pd.cut(df['likes'], bins=bins, labels=likes_binned_labels, include_lowest=True)
print(df['likes_binned'].value_counts(normalize=True).sort_index())

likes_binned
0-10          0.322850
10-100        0.225707
100-1000      0.327200
1000-10000    0.111780
10000+        0.012463
Name: proportion, dtype: float64


In [127]:
df, _ = train_test_split(df, train_size = 1/30, stratify = df['likes_binned'])
df = df.reset_index(drop = True)

In [128]:
def generate_prompt(row):
    prompt = f"Following is the information about Twitter post. "

    data_description = (f"Text content: {row['content']}, " 
                       f"Inferred company: {row['inferred company']}, " 
                       f"Username: {row['username']}, " 
                       f"Date and time: {row['date']}" 
                   )
    prompt += data_description
    return prompt

In [129]:
df['prompt'] = df.apply(generate_prompt, axis=1)

In [130]:
df.head()

Unnamed: 0,id,date,likes,content,username,media,inferred company,likes_binned,prompt
0,133124,2020-02-18 18:14:04,1,Oyo NULGE Calls Off Sit At Home Order <hyperli...,IndependentNGR,[Photo(previewUrl='https://pbs.twimg.com/media...,independent,0-10,Following is the information about Twitter pos...
1,261362,2018-06-09 07:00:01,44,"SHOCKING: Born Free survey reveals 4,755 dange...",BornFreeFDN,[Photo(previewUrl='https://pbs.twimg.com/media...,free,10-100,Following is the information about Twitter pos...
2,208004,2020-05-14 18:01:00,125,Trudeau urges people to “buy Canadian” as gove...,CBCTheNational,[Video(thumbnailUrl='https://pbs.twimg.com/amp...,cbc,100-1000,Following is the information about Twitter pos...
3,275076,2020-08-10 13:03:08,1,"Asus VivoBook Flip 14 Sports 360-Degree Hinge,...",IndependentNGR,[Photo(previewUrl='https://pbs.twimg.com/media...,independent,0-10,Following is the information about Twitter pos...
4,86651,2019-10-12 14:18:33,0,Lagos State Gov. Launches Endowment Fund For P...,IndependentNGR,[Photo(previewUrl='https://pbs.twimg.com/media...,independent,0-10,Following is the information about Twitter pos...


In [131]:
torch.cuda.empty_cache()

memory_allocated = torch.cuda.memory_allocated(device) / (1024**3)  
memory_reserved = torch.cuda.memory_reserved(device) / (1024**3) 

print(f"Memory Allocated: {memory_allocated:.2f} GB")
print(f"Memory Reserved: {memory_reserved:.2f} GB")

Memory Allocated: 4.00 GB
Memory Reserved: 4.28 GB


In [132]:
df = df[['prompt', 'likes']]

In [133]:
class featuredataset(nn.Module):
    def __init__(self, df):
        super().__init__()
        self.df = df
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        return {
            'text' : self.df['prompt'][index],
            'likes' : self.df['likes'][index]
        }

In [134]:
temp_data = featuredataset(df)

In [135]:
temp_load = DataLoader(temp_data, batch_size = BATCH_SIZE, shuffle = True, num_workers = 0)

In [136]:
input_li = []
attention_mask_li = []

for examples in tqdm(temp_load):
    inputs = tokenizer(
        examples['text'], 
        max_length=MAX_SEQ_LENGTH, 
        padding='max_length', 
        truncation=True, 
        return_tensors="pt"
    )
    
    targets = torch.tensor(examples['likes'], dtype=torch.float).unsqueeze(1)
    input_li.extend(inputs['input_ids'])
    attention_mask_li.extend(inputs['attention_mask'])
    
    del targets, inputs

  0%|          | 0/1250 [00:00<?, ?it/s]

  targets = torch.tensor(examples['likes'], dtype=torch.float).unsqueeze(1)


In [137]:
df['input_ids'] = input_li
df['attention_mask'] = attention_mask_li

In [147]:
class dataset(nn.Module):
    def __init__(self, df):
        super().__init__()
        self.df = df
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        return {
            'input_ids' : self.df['input_ids'][index],
            'attention_mask' : self.df['attention_mask'][index],
            'likes' : torch.tensor(self.df['likes'][index], dtype=torch.float32)
        }

In [148]:
class BartForSentenceRegression(nn.Module):
    def __init__(self):
        super(BartForSentenceRegression, self).__init__()
        self.bart = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
        self.regression_head = nn.Linear(self.bart.config.d_model, 1)  # Regression head

    def forward(self, input_ids, attention_mask):
        # Get the outputs from the BART model
        outputs = self.bart.model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the last hidden state of the <s> token (first token) as the sentence representation
        sentence_representation = outputs.last_hidden_state[:, 0, :]
        # Pass it through the regression head to get the predicted value
        predicted_value = self.regression_head(sentence_representation).squeeze(-1)
        return predicted_value

In [149]:
model = BartForSentenceRegression()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BartForSentenceRegression(
  (bart): BartForConditionalGeneration(
    (model): BartModel(
      (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (encoder): BartEncoder(
        (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
        (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
        (layers): ModuleList(
          (0-5): 6 x BartEncoderLayer(
            (self_attn): BartSdpaAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Lin

In [156]:
train_data = dataset(df)

In [157]:
train_load = DataLoader(train_data, batch_size = BATCH_SIZE*2, shuffle = True, num_workers = 4)

In [158]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)



In [159]:
for epoch in range(30):
    epoch_loss = 0
    for batch in tqdm(train_load):
        # Move batch to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['likes'].to(device)
        
        # Forward pass
        predicted_likes = model(input_ids=input_ids, attention_mask=attention_mask)
        # Calculate MSE loss
        loss = torch.nn.functional.mse_loss(predicted_likes, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_loss = epoch_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}')

print("Training complete!")

  0%|          | 0/625 [00:00<?, ?it/s]

Epoch 1/3, Loss: 8664627.0180


  0%|          | 0/625 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [160]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, data_loader, device):
    model.eval()  # Set the model to evaluation mode
    all_predictions = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation for evaluation
        for batch in tqdm(train_load):
            # Move batch to the device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['likes'].to(device)

            # Forward pass to get predictions
            predicted_likes = model(input_ids=input_ids, attention_mask=attention_mask)

            # Store the predicted and actual values
            all_predictions.extend(predicted_likes.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Convert lists to numpy arrays
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)

    # Calculate evaluation metrics
    mse = mean_squared_error(all_labels, all_predictions)
    mae = mean_absolute_error(all_labels, all_predictions)
    r2 = r2_score(all_labels, all_predictions)

    print(f'Mean Squared Error: {mse:.4f}')
    print(f'Mean Absolute Error: {mae:.4f}')
    print(f'R-squared Score: {r2:.4f}')

    return all_predictions, all_labels

# Example usage of the evaluation function
all_predictions, all_labels = evaluate_model(model, train_loader, device)


  0%|          | 0/625 [00:00<?, ?it/s]

Mean Squared Error: 17319012.0000
Mean Absolute Error: 741.8267
R-squared Score: -0.0262
