In [52]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
import os
import random
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from tqdm.notebook import tqdm_notebook as tqdm
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7861b47cad90>

In [53]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [54]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['TORCH_USE_CUDA_DSA'] = "1"

In [55]:
feature_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
feature_model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=2).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
df = pd.read_csv(r"/kaggle/input/adobetraindata/behaviour_simulation_train.csv")

In [57]:
def generate_prompt(row):
    prompt = f"Following is the information about Twitter post. "

    data_description = (f"Text content: {row['content']}, " 
                       f"Inferred company: {row['inferred company']}, " 
                       f"Username: {row['username']}, " 
                       f"Date and time: {row['date']}" 
                   )
    prompt += data_description
    return prompt

In [58]:
df['prompt'] = df.apply(generate_prompt, axis=1)

In [59]:
bins = [0, 10000, 100000000]
likes_binned_labels = [f'{bins[i]}-{bins[i + 1]}' for i in range(len(bins) - 2)]
likes_binned_labels.append('10000+')

# Create binned column
df['likes_binned'] = pd.cut(df['likes'], bins=bins, labels=likes_binned_labels, include_lowest=True)

In [60]:
print(df['likes_binned'].value_counts(normalize=True).sort_index())

likes_binned
0-10000    0.987537
10000+     0.012463
Name: proportion, dtype: float64


In [61]:
sample_fraction = 0.01
df_small, _ = train_test_split(df, test_size=1-sample_fraction,shuffle=True,random_state=42, stratify=df['likes_binned'])
df_small = df_small.reset_index(drop=1)
print(df_small['likes_binned'].value_counts(normalize=True))

likes_binned
0-10000    0.987667
10000+     0.012333
Name: proportion, dtype: float64


In [62]:
df = df_small

In [63]:
class FeatureDataset(nn.Module):
    def __init__(self, li):
        super().__init__()
        self.li = li
        
    def __len__(self):
        return len(self.li)
    
    def __getitem__(self, index):
        return {
            'id' : self.li[index][0],
            'text' : self.li[index][1]
        }

In [64]:
def get_one_hot_encoding(s):
    one_hot = [s==label for label in likes_binned_labels]
    return np.array(one_hot, dtype=np.float32)

In [65]:
torch.cuda.empty_cache()

memory_allocated = torch.cuda.memory_allocated(device) / (1024**3)  
memory_reserved = torch.cuda.memory_reserved(device) / (1024**3) 

print(f"Memory Allocated: {memory_allocated:.2f} GB")
print(f"Memory Reserved: {memory_reserved:.2f} GB")

Memory Allocated: 3.03 GB
Memory Reserved: 3.15 GB


In [66]:
class dataset(nn.Module): 
    def __init__(self, li): 
        self.li = li
  
    def __len__(self): 
        return len(self.li)
  
    def __getitem__(self, index):
        x, y = self.li[index]
        if y < 1000:
            y = torch.tensor(1)
        else:
            y = torch.tensor(0)
        return {
            'prompt': x,
            'likes_binned': y
        }

In [67]:
def make_split(li,group=None,normal=False, split_task='company'):
    if normal==True:
        train_li, val_li  = train_test_split(li, train_size = 0.8,shuffle=True,random_state=42,stratify=df_small['likes_binned'])
    elif split_task=='company':
        gss = GroupShuffleSplit(n_splits=2, train_size=.8, random_state=42)
        train_idx, val_idx = next(gss.split(li,groups= df['inferred company']))
        train_li, val_li = [e for i, e in enumerate(li) if i in train_idx] ,[e for i, e in enumerate(li) if i in val_idx]
    elif split_task=='time':
        Y = pd.to_datetime(df_small['date'])
        li = [l for _, l in sorted(zip(Y, li))]
        train_idx=len(li)*8//10
        train_li, val_li = li[:train_idx] , li[train_idx:]
    return train_li, val_li

In [68]:
li = [(prompt, label) for prompt, label in tqdm(zip(df['prompt'], df['likes']))]

0it [00:00, ?it/s]

In [69]:
train_li, val_li= make_split(li)

In [70]:
train_data = dataset(train_li)
val_data = dataset(val_li)

In [71]:
train_load = DataLoader(train_data, batch_size=32,shuffle=True, num_workers=4)
val_load = DataLoader(val_data, batch_size=32, shuffle=False, num_workers=4)

In [72]:
# Set up the optimizer
optimizer = AdamW(feature_model.parameters(), lr=1e-4)

best_loss = float('inf')

# Training loop
epochs = 5
feature_model.train()
for epoch in range(epochs):
    preds, true_labels = [], []
    net_loss = 0.0
    for data in tqdm(train_load):
        prompt = data['prompt']
        labels = data['likes_binned'].to(device)
        optimizer.zero_grad()

        encodings = feature_tokenizer(prompt, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)
        outputs = feature_model(input_ids=encodings['input_ids'], attention_mask=encodings['attention_mask'], labels=labels)
        loss = outputs.loss
        net_loss += loss.item()/len(train_load)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        
        loss.backward()
        optimizer.step()
        
        del prompt, labels, encodings, outputs, loss

    print(f"Epoch {epoch + 1}, Training Loss: {net_loss}")
    accuracy = np.sum(true_labels == np.array(preds))/len(preds)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    
    preds, true_labels = [], []
    net_loss = 0.0
    for data in tqdm(val_load):
        prompt = data['prompt']
        labels = data['likes_binned'].to(device)
            
        with torch.no_grad():
            encodings = feature_tokenizer(prompt, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)
            outputs = feature_model(input_ids=encodings['input_ids'], attention_mask=encodings['attention_mask'], labels=labels)
            loss = outputs.loss
            net_loss += loss.item()/len(val_load)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        
        del prompt, labels, encodings, outputs, loss
            
    print(f"Epoch {epoch + 1}, Validation Loss: {net_loss}")
    accuracy = np.sum(true_labels == np.array(preds))/len(preds)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    if(net_loss < best_loss):
        print("Model Saved")
        best_loss = net_loss
        feature_model.save_pretrained("/kaggle/working/bert_classifier_model")



  0%|          | 0/84 [00:00<?, ?it/s]

Epoch 1, Training Loss: 0.40129729892526356
Accuracy: 86.09%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 1, Validation Loss: 0.32993098822506994
Accuracy: 90.29%
Model Saved


  0%|          | 0/84 [00:00<?, ?it/s]

Epoch 2, Training Loss: 0.3177593335331904
Accuracy: 86.77%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 2, Validation Loss: 0.374344144355167
Accuracy: 90.29%


  0%|          | 0/84 [00:00<?, ?it/s]

Epoch 3, Training Loss: 0.2735531255602836
Accuracy: 89.25%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 3, Validation Loss: 0.6038504242897033
Accuracy: 87.65%


  0%|          | 0/84 [00:00<?, ?it/s]

Epoch 4, Training Loss: 0.4015017217468648
Accuracy: 86.69%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 4, Validation Loss: 0.3230961696668105
Accuracy: 90.29%
Model Saved


  0%|          | 0/84 [00:00<?, ?it/s]

Epoch 5, Training Loss: 0.4035429254706416
Accuracy: 86.69%


  0%|          | 0/11 [00:00<?, ?it/s]

Epoch 5, Validation Loss: 0.3281292508948933
Accuracy: 90.29%


In [73]:
best_model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/bert_classifier_model", num_labels=2).to(device)

In [74]:
preds, true_labels = [], []
net_loss = 0.0
for batch in tqdm(val_load):
    prompt = data['prompt']
    labels = data['likes_binned'].to(device)
            
    with torch.no_grad():
        encodings = feature_tokenizer(prompt, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)
        outputs = best_model(input_ids=encodings['input_ids'], attention_mask=encodings['attention_mask'], labels=labels)
        loss = outputs.loss
        net_loss += loss.item()/len(val_load)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    preds.extend(predictions.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

print(f"Val Loss: {net_loss}")
# Calculate accuracy or other metrics suitable for multiclass classification
accuracy = np.sum(true_labels == np.array(preds))/len(preds)
print(f"Accuracy: {accuracy * 100:.2f}%")

  0%|          | 0/11 [00:00<?, ?it/s]

Val Loss: 0.23001472651958466
Accuracy: 95.00%


In [75]:
preds, true_labels = [], []
net_loss = 0.0
for batch in tqdm(train_load):
    prompt = data['prompt']
    labels = data['likes_binned'].to(device)
            
    with torch.no_grad():
        encodings = feature_tokenizer(prompt, truncation=True, padding=True, max_length=128, return_tensors="pt").to(device)
        outputs = best_model(input_ids=encodings['input_ids'], attention_mask=encodings['attention_mask'], labels=labels)
        loss = outputs.loss
        net_loss += loss.item()/len(train_load)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    preds.extend(predictions.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

print(f"Train Loss: {net_loss}")
# Calculate accuracy or other metrics suitable for multiclass classification
accuracy = np.sum(true_labels == np.array(preds))/len(preds)
print(f"Accuracy: {accuracy * 100:.2f}%")

  0%|          | 0/84 [00:00<?, ?it/s]

Train Loss: 0.23001472651958427
Accuracy: 95.00%


In [76]:
torch.save(feature_model, 'bertclassifiermodel.pth')

In [77]:
from IPython.display import FileLinks
FileLinks(r'/kaggle/working/bert_classifier_model.pth')

In [78]:
feature_model.save_pretrained("/kaggle/working/bert_classifier_model.pth")

In [79]:
feature_tokenizer.save_pretrained("/kaggle/working/bert_classifier_tokenizer.pth")

('/kaggle/working/bert_classifier_tokenizer.pth/tokenizer_config.json',
 '/kaggle/working/bert_classifier_tokenizer.pth/special_tokens_map.json',
 '/kaggle/working/bert_classifier_tokenizer.pth/vocab.txt',
 '/kaggle/working/bert_classifier_tokenizer.pth/bpe.codes',
 '/kaggle/working/bert_classifier_tokenizer.pth/added_tokens.json')