In [1]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torchmetrics
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GroupShuffleSplit
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm.auto import tqdm
import torch.optim as optim
import os
import pickle
import time
import gc

In [2]:
LOG=True
CLS=False

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [5]:
feature_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
feature_model = BertModel.from_pretrained("bert-base-uncased").to(device)
feature_model.eval()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [6]:
df = pd.read_csv(r"/kaggle/input/adobe-train-data/behaviour_simulation_train.csv")

In [7]:
cap_value = 1000000
df['likes_capped'] = df['likes'].apply(lambda x: min(x, cap_value))
print(df['likes_capped'].describe()) 
bins = [0, 100, 1000, 10000, cap_value]
labels = ['0-100', '101-1k','1k-10k','10k+']
df['likes_binned'] = pd.cut(df['likes_capped'], bins=bins, labels=labels, include_lowest=True)

count    300000.000000
mean        773.364793
std        4931.463419
min           0.000000
25%           3.000000
50%          76.000000
75%         364.000000
max      560193.000000
Name: likes_capped, dtype: float64


In [8]:
sample_fraction = 0.01 
df_small, _ = train_test_split(df, test_size=1-sample_fraction,shuffle=True,random_state=42, stratify=df['likes_binned'])
df_small = df_small.reset_index(drop=1)
df_small = df.copy()
print(df_small['likes_binned'].value_counts(normalize=True))

likes_binned
0-100     0.548557
101-1k    0.327200
1k-10k    0.111780
10k+      0.012463
Name: proportion, dtype: float64


In [9]:
class FeatureDataset(nn.Module):
    def __init__(self, li):
        super().__init__()
        self.li = li
        
    def __len__(self):
        return len(self.li)
    
    def __getitem__(self, index):
        if not LOG:
            return {
                'text' : self.li[index][0],
                'likes' : self.li[index][1]
            }
        return {
                'text' : self.li[index][0],
                'likes' : np.log1p(self.li[index][1])
            }

In [10]:
temp_li = FeatureDataset(df_small[['content', 'likes']].to_numpy())

In [11]:
import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


temp_li_load = DataLoader(temp_li, batch_size = 1024, shuffle = False)

In [12]:
torch.cuda.empty_cache()

memory_allocated = torch.cuda.memory_allocated(device) / (1024**3)  
memory_reserved = torch.cuda.memory_reserved(device) / (1024**3) 

print(f"Memory Allocated: {memory_allocated:.2f} GB")
print(f"Memory Reserved: {memory_reserved:.2f} GB")

Memory Allocated: 0.41 GB
Memory Reserved: 0.41 GB


In [13]:
import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
li = []
for data in tqdm(temp_li_load):
    x = data['text']
    y = data['likes']
    with torch.no_grad():
        encoded_input = feature_tokenizer(x, return_tensors='pt', padding=True, truncation=True).to(device)
        output = feature_model(**encoded_input)
        if CLS: features = output.last_hidden_state[:,0,:].to('cpu')
        else :
            features = output.last_hidden_state.mean(dim=1).to('cpu')
    
    li.extend([(feat, label) for feat, label in zip(features, y)])
    del encoded_input, output, features
#     torch.cuda.empty_cache()
#     gc.collect()

  0%|          | 0/293 [00:00<?, ?it/s]

In [14]:
class dataset(Dataset): 
    def __init__(self, li): 
        self.li = li
  
    def __len__(self): 
        return len(self.li)
  
    def __getitem__(self, index):
        x, y = self.li[index]
        return {
            'features': x,
            'likes': torch.tensor(y, dtype = torch.float)
        }

In [15]:
import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
def make_split(li,group=None,normal=False, split_task='company'):
    if normal==True:
        train_li, val_li  = train_test_split(li, train_size = 0.8,shuffle=True,random_state=42,stratify=df_small['likes_binned'])
    elif split_task=='company':
        gss = GroupShuffleSplit(n_splits=2, train_size=.8, random_state=42)
        train_idx, val_idx = next(gss.split(li,groups= df_small['inferred company']))
        train_li, val_li = [e for i, e in enumerate(li) if i in train_idx] ,[e for i, e in enumerate(li) if i in val_idx]
    elif split_task=='time':
        Y = pd.to_datetime(df_small['date'])
        li = [l for _, l in sorted(zip(Y, li), key=lambda x: x[0])]
        train_idx=len(li)*8//10
        train_li, val_li = li[:train_idx] , li[train_idx:]
    return train_li, val_li

In [16]:
train_li, val_li= make_split(li,split_task='time')

In [17]:
import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

train_data = dataset(train_li)
val_data = dataset(val_li)

In [18]:
import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

train_load = DataLoader(train_data, batch_size=32,shuffle=True, num_workers=4)
val_load = DataLoader(val_data, batch_size=32, shuffle=False, num_workers=4)

In [19]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(
#             nn.BatchNorm1d(768),  
#             nn.ReLU(),
#             nn.Dropout(0.4),
            
            nn.Linear(768, 128),  
            nn.BatchNorm1d(128),  
            nn.ReLU(),           
#             nn.Dropout(0.3),      
            
            nn.Linear(128, 1),  
            #nn.BatchNorm1d(),
#             nn.ReLU(),
#             nn.Dropout(0.3),
            
#             nn.Linear(256, 128),  
#             nn.BatchNorm1d(128),
#             nn.ReLU(),
#             nn.Dropout(0.3),
            
#             nn.Linear(128, 64),   
#             nn.BatchNorm1d(64),
#             nn.ReLU(),
            
#             nn.Linear(64, 1),
#             nn.ReLU()
        )
    
    def forward(self, x):
        x = self.fc(x).squeeze()
        return x

In [20]:
class RMSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, pred, actual):
        return torch.sqrt(self.mse(torch.expm1(pred), torch.expm1(actual)))

In [21]:
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, pred, actual):
        return torch.sqrt(self.mse(pred, actual))

In [22]:
import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

model = Model()
model = model.to(device)
n_epochs = 10
exp_loss = RMSLELoss()
loss = RMSELoss()
optimiser = torch.optim.AdamW(model.parameters(),lr=0.1)
# LR_SCHEDULER = optim.lr_scheduler.OneCycleLR(optimiser, max_lr=1e-3, steps_per_epoch=len(train_load), epochs=n_epochs)
# ACC = torchmetrics.regression.MeanSquaredError(squared = False)
# ACC = ACC.to(device)

In [23]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
import sys

import random
import numpy as np
import torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    net_loss = 0.0
    if LOG: net_exp_loss = 0.0
    
    for data in tqdm(train_load):
        x = data['features'].to(device)
        y = data['likes'].to(device)
        
        pred = model(x)
        iter_loss = loss(pred, y)
        optimiser.zero_grad()  
        iter_loss.backward()
        optimiser.step() 
        net_loss += (iter_loss.item()) / len(train_load) 
        if LOG: iter_exp_loss = exp_loss(pred, y)
        if LOG: net_exp_loss += (iter_exp_loss.item()) / len(train_load) 
    if LOG: 
        print(f"Epoch {epoch+1}, Training RMSE: {net_exp_loss:.4f}")
        print(f"Epoch {epoch+1}, Training Log RMSE: {net_loss:.4f}")
    else:
        print(f"Epoch {epoch+1}, Training RMSE: {net_loss:.4f}")
    
#     LR_SCHEDULER.step()
    
    model.eval()
    val_loss = 0.0
    if LOG: val_exp_loss = 0.0
    with torch.no_grad(): 
        for data in tqdm(val_load):
            x = data['features'].to(device)
            y = data['likes'].to(device)
            pred = model(x)
#             ACC.update(pred, y)
            val_loss += (loss(pred, y).item()) / len(val_load)
            if LOG: val_exp_loss += (exp_loss(pred, y).item()) / len(val_load)
    if val_loss<best_loss:
        best_loss=val_loss
        torch.save(model.state_dict(), 'best_bert_string_only_model.pth')    
    if LOG:
        print(f"Epoch {epoch+1}, Validation RMSE: {val_exp_loss:.4f}")
        print(f"Epoch {epoch+1}, Validation Log RMSE: {val_loss:.4f}")
    else:
        print(f"Epoch {epoch+1}, Validation RMSE: {val_loss:.4f}")
    torch.cuda.empty_cache()
    sys.stdout.flush()


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 1, Training RMSE: 2251.0012
Epoch 1, Training Log RMSE: 1.9490


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 1, Validation RMSE: 3288.9503
Epoch 1, Validation Log RMSE: 2.0192


  0%|          | 0/7500 [00:40<?, ?it/s]

Epoch 2, Training RMSE: 2191.9258
Epoch 2, Training Log RMSE: 1.9320


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 2, Validation RMSE: 3279.9571
Epoch 2, Validation Log RMSE: 1.9960


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 3, Training RMSE: 2192.5414
Epoch 3, Training Log RMSE: 1.9313


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 3, Validation RMSE: 3269.2554
Epoch 3, Validation Log RMSE: 1.9041


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 4, Training RMSE: 2187.8021
Epoch 4, Training Log RMSE: 1.9284


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 4, Validation RMSE: 3232.8041
Epoch 4, Validation Log RMSE: 1.9222


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 5, Training RMSE: 2191.2831
Epoch 5, Training Log RMSE: 1.9296


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 5, Validation RMSE: 3351.4648
Epoch 5, Validation Log RMSE: 1.9423


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 6, Training RMSE: 2181.7676
Epoch 6, Training Log RMSE: 1.9294


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 6, Validation RMSE: 3272.7214
Epoch 6, Validation Log RMSE: 1.9315


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 7, Training RMSE: 2191.9894
Epoch 7, Training Log RMSE: 1.9317


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 7, Validation RMSE: 3269.8683
Epoch 7, Validation Log RMSE: 1.9265


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 8, Training RMSE: 2194.6331
Epoch 8, Training Log RMSE: 1.9309


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 8, Validation RMSE: 3256.6385
Epoch 8, Validation Log RMSE: 1.9314


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 9, Training RMSE: 2188.2635
Epoch 9, Training Log RMSE: 1.9273


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 9, Validation RMSE: 3230.6832
Epoch 9, Validation Log RMSE: 1.8736


  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch 10, Training RMSE: 2185.9671
Epoch 10, Training Log RMSE: 1.9325


  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 10, Validation RMSE: 3215.0273
Epoch 10, Validation Log RMSE: 1.8869


In [25]:
pred_li = []
actual_li = []
model.load_state_dict(torch.load('best_bert_string_only_model.pth'))
model.eval()
test_loss = 0.0
if LOG: test_exp_loss = 0.0
with torch.no_grad(): 
    for data in tqdm(val_load):
        x = data['features'].to(device)
        y = data['likes'].to(device)
        pred = model(x)
        test_loss += (loss(pred, y).item()) / len(val_load) 
        if LOG: test_exp_loss += (exp_loss(pred, y).item()) / len(val_load) 
        pred_li.extend(pred)
        actual_li.extend(y)
if LOG:
    print(f"Epoch {epoch+1}, Test RMSE: {test_exp_loss:.4f}")
    print(f"Epoch {epoch+1}, Test Log RMSE: {test_loss:.4f}")
else:
    print(f"Epoch {epoch+1}, Test RMSE: {test_loss:.4f}")

  0%|          | 0/1875 [00:00<?, ?it/s]

Epoch 10, Test RMSE: 3230.6832
Epoch 10, Test Log RMSE: 1.8736


In [26]:
pred_li = [np.expm1(float(val.cpu().numpy())) if LOG else float(val.cpu().numpy()) for val in pred_li]
actual_li = [np.expm1(float(val.cpu().numpy())) if LOG else float(val.cpu().numpy()) for val in actual_li]


In [27]:
pred_df = pd.DataFrame(pred_li)
actual_df = pd.DataFrame(actual_li)

In [28]:
pred_df.describe()

Unnamed: 0,0
count,60000.0
mean,243.448336
std,524.909731
min,-0.898078
25%,13.539278
50%,86.120695
75%,268.615156
max,27047.817273


In [29]:
actual_df.describe()

Unnamed: 0,0
count,60000.0
mean,1045.567147
std,7186.807148
min,0.0
25%,4.0
50%,101.000001
75%,464.999958
max,447527.966354
