In [92]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torchvision
from torchvision import models, datasets, transforms
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm.notebook import tqdm_notebook as tqdm
from PIL import Image
import io, os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
import optuna
from sklearn.model_selection import TimeSeriesSplit
from transformers import BertModel, BertTokenizer
from imblearn.under_sampling import RandomUnderSampler

import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [3]:
train_df = pd.read_csv('/kaggle/input/behaviour-simulation-train/behaviour_simulation_train.csv')

In [4]:
# base_images_path = '/kaggle/input/adobe-human-behavior-0-50k/train_images'
# for img_path in os.listdir(base_images_path):
#     img = np.array(Image.open(os.path.join(base_images_path, img_path)))
#     plt.imshow(img)
#     print(img_path.split('.'))
#     break

In [4]:
train_df.head()

Unnamed: 0,id,date,likes,content,username,media,inferred company
0,1,2020-12-12 00:47:00,1,"Spend your weekend morning with a Ham, Egg, an...",TimHortonsPH,[Photo(previewUrl='https://pbs.twimg.com/media...,tim hortons
1,2,2018-06-30 10:04:20,2750,Watch rapper <mention> freestyle for over an H...,IndyMusic,[Photo(previewUrl='https://pbs.twimg.com/media...,independent
2,3,2020-09-29 19:47:28,57,Canadian Armenian community demands ban on mil...,CBCCanada,[Photo(previewUrl='https://pbs.twimg.com/media...,cbc
3,4,2020-10-01 11:40:09,152,"1st in Europe to be devastated by COVID-19, It...",MKWilliamsRome,[Photo(previewUrl='https://pbs.twimg.com/media...,williams
4,5,2018-10-19 14:30:46,41,Congratulations to Pauletha Butts of <mention>...,BGISD,[Photo(previewUrl='https://pbs.twimg.com/media...,independent


In [54]:
bins = [0, 1000, 5000, float('inf')]
labels = ['0-1000', '1000-5000', '>=5000']

train_df['likes_class'] = pd.cut(train_df['likes'], bins=bins, labels=labels, right=False)

mean_likes_by_class = train_df.groupby('likes_class')['likes'].mean().to_dict()

print(mean_likes_by_class)

{'0-1000': 150.29876586446588, '1000-5000': 2114.3986468108255, '>=5000': 16511.041423866227}


In [55]:
train_df['likes_class'].value_counts()

likes_class
0-1000       262694
1000-5000     29412
>=5000         7894
Name: count, dtype: int64

In [57]:
train_df.loc[train_df['likes_class']=='>=5000']['likes'].std()

25525.58016847943

In [8]:
companies = np.unique(train_df['inferred company'])
# companies = os.listdir('/kaggle/input/company-wise-10k-zip')

In [9]:
val_company_indices = np.random.choice(np.arange(0,220), 20, replace = False)

In [10]:
train_companies = [companies[i] for i in range(0,220) if i not in val_company_indices]
val_companies = [companies[i] for i in val_company_indices]

In [11]:
train_indices = []
val_indices = []
train_images = []
val_images = []

In [12]:
for i in range(300000):
    if train_df.loc[i, 'inferred company'] in train_companies:
        train_indices.append(i)
    else:
        val_indices.append(i)

In [73]:
base_path = '/kaggle/input/company-wise-10k-zip'

In [77]:
for company in os.listdir(base_path):
    company_path = os.path.join(base_path, company)
    if company not in val_companies:
        for img_path in os.listdir(company_path):
            train_images.append(Image.open(os.path.join(company_path, img_path)))
            train_indices.append(int(img_path.split('.')[0]))
    else:
        for img_path in os.listdir(company_path):
            val_images.append(Image.open(os.path.join(company_path, img_path)))
            val_indices.append(int(img_path.split('.')[0]))

In [13]:
print(len(train_indices), len(val_indices))

275875 24125


In [79]:
model = models.efficientnet_b0(pretrained=True)

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 83.3MB/s]


In [80]:
# model.classifier = nn.Sequential(nn.Linear(1280, 128, bias = True)
# )
model.classifier = nn.Identity()

In [22]:
# model.classifier = nn.Sequential(
# #     nn.Dropout(p=0.2, inplace = True),
#     nn.Linear(in_features=1280, out_features=512, bias=True),
#     nn.ReLU(),
#     nn.Linear(in_features = 512, out_features = 256, bias = True),
#     nn.ReLU(),
#     nn.Linear(in_features = 256, out_features = 1, bias = True),
#     nn.ReLU()
# )

In [81]:
for params in model.parameters():
    params.requires_grad = True

In [82]:
class dataset(Dataset):
    def __init__(self, data, images, indices, transform, train):
        super().__init__()
        self.data = data
        self.transform = transform
        self.images = images
        self.train = train
        self.indices = indices
        
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        img = np.array(self.images[idx])
        y = self.data.iloc[self.indices[idx]]['likes']
        
        if self.train:
            img = self.transform(image=img)['image']
            
        return img, y

In [83]:
train_transform = A.Compose([
                A.Resize(224, 224),
                A.Normalize(),
                ToTensorV2()])

In [84]:
train_data = dataset(train_df, train_images, train_indices, train_transform, True)
val_data = dataset(train_df, val_images, val_indices, train_transform, True)

In [85]:
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=False, num_workers=os.cpu_count())
val_dataloader = DataLoader(val_data, batch_size=32, shuffle=False, num_workers=os.cpu_count())

In [86]:
model = model.to(device)

In [87]:
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self,yhat,y):
        return torch.sqrt(self.mse(yhat,y))

In [88]:
loss_fn = RMSELoss()
optimizer = torch.optim.Adam(params = model.parameters())
# LR_scheduler = troch.optim.lr_scheduler.StepLR(optimizer, 4, 0.1)

In [87]:
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    net_loss = 0.0
    for img, y in tqdm(train_dataloader):
        img = img.to(torch.float32)
        y = y.to(torch.float32)
        img = img.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        y_pred = model(img)
        loss = loss_fn(y_pred, y)
        net_loss += (loss.item()) / len(train_dataloader)
        loss.backward()
        optimizer.step()
    print(f"epoch {epoch+1} | Loss: {net_loss}")
    
    model.eval()
    net_loss_val = 0.0
    with torch.inference_mode():
        for img, y in tqdm(val_dataloader):
            img = img.to(torch.float32)
            y = y.to(torch.float32)
            img = img.to(device)
            y = y.to(device)
            y_val_pred = model(img)
            val_loss = loss_fn(y_val_pred, y)
            net_loss_val += (val_loss.item()) / len(val_dataloader)
        print(f"epoch {epoch+1} | Loss: {net_loss_val}")

In [89]:
train_embeddings = []
val_embeddings = []
with torch.inference_mode():
    for img, _ in tqdm(train_dataloader):
        img = img.to(torch.float32)
        img = img.to(device)
        train_embeddings.append(model(img))
        
    for img, _ in tqdm(val_dataloader):
        img = img.to(torch.float32)
        img = img.to(device)
        val_embeddings.append(model(img))

  0%|          | 0/271 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

In [90]:
train_embeddings = torch.cat(train_embeddings, dim = 0)
val_embeddings = torch.cat(val_embeddings, dim = 0)

In [33]:
# df_train = train_df.iloc[train_indices, [4, 6]]
# df_val = train_df.iloc[val_indices, [4, 6]]

In [91]:
df_train = pd.DataFrame()
df_val = pd.DataFrame()

In [97]:
# df_train['likes'] = train_df.iloc[train_indices, 2]
# df_val['likes'] = train_df.iloc[val_indices, 2]

In [92]:
df_train['likes_class'] = train_df.iloc[train_indices, 2]
df_val['likes_class'] = train_df.iloc[val_indices, 2]

In [93]:
df_train['likes_class'].value_counts()

likes_class
0       227
1       198
2       147
3       136
4       121
       ... 
2570      1
2222      1
7745      1
2029      1
3318      1
Name: count, Length: 2065, dtype: int64

In [94]:
train_embeddings_np = train_embeddings.detach().cpu().numpy()
embedding_columns = [f"embedding_{i}" for i in range(train_embeddings_np.shape[1])]
embeddings_df = pd.DataFrame(train_embeddings_np, columns=embedding_columns)

df_train = pd.concat([df_train.reset_index(drop=True), embeddings_df], axis=1)

In [95]:
val_embeddings_np = val_embeddings.detach().cpu().numpy()
embedding_columns = [f"embedding_{i}" for i in range(val_embeddings_np.shape[1])]
embeddings_df_val = pd.DataFrame(val_embeddings_np, columns=embedding_columns)

df_val = pd.concat([df_val.reset_index(drop=True), embeddings_df_val], axis=1)

In [40]:
# def undersample_majority_vs_minority(df_train, target_column):
#     class_counts = df_train[target_column].value_counts()
#     majority_class_label = class_counts.idxmax()
#     majority_class_size = class_counts.max()

#     df_majority = df_train[df_train[target_column] == majority_class_label]
#     df_minority = df_train[df_train[target_column] != majority_class_label]
    
#     minority_class_size = len(df_minority)
#     df_majority_undersampled = resample(df_majority,
#                                         replace=False,       
#                                         n_samples=minority_class_size,
#                                         random_state=42) 
    
#     df_balanced = pd.concat([df_majority_undersampled, df_minority])
# #     df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)
    
#     return df_balanced

# df_train = undersample_majority_vs_minority(df_train, target_column='likes_class')

# print(df_train['likes_class'].value_counts())

In [41]:
# def undersample_majority_classes(df_train, target_column):
#     class_counts = df_train[target_column].value_counts()
#     minority_class_size = class_counts.min()
#     df_list = []
    
#     for class_label in class_counts.index:
#         df_class = df_train[df_train[target_column] == class_label]

#         if len(df_class) > minority_class_size:
#             df_class = resample(df_class,
#                                 replace=False,     
#                                 n_samples=minority_class_size,  
#                                 random_state=42) 

#         df_list.append(df_class)

#     df_undersampled = pd.concat(df_list)
#     df_undersampled = df_undersampled.sample(frac=1).reset_index(drop=True)
    
#     return df_undersampled

# df_train = undersample_majority_classes(df_train, target_column='likes_class')

# print(df_train['likes_class'].value_counts())

In [42]:
len([cols for cols in df_train.columns if cols != 'likes'])

129

In [86]:
X_train = df_train.loc[:, [cols for cols in df_train.columns if cols != 'likes_class']]
y_train = df_train['likes_class']
X_val = df_val.loc[:, [cols for cols in df_val.columns if cols != 'likes_class']]
y_val = df_val['likes_class']

In [35]:
cat_model1 = CatBoostRegressor(n_estimators = 1000,
                        learning_rate = 0.01,
                        l2_leaf_reg = 0.05,
                        max_depth = 7,
                        loss_function = 'RMSE',
                        eval_metric = 'RMSE',
                        task_type = 'GPU',
                        random_seed = 42,
                        verbose = 100,)
cat_model2 = CatBoostRegressor(n_estimators = 1000,
                        learning_rate = 0.01,
                        l2_leaf_reg = 0.05,
                        max_depth = 7,
                        loss_function = 'RMSE',
                        eval_metric = 'RMSE',
                        task_type = 'GPU',
                        random_seed = 42,
                        verbose = 100,)
cat_model3 = CatBoostRegressor(n_estimators = 1000,
                        learning_rate = 0.01,
                        l2_leaf_reg = 0.05,
                        max_depth = 7,
                        loss_function = 'RMSE',
                        eval_metric = 'RMSE',
                        task_type = 'GPU',
                        random_seed = 42,
                        verbose = 100,)

In [89]:
cls_embed['likes_class'] = train_df['likes_class']

In [90]:
train_df1 = cls_embed[cls_embed['likes_class'] == '0-1000']
train_df2 = cls_embed[cls_embed['likes_class'] == '1000-5000']
train_df3 = cls_embed[cls_embed['likes_class'] == '>=5000']

In [79]:
# X_train1 = X_train.loc[[idx for idx in train_df1.index if idx in train_indices]]
# y_train1 = train_df1.loc[[idx for idx in train_df1.index if idx in train_indices], 'likes']
# X_val1 = X_train.loc[[idx for idx in train_df1.index if idx in val_indices]]
# y_val1 = train_df1.loc[[idx for idx in train_df1.index if idx in val_indices],'likes']

# X_train2 = X_train.loc[[idx for idx in train_df2.index if idx in train_indices]]
# y_train2 = train_df2.loc[[idx for idx in train_df2.index if idx in train_indices], 'likes']
# X_val2 = X_train.loc[[idx for idx in train_df2.index if idx in val_indices]]
# y_val2 = train_df2.loc[[idx for idx in train_df2.index if idx in val_indices],'likes']

# X_train3 = X_train.loc[[idx for idx in train_df3.index if idx in train_indices]]
# y_train3 = train_df3.loc[[idx for idx in train_df3.index if idx in train_indices], 'likes']
# X_val3 = X_train.loc[[idx for idx in train_df3.index if idx in val_indices]]
# y_val3 = train_df3.loc[[idx for idx in train_df3.index if idx in val_indices],'likes']

In [85]:
# # Common filtering to avoid repetitive column selection
# feature_columns = [col for col in train_df1.columns if col != 'likes_class']

# # Use boolean masks to index more efficiently
# train_mask = train_df1.index.isin(train_indices)
# val_mask = train_df1.index.isin(val_indices)

# # For train_df1
# X_train1 = X_train.loc[train_mask]
# y_train1 = train_df1.loc[train_mask, 'likes']
# X_val1 = X_train.loc[val_mask]
# y_val1 = train_df1.loc[val_mask, 'likes']

# # Repeat the same for train_df2 and train_df3
# train_mask2 = train_df2.index.isin(train_indices)
# val_mask2 = train_df2.index.isin(val_indices)

# X_train2 = X_train.loc[train_mask2]
# y_train2 = train_df2.loc[train_mask2, 'likes']
# X_val2 = X_train.loc[val_mask2]
# y_val2 = train_df2.loc[val_mask2, 'likes']

# train_mask3 = train_df3.index.isin(train_indices)
# val_mask3 = train_df3.index.isin(val_indices)

# X_train3 = X_train.loc[train_mask3]
# y_train3 = train_df3.loc[train_mask3, 'likes']
# X_val3 = X_train.loc[val_mask3]
# y_val3 = train_df3.loc[val_mask3, 'likes']

In [None]:
cat_model1.fit(X_train1, y_train1, eval_set = (np.array(X_val1), np.array(y_val1)), verbose_eval = 100)

In [70]:
y_train1.mean()

147.68733769735587

In [60]:
cat_model = CatBoostClassifier(n_estimators = 1500,
                        learning_rate = 0.01,
                        l2_leaf_reg = 0.05,
                        max_depth = 7,
                        loss_function = 'MultiClass',
                        eval_metric = 'MultiClass',
                        task_type = 'GPU',
                        random_seed = 42,
                        verbose = 100,)

In [61]:
cat_model.fit(X_train, y_train, eval_set = (np.array(X_val), np.array(y_val)), verbose_eval = 100)

0:	learn: 1.0850879	test: 1.0854499	best: 1.0854499 (0)	total: 47ms	remaining: 1m 10s
100:	learn: 0.5385351	test: 0.5407354	best: 0.5407354 (100)	total: 4.19s	remaining: 58s
200:	learn: 0.4283088	test: 0.4280916	best: 0.4280916 (200)	total: 8.17s	remaining: 52.8s
300:	learn: 0.3959537	test: 0.3986110	best: 0.3986110 (300)	total: 12s	remaining: 47.7s
400:	learn: 0.3825410	test: 0.3904917	best: 0.3904917 (400)	total: 15.7s	remaining: 43.1s
500:	learn: 0.3745411	test: 0.3882693	best: 0.3882693 (500)	total: 19.5s	remaining: 39s
600:	learn: 0.3686987	test: 0.3875533	best: 0.3875533 (600)	total: 23.2s	remaining: 34.7s
700:	learn: 0.3637366	test: 0.3874277	best: 0.3874182 (693)	total: 26.9s	remaining: 30.6s
800:	learn: 0.3596042	test: 0.3876170	best: 0.3874182 (693)	total: 30.5s	remaining: 26.6s
900:	learn: 0.3559514	test: 0.3875440	best: 0.3874182 (693)	total: 34s	remaining: 22.6s
1000:	learn: 0.3526267	test: 0.3877115	best: 0.3874182 (693)	total: 37.6s	remaining: 18.8s
1100:	learn: 0.349553

<catboost.core.CatBoostClassifier at 0x7c19600a24a0>

In [63]:
y_train.value_counts()

likes_class
0-1000       241062
1000-5000     27469
>=5000         7344
Name: count, dtype: int64

In [29]:
y_pred = cat_model.predict(X_val)

In [30]:
# (np.array(y_pred) == np.array(y_val)).mean()
accuracy_score(y_val, y_pred)

0.8964145077720207

In [31]:
y_train_cont = train_df.loc[train_indices, 'likes']
y_val_cont = train_df.loc[val_indices, 'likes']

In [33]:
mse(y_val_cont, np.ones(y_val_cont.shape)*y_train_cont.mean(), squared = False)

5499.0509467074535

In [34]:
mse(y_val_cont, [mean_likes_by_class[i[0]] for i in y_pred], squared = False)

5529.5190347263815

In [None]:
# train_pool = Pool(X_train, y_train)
# valid_pool = Pool(X_val, y_val)

def objective(trial):
    params = {
#         'iterations': trial.suggest_int('iterations', 500, 2000),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
#         'depth': trial.suggest_int('depth', 4, 10),
        'max_ctr_complexity': trial.suggest_int('max_ctr_complexity', 4, 10),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.1, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
#         'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
#         'od_wait': trial.suggest_int('od_wait', 10, 50),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'verbose': 0  
    }

    cat_model = CatBoostRegressor(**params, loss_function='RMSE', eval_metric = 'RMSE', random_seed=42)
    cat_model.fit(X_train, y_train, cat_features = [0,1], eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=False)
    
    y_pred = cat_model.predict(X_val)
    rmse = mse(y_val, y_pred, squared=False)
    
    return rmse 

study = optuna.create_study(direction='minimize')  

# Optimize the objective function over 100 trials
study.optimize(objective, n_trials=10, timeout=600) 
print("Best parameters found: ", study.best_params)
print("Best RMSE: ", study.best_value)

# Retrain the model on the full training data with the best parameters
best_params = study.best_params
final_model = CatBoostRegressor(**best_params, loss_function='RMSE',eval_metric = 'RMSE', random_seed=42)

# Train the final model
final_model.fit(X_train, y_train, cat_features = [0,1], eval_set=(X_val, y_val), verbose=100)

# Make predictions and evaluate the final model
y_pred = final_model.predict(X_val)
final_rmse = mse(y_val, y_pred, squared=False)
print(f"Final RMSE on validation set: {final_rmse}")

In [None]:
mse(np.log1p(y_val), np.log1p(y_pred), squared=False)

In [43]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.eval()

inputs = tokenizer(list(train_df.loc[train_indices, 'content']), padding=True, truncation=True, return_tensors='pt')

# with torch.no_grad():  
# #     model = model.to(device)
# #     inputs = inputs.to(device)
#     outputs = model(**inputs)
# cls_embeddings = outputs['last_hidden_state'][:, 0, :]  # Shape: (batch_size, hidden_size)

# # Print the CLS embeddings for the first text input
# print(cls_embeddings.shape)  # Should be (batch_size, hidden_size)
# # print(cls_embeddings[0])  # Embedding for the first input text

In [44]:
import gc
def process_in_chunks(model, inputs, batch_size=32):
    model = model.to(device)
    model.eval()  # Set model to evaluation mode
    all_cls_embeddings = []

    # Split inputs into smaller chunks to process them sequentially
    num_batches = len(inputs['input_ids']) // batch_size + 1
    
    with torch.no_grad():  # Disable gradient calculations
        for batch_idx in range(num_batches):
            # Extract a batch of inputs
            batch_inputs = {k: v[batch_idx * batch_size : (batch_idx + 1) * batch_size].to(device)
                            for k, v in inputs.items()}
            
            # Forward pass: get model outputs
            outputs = model(**batch_inputs)
            
            # Extract CLS embeddings (Shape: (batch_size, hidden_size))
            cls_embeddings = outputs['last_hidden_state'][:, 0, :]  
            all_cls_embeddings.append(cls_embeddings.cpu())  # Move to CPU if on GPU
            
            # Clear memory for the current batch
            del batch_inputs, outputs, cls_embeddings
            gc.collect()  # Force garbage collection to free memory
            torch.cuda.empty_cache()  # Clear cache (optional if using GPU)
    
    # Concatenate all CLS embeddings after processing all batches
    all_cls_embeddings = torch.cat(all_cls_embeddings, dim=0)
    
    return all_cls_embeddings

# Example usage with batch processing
cls_embeddings = process_in_chunks(model, inputs, batch_size=32)

# Print shape of the concatenated CLS embeddings
print(cls_embeddings.shape)  # Should be (total_samples, hidden_size)

torch.Size([8670, 768])


In [45]:
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.eval()

inputs = tokenizer(list(train_df.loc[val_indices, 'content']), padding=True, truncation=True, return_tensors='pt')

In [46]:
cls_embeddings_val = process_in_chunks(model, inputs, batch_size=32)

In [47]:
cls_embeddings_val.shape

torch.Size([834, 768])

In [192]:
class bert_downsample(nn.Module):
    def __init__(self, input_size, output_size):
        super(bert_downsample, self, ).__init__()
        self.fc = nn.Linear(in_features = input_size, out_features = output_size, bias = True)
    
    def forward(self, x):
        return self.fc(x)

In [100]:
train_embeddings_cls_np = cls_embeddings.detach().cpu().numpy()
embedding_columns = [f"cls_embedding_{i}" for i in range(train_embeddings_cls_np.shape[1])]
embeddings_df = pd.DataFrame(train_embeddings_cls_np, columns=embedding_columns)

df_train = pd.concat([df_train.reset_index(drop=True), embeddings_df], axis=1)

In [101]:
val_embeddings_cls_np = cls_embeddings_val.detach().cpu().numpy()
embedding_columns = [f"cls_embedding_{i}" for i in range(val_embeddings_cls_np.shape[1])]
embeddings_df = pd.DataFrame(val_embeddings_cls_np, columns=embedding_columns)

df_val = pd.concat([df_val.reset_index(drop=True), embeddings_df], axis=1)

In [21]:
class neural_net(nn.Module):
    def __init__(self, input_size, output_size):
        super(neural_net, self).__init__()
        self.linear1 = nn.Linear(in_features = input_size, out_features = 256, bias = True)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(in_features = 256, out_features = 128, bias = True)
        self.relu2 = nn.ReLU()
        self.linear3 = nn.Linear(in_features = 128, out_features = output_size, bias = True)
    
    def forward(self, x):
        out1 = self.relu1(self.linear1(x))
        out2 = self.relu2(self.linear2(out1))
        out3 = self.linear3(out2)
        return out3
nn_model = neural_net(input_size = 768, output_size = 1).to(device)

In [22]:
class nn_dataset(Dataset):
    def __init__(self, X, y):
        super().__init__()
        self.x = X
        self.y = y
#         self.transform = transform
#         self.train = train
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        X = torch.tensor(self.x.iloc[idx])
        y = torch.tensor(self.y.iloc[idx])
            
        return X, y

In [14]:
cls_embed = pd.read_csv('/kaggle/input/bertweet-embed/bertweet_full.csv')

In [17]:
cls_embed.head()

Unnamed: 0,cls_embedding_0,cls_embedding_1,cls_embedding_2,cls_embedding_3,cls_embedding_4,cls_embedding_5,cls_embedding_6,cls_embedding_7,cls_embedding_8,cls_embedding_9,...,cls_embedding_758,cls_embedding_759,cls_embedding_760,cls_embedding_761,cls_embedding_762,cls_embedding_763,cls_embedding_764,cls_embedding_765,cls_embedding_766,cls_embedding_767
0,0.023178,0.256953,0.255463,-0.08388,0.15504,-0.116542,-0.004311,-0.172761,0.241986,-0.042757,...,0.096001,0.0646,0.081336,0.042808,-0.054059,-0.045551,-0.008276,-0.076633,-0.044823,0.018629
1,-0.050998,0.342571,0.262547,0.037104,-0.010318,-0.184741,0.08913,-0.08299,0.004877,-0.016541,...,0.227574,0.080511,0.164396,-0.02719,0.179126,-0.132649,0.128336,0.005677,-0.157179,0.019753
2,-0.020711,0.330396,0.212147,0.012729,0.149838,-0.148308,0.078521,-0.032911,0.080688,-0.103758,...,0.089377,0.111796,0.059127,0.094234,-0.050855,-0.073519,0.086718,-0.052929,-0.147353,-0.03161
3,-0.22808,0.218385,0.216307,0.090867,0.020311,-0.110163,0.247068,-0.103976,0.056986,-0.020619,...,0.041639,0.037614,0.173647,-0.006084,0.134102,-0.042047,0.1144,0.037825,-0.045736,-0.242749
4,-0.030641,0.245888,0.326784,-0.088516,-0.073143,-0.171603,0.170291,-0.196158,0.027403,-0.079542,...,0.173601,0.003438,0.187752,0.067493,0.0336,-0.045521,-0.072449,-0.228149,-0.048607,-0.094895


In [16]:
# del cls_embed['idx']
del cls_embed['Unnamed: 0']

In [59]:
X_train = cls_embed.iloc[train_indices].reset_index(drop = True)
y_train = train_df.loc[train_indices, 'likes_class'].reset_index(drop = True)
X_val = cls_embed.iloc[val_indices].reset_index(drop = True)
y_val = train_df.loc[val_indices, 'likes_class'].reset_index(drop = True)

In [38]:
sampler = RandomUnderSampler(sampling_strategy = 'auto', random_state=42)
X_train, y_train = sampler.fit_resample(X_train, y_train)

In [23]:
nn_train_data = nn_dataset(X_train, y_train)
nn_val_data = nn_dataset(X_val, y_val)
nn_train_dataloader = DataLoader(nn_train_data, batch_size = 32, shuffle = False, num_workers = os.cpu_count())
nn_val_dataloader = DataLoader(nn_val_data, batch_size = 32, shuffle = False, num_workers = os.cpu_count())

In [None]:
df['date_column'] = pd.to_datetime(df['date_column'])

df = df.sort_values('date_column')

tscv = TimeSeriesSplit(n_splits=2) 

for train_index, val_index in tscv.split(df):
    train_data = df.iloc[train_index]
    val_data = df.iloc[val_index]

    print(f"TRAIN indices: {train_index}, VALIDATION indices: {val_index}")

In [26]:
class RMSELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self,yhat,y):
        return torch.sqrt(self.mse(yhat,y))

loss_fn = RMSELoss()
optimizer_nn = torch.optim.Adam(params = nn_model.parameters())
# LR_scheduler = troch.optim.lr_scheduler.StepLR(optimizer, 4, 0.1)

In [28]:
import gc
torch.cuda.empty_cache()
gc.collect()
n_epochs = 10
for epoch in range(n_epochs):
    nn_model.train()
    net_loss = 0.0
    for X, y in tqdm(nn_train_dataloader):
        X = X.to(torch.float32).to(device)
        y = y.to(torch.float32).to(device)
        optimizer_nn.zero_grad()
        y_pred = nn_model(X)
        loss = loss_fn(y_pred, y)
        net_loss += (loss.item()) / len(nn_train_dataloader)
        loss.backward()
        optimizer_nn.step()
        del X,y
    print(f"epoch {epoch+1} | Loss: {net_loss}")
    
    nn_model.eval()
    net_loss_val = 0.0
    with torch.inference_mode():
        for X, y in tqdm(nn_val_dataloader):
            X = X.to(torch.float32).to(device)
            y = y.to(torch.float32).to(device)
            y_val_pred = nn_model(X)
            val_loss = loss_fn(y_val_pred, y)
            net_loss_val += (val_loss.item()) / len(nn_val_dataloader)
            del X,y
        print(f"epoch {epoch+1} | Loss: {net_loss_val}")

  0%|          | 0/8696 [00:00<?, ?it/s]

epoch 1 | Loss: 2342.627122673733


  0%|          | 0/680 [00:00<?, ?it/s]

epoch 1 | Loss: 3465.1581849042077


  0%|          | 0/8696 [00:00<?, ?it/s]

epoch 2 | Loss: 2342.396011306916


  0%|          | 0/680 [00:00<?, ?it/s]

epoch 2 | Loss: 3464.6593038671176


  0%|          | 0/8696 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [141]:
mse(y_val, np.ones(y_val.shape)*y_train.mean(), squared = False)

2650.4214398588574