In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torchvision
from torchvision import models, datasets, transforms
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm.notebook import tqdm_notebook as tqdm
from PIL import Image
import io, os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
import optuna
from sklearn.model_selection import TimeSeriesSplit
from transformers import BertModel, BertTokenizer
from imblearn.under_sampling import RandomUnderSampler

import warnings
warnings.filterwarnings('ignore')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

  check_for_updates()


In [4]:
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [5]:
train_df = pd.read_csv('/kaggle/input/behaviour-simulation-train/behaviour_simulation_train.csv')
train_df.head()

Unnamed: 0,id,date,likes,content,username,media,inferred company
0,1,2020-12-12 00:47:00,1,"Spend your weekend morning with a Ham, Egg, an...",TimHortonsPH,[Photo(previewUrl='https://pbs.twimg.com/media...,tim hortons
1,2,2018-06-30 10:04:20,2750,Watch rapper <mention> freestyle for over an H...,IndyMusic,[Photo(previewUrl='https://pbs.twimg.com/media...,independent
2,3,2020-09-29 19:47:28,57,Canadian Armenian community demands ban on mil...,CBCCanada,[Photo(previewUrl='https://pbs.twimg.com/media...,cbc
3,4,2020-10-01 11:40:09,152,"1st in Europe to be devastated by COVID-19, It...",MKWilliamsRome,[Photo(previewUrl='https://pbs.twimg.com/media...,williams
4,5,2018-10-19 14:30:46,41,Congratulations to Pauletha Butts of <mention>...,BGISD,[Photo(previewUrl='https://pbs.twimg.com/media...,independent


In [4]:
# base_images_path = '/kaggle/input/adobe-human-behavior-0-50k/train_images'
# for img_path in os.listdir(base_images_path):
#     img = np.array(Image.open(os.path.join(base_images_path, img_path)))
#     plt.imshow(img)
#     print(img_path.split('.'))
#     break

In [4]:
bins = [0, 1000, 5000, float('inf')]
labels = ['0-1000', '1000-5000', '>=5000']

train_df['likes_class'] = pd.cut(train_df['likes'], bins=bins, labels=labels, right=False)

mean_likes_by_class = train_df.groupby('likes_class')['likes'].mean().to_dict()

print(mean_likes_by_class)

{'0-1000': 150.29876586446588, '1000-5000': 2114.3986468108255, '>=5000': 16511.041423866227}


In [5]:
train_df['likes_class'].value_counts()

likes_class
0-1000       262694
1000-5000     29412
>=5000         7894
Name: count, dtype: int64

In [6]:
train_df.loc[train_df['likes_class']=='>=5000']['likes'].std()

25525.58016847943

In [6]:
cls_embed = pd.read_csv('/kaggle/input/bertweet-embed/bertweet_full.csv')
del cls_embed['Unnamed: 0']

In [7]:
cls_embed.head()

Unnamed: 0,cls_embedding_0,cls_embedding_1,cls_embedding_2,cls_embedding_3,cls_embedding_4,cls_embedding_5,cls_embedding_6,cls_embedding_7,cls_embedding_8,cls_embedding_9,...,cls_embedding_758,cls_embedding_759,cls_embedding_760,cls_embedding_761,cls_embedding_762,cls_embedding_763,cls_embedding_764,cls_embedding_765,cls_embedding_766,cls_embedding_767
0,0.023178,0.256953,0.255463,-0.08388,0.15504,-0.116542,-0.004311,-0.172761,0.241986,-0.042757,...,0.096001,0.0646,0.081336,0.042808,-0.054059,-0.045551,-0.008276,-0.076633,-0.044823,0.018629
1,-0.050998,0.342571,0.262547,0.037104,-0.010318,-0.184741,0.08913,-0.08299,0.004877,-0.016541,...,0.227574,0.080511,0.164396,-0.02719,0.179126,-0.132649,0.128336,0.005677,-0.157179,0.019753
2,-0.020711,0.330396,0.212147,0.012729,0.149838,-0.148308,0.078521,-0.032911,0.080688,-0.103758,...,0.089377,0.111796,0.059127,0.094234,-0.050855,-0.073519,0.086718,-0.052929,-0.147353,-0.03161
3,-0.22808,0.218385,0.216307,0.090867,0.020311,-0.110163,0.247068,-0.103976,0.056986,-0.020619,...,0.041639,0.037614,0.173647,-0.006084,0.134102,-0.042047,0.1144,0.037825,-0.045736,-0.242749
4,-0.030641,0.245888,0.326784,-0.088516,-0.073143,-0.171603,0.170291,-0.196158,0.027403,-0.079542,...,0.173601,0.003438,0.187752,0.067493,0.0336,-0.045521,-0.072449,-0.228149,-0.048607,-0.094895


In [8]:
cls_embed['date'] = pd.to_datetime(train_df['date'])
cls_embed = cls_embed.sort_values(by='date').reset_index(drop=True)
cls_embed['likes'] = train_df['likes']
cls_embed['username'] = train_df['username']
cls_embed['inferred company'] = train_df['inferred company']
tscv = TimeSeriesSplit(n_splits=2)

for train_index, val_index in tscv.split(cls_embed):
    train_data = cls_embed.iloc[train_index]
    val_data = cls_embed.iloc[val_index]

    print(f"TRAIN: {train_index}, VAL: {val_index}")

TRAIN: [    0     1     2 ... 99997 99998 99999], VAL: [100000 100001 100002 ... 199997 199998 199999]
TRAIN: [     0      1      2 ... 199997 199998 199999], VAL: [200000 200001 200002 ... 299997 299998 299999]


In [9]:
# X_train = cls_embed.iloc[train_index].set_index([pd.Index(train_index)])
# y_train = train_df.iloc[train_index, -1]
# X_val = cls_embed.iloc[val_index].set_index([pd.Index(val_index)])
# y_val = train_df.iloc[val_index, -1]
X_train = train_data.loc[:,[cols for cols in train_data.columns if cols != 'likes']]
X_val = val_data.loc[:,[cols for cols in train_data.columns if cols != 'likes']]
y_train = train_data['likes']
y_val = val_data['likes']

In [10]:
X_train['year'] = X_train['date'].dt.year
X_train['month'] = X_train['date'].dt.month
X_train['day'] = X_train['date'].dt.day
X_train['day_of_week'] = X_train['date'].dt.dayofweek 

X_train = X_train.drop('date', axis=1)

In [11]:
X_val['year'] = X_val['date'].dt.year
X_val['month'] = X_val['date'].dt.month
X_val['day'] = X_val['date'].dt.day
X_val['day_of_week'] = X_val['date'].dt.dayofweek 

X_val = X_val.drop('date', axis=1)

In [179]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(random_state=42)

# Apply RandomOverSampler to balance the classes
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Checking the distribution of the classes after resampling
print(f"Original class distribution: {Counter(y_train)}")
print(f"Resampled class distribution: {Counter(y_resampled)}")

Original class distribution: Counter({'0-1000': 241062, '1000-5000': 27469, '>=5000': 7344})
Resampled class distribution: Counter({'1000-5000': 241062, '0-1000': 241062, '>=5000': 241062})


In [None]:
sampler = RandomUnderSampler(sampling_strategy = 'auto', random_state=42)
X_train, y_train = sampler.fit_resample(X_train, y_train)

In [84]:
class_weights = {0: 1 / 0.873809, 1: 1 / 0.099570, 2: 1 / 0.026621}
class_weights = {key: value / class_weights[0] for key, value in class_weights.items()}

In [69]:
cat_model = CatBoostClassifier(n_estimators = 1000,
#                         auto_class_weights = 'SqrtBalanced',
#                         class_weights = list(class_weights.values()),
                        cat_features =  [768,769],
                        learning_rate = 0.01,
                        l2_leaf_reg = 0.05,
                        max_depth = 7,
                        loss_function = 'MultiClass',
                        eval_metric = 'MultiClass',
#                         custom_metric = ['Recall'],
                        task_type = 'GPU',
                        random_seed = 42,
                        verbose = 100,)

In [13]:
cat_model = CatBoostRegressor(n_estimators = 1000,
#                         auto_class_weights = 'SqrtBalanced',
#                         class_weights = list(class_weights.values()),
                        cat_features =  [768,769,770,771,772,773],
                        learning_rate = 0.1,
                        l2_leaf_reg = 0.05,
                        max_depth = 9,
                        loss_function = 'RMSE',
                        eval_metric = 'RMSE',
#                         custom_metric = ['Recall'],
                        task_type = 'GPU',
                        random_seed = 42,
                        verbose = 100,)

In [14]:
cat_model.fit(X_train, y_train, eval_set = (np.array(X_val), np.array(y_val)), verbose_eval = 100)

0:	learn: 4631.2361352	test: 5372.5072692	best: 5372.5072692 (0)	total: 68.3ms	remaining: 1m 8s
100:	learn: 3429.6703224	test: 5247.5132028	best: 5236.2120098 (68)	total: 5.81s	remaining: 51.7s
200:	learn: 2894.0024970	test: 5247.5152010	best: 5236.2120098 (68)	total: 11.5s	remaining: 45.6s
300:	learn: 2592.7544265	test: 5241.6378529	best: 5236.2120098 (68)	total: 17.1s	remaining: 39.7s
400:	learn: 2386.5181739	test: 5245.8434125	best: 5236.2120098 (68)	total: 22.8s	remaining: 34.1s
500:	learn: 2221.8153575	test: 5241.5598338	best: 5236.2120098 (68)	total: 28.6s	remaining: 28.5s
600:	learn: 2084.5936328	test: 5238.5234400	best: 5235.5351052 (525)	total: 34.3s	remaining: 22.8s
700:	learn: 1978.1878013	test: 5240.2028165	best: 5235.5351052 (525)	total: 40.2s	remaining: 17.1s
800:	learn: 1879.8008705	test: 5241.7013675	best: 5235.5351052 (525)	total: 46.1s	remaining: 11.4s
900:	learn: 1794.5929583	test: 5242.5412391	best: 5235.5351052 (525)	total: 51.9s	remaining: 5.7s
999:	learn: 1723.19

<catboost.core.CatBoostRegressor at 0x7a467fdd3e20>

In [40]:
# [i for i in range(len(X_val)) if y_val[i] == '0-1000' and y_pred[i] == '>=5000']

In [113]:
y_train.value_counts(normalize = True)

likes_class
0-1000       0.873809
1000-5000    0.099570
>=5000       0.026621
Name: proportion, dtype: float64

In [182]:
y_pred = cat_model.predict(X_val)

In [24]:
y_pred = y_pred.squeeze()

In [25]:
df_y = pd.DataFrame()
df_y['y'] = y_pred
df_y['y'].value_counts()

y
0-1000       22335
1000-5000     1352
>=5000         438
Name: count, dtype: int64

In [185]:
sum([y_pred[i] == '>=5000' and y_val[i] == '>=5000' for i in range(y_val.shape[0])])

289

In [134]:
y_val.value_counts()

likes_class
0-1000       21632
1000-5000     1943
>=5000         550
Name: count, dtype: int64

In [165]:
# (np.array(y_pred) == np.array(y_val)).mean()
accuracy_score(y_val, y_pred)

0.8327461139896373

In [55]:
y_train_cont = train_df.loc[train_indices, 'likes']
y_val_cont = train_df.loc[val_indices, 'likes']

In [186]:
mse(y_val_cont, np.ones(y_val_cont.shape)*y_train_cont.mean(), squared = False)

5499.0509467074535

In [56]:
mse(y_val_cont, [mean_likes_by_class[i] for i in y_pred], squared = False)

5838.973566875456

In [None]:
mse(np.log1p(y_val), np.log1p(y_pred), squared=False)