Whenever a cell starts with <br>
%%script false --no-raise-error <br>
comment that line out to run the cell, and uncomment it to skip the cell.

In [None]:
import utils
import os
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

##########################################################
### not needed if you do not want the package
### can be removed without problem
##########################################################
import dpctl
print(dpctl.get_devices())
from sklearnex import patch_sklearn
patch_sklearn()
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Show which accelerator torch detects; this is informational only.
print(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# The notebook itself always works on the CPU: `device` is the default
# `map_location` used when the feature tensors are loaded.
device = 'cpu'

# Directory names used when reading the raw data and the extracted features.
DIRPATH_EXTRACTED_FEATURES = "extracted_features"
DIR_PATH = "Data"

In [None]:
##########################################################
### section needed for fair comparison with MLP
### can be removed without problem
##########################################################
import pickle

def _read_split_indices(path):
    """Unpickle and return the object stored at `path`.

    NOTE: unpickling executes arbitrary code, so only point this at files
    produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Stored train/validation/test indices (see the section banner above:
# they exist for a fair comparison with the MLP).
train_dataset_indices = _read_split_indices('data_split_indices/train_dataset_indices.pkl')
val_dataset_indices = _read_split_indices('data_split_indices/val_dataset_indices.pkl')
test_dataset_indices = _read_split_indices('data_split_indices/test_dataset_indices.pkl')

# One size per line: train, validation, test.
print(
    len(train_dataset_indices),
    len(val_dataset_indices),
    len(test_dataset_indices),
    sep='\n',
)

def fix_seed(seed):
    """Seed every random number generator this notebook draws from.

    Besides Python's `random` module and torch, NumPy's global generator is
    seeded as well: scikit-learn's `train_test_split` and pandas'
    `DataFrame.sample` fall back to it when no `random_state` is passed, so
    without this the data split is not reproducible.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # np.random.seed only accepts values in [0, 2**32 - 1]; wrap larger or
    # negative seeds into that range instead of raising.
    np.random.seed(seed % (2 ** 32))
    torch.manual_seed(seed)

# Seed the RNGs once, using the seed constant defined in `utils`.
fix_seed(utils.SEED)
#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%

In [3]:
class CustomDataset():
    """Assemble the feature matrix and target column and split them for training.

    For every brand, the tensors named in ``filenames_features`` are loaded
    from ``DIRPATH_EXTRACTED_FEATURES/<brand>/`` and concatenated column-wise;
    the per-brand blocks are then stacked row-wise. The target tensor is
    loaded the same way from ``filename_target``.

    Rows listed in the module-level ``test_dataset_indices`` are set aside as
    the final test set. The remaining rows are split randomly into a training
    part and a validation part.

    Attributes:
        log_target: Whether the natural log was applied to the target.
        center_target: Whether the target mean was subtracted.
        centering_shift: The subtracted mean, a Series with the single key
            ``'target'``. Only set when ``center_target`` is true; it is
            computed over all rows, before any split.
        features_train, target_train: Training split.
        features_test, target_test: Validation split from ``train_test_split``.
        features_final_test, target_final_test: Held-out rows selected by
            ``test_dataset_indices``.
    """

    def __init__(self, brands: list, filenames_features: list, filename_target=utils.FILENAME_RELEVANCE_WINDOW, log_target=False, center_target=False, device=device, random_state=None):
        """Load, transform and split the data.

        Args:
            brands: Sub-directory names (one per brand) to load and stack.
            filenames_features: Feature tensor file names, concatenated in
                this order along the column axis.
            filename_target: File name of the target tensor.
            log_target: Apply ``np.log`` to the target.
            center_target: Subtract the (possibly logged) target mean.
            device: ``map_location`` passed to ``torch.load``.
            random_state: Optional seed for the train/validation split. The
                default ``None`` keeps drawing from NumPy's global generator.
        """
        self.log_target = log_target
        self.center_target = center_target

        def load(brand, filename):
            # NOTE: weights_only=False lets torch.load unpickle arbitrary
            # objects; only use it on files this project produced.
            path = os.path.join(DIRPATH_EXTRACTED_FEATURES, brand, filename)
            return torch.load(path, map_location=device, weights_only=False)

        feature_tensor = torch.concat(
            [torch.concat([load(brand, name) for name in filenames_features], dim=1)
             for brand in brands],
            dim=0)
        features = pd.DataFrame(feature_tensor.cpu().detach().numpy())

        target_tensor = torch.concat([load(brand, filename_target) for brand in brands], dim=0)
        target = pd.DataFrame(target_tensor.cpu().detach().numpy())
        target.rename(columns={0: 'target'}, inplace=True)

        if log_target:
            target = np.log(target)
        if center_target:
            self.centering_shift = target.mean()
            target = target - self.centering_shift

        ###################################################
        ### section needed for fair comparison with MLP
        ### can be removed without problem
        ###################################################
        dataset = pd.concat([features, target], axis=1)
        dataset_test = dataset.loc[test_dataset_indices]
        dataset_rest = dataset.drop(index=test_dataset_indices)

        self.features_final_test = dataset_test.drop(columns=['target'])
        self.target_final_test = dataset_test.filter(['target'])

        features = dataset_rest.drop(columns=['target'])
        target = dataset_rest.filter(['target'])
        #%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#%#

        # 1/9 of the non-test rows become the validation split
        # (presumably to match the stored validation share; TODO confirm).
        self.features_train, self.features_test, self.target_train, self.target_test = train_test_split(
            features, target, test_size=1/9, random_state=random_state)

    def __len__(self):
        """Return the number of loaded samples across train, validation and final test."""
        return len(self.target_train) + len(self.target_test) + len(self.target_final_test)
    
class CustomRegressor():
    """Random forest regressor with optional target balancing and sample weights.

    Attributes:
        percentile: Percentile of the per-target-value row counts used as the
            down-sampling cap in `process_data`; 100 disables down-sampling.
        random_state: Seed shared by the forest and the down-sampling.
        regressor: The wrapped `RandomForestRegressor`.
        weight_fun: Callable mapping an untransformed target to a sample
            weight. Assigning to `self.weight_fun` overrides it per instance.
    """

    def __init__(self, percentile=100, random_state=0):
        """Create the forest.

        Args:
            percentile: See the class docstring.
            random_state: Seed for the forest and for down-sampling.
        """
        self.percentile = percentile
        self.random_state = random_state

        self.regressor = RandomForestRegressor(
            n_estimators=100,
            criterion='squared_error',
            min_samples_split=150,
            min_samples_leaf=50,
            max_depth=20,
            max_samples=.5,
            oob_score=True,
            random_state=random_state,
            n_jobs=-1,
        )

    @staticmethod
    def weight_fun(x):
        """Return the sample weight for target value `x` on its original scale.

        The weight is lowest (40) at x = 24 and rises quickly on both sides
        towards 100 plus a small quadratic term.
        """
        # Alternative curve tried by the author:
        # -150 * np.e ** (-0.01 * np.abs(x - 26)) + 200
        return -60 * np.e ** (-0.5 * np.abs(x - 24)) + 100 + 0.0003 * (x - 24) ** 2

    def process_data(self, Data: "CustomDataset") -> pd.DataFrame:
        """Return the training rows with over-represented target values capped.

        Target values that occur more often than the `self.percentile`-th
        percentile of all value counts are down-sampled to that count. Rows
        are returned grouped by target value, most frequent value first.

        Args:
            Data: Object exposing `features_train` and `target_train`.

        Returns:
            Features plus a `'target'` column.
        """
        target_train = Data.target_train.rename(columns={0: 'target'})

        df = pd.concat([Data.features_train, target_train], axis=1)
        if df.empty:
            # Nothing to balance, and np.percentile / pd.concat reject empty input.
            return df
        target_counts = df['target'].value_counts()

        threshold = np.percentile(target_counts.values, self.percentile)
        to_downsample = set(target_counts[target_counts > threshold].index)

        # Group once instead of masking the whole frame for every target value.
        rows_by_target = df.groupby('target', sort=False)

        balanced_data = []
        for value in target_counts.index:
            subset = rows_by_target.get_group(value)
            if value in to_downsample:
                # Seeded so the balanced set is reproducible.
                subset = subset.sample(int(threshold), random_state=self.random_state)
            balanced_data.append(subset)

        return pd.concat(balanced_data)

    def fit(self, Data: "CustomDataset", weighted=False):
        """Fit the forest on the balanced training data.

        Args:
            Data: Dataset providing the training split and the
                `log_target` / `center_target` flags.
            weighted: If true, weight every sample with `self.weight_fun`.
        """
        balanced_df = self.process_data(Data)

        features_train = balanced_df.drop(columns=['target'])
        target_train = balanced_df['target']

        weights = None
        if weighted:
            # The weight curve is defined on the untransformed target, so undo
            # centring and the log transform (in that order) before applying it.
            raw_target = target_train
            if getattr(Data, 'center_target', False):
                raw_target = raw_target + float(np.ravel(Data.centering_shift)[0])
            if Data.log_target:
                raw_target = raw_target.map(np.exp)
            weights = raw_target.apply(self.weight_fun)

        self.regressor.fit(features_train, target_train, sample_weight=weights)

    def oob_score(self):
        """Print the forest's out-of-bag score."""
        print(f'Out-of-Bag Score: {self.regressor.oob_score_}')

    def score(self, Data: "CustomDataset"):
        """Print the out-of-bag score plus MSE and R² on `Data`'s validation split."""
        target_test = Data.target_test.rename(columns={0: 'target'})
        predictions = self.regressor.predict(Data.features_test)

        self.oob_score()
        mse = mean_squared_error(target_test, predictions)
        r2 = r2_score(target_test, predictions)
        print(f'Mean Squared Error: {mse}')
        print(f'R-squared: {r2}')

    def predict_train(self, Data: "CustomDataset"):
        """Return predictions for `Data.features_train`."""
        return self.regressor.predict(Data.features_train)

    def predict_test(self, Data: "CustomDataset"):
        """Return predictions for `Data.features_test` (the validation split)."""
        return self.regressor.predict(Data.features_test)

    def predict(self, Data):
        """Return predictions for an arbitrary feature matrix `Data`."""
        return self.regressor.predict(Data)
    

In [None]:
# Feature files that CustomDataset concatenates column-wise, in this order.
# The trailing number on each line is the author's note of that file's width.
# Commented-out entries are excluded from the model input.
filenames_features = [                      # dimensions
    utils.FILENAME_BRAND_OHE,               # 7
    utils.FILENAME_PUBLICATION_TIMESTAMP,   # 1
    utils.FILENAME_PUBLICATION_WEEKDAY_OHE, # 7
    utils.FILENAME_NUM_WORDS,               # 1
    utils.FILENAME_NUM_PARAGRAPH,           # 1
    utils.FILENAME_USER_NEEDS,              # 5
    utils.FILENAME_LDA_TOPICS,              # 36
    utils.FILENAME_IPTC_TOPICS_LEVEL_0,     # 17
    utils.FILENAME_IPTC_TOPICS_LEVEL_1,     # 98
    # utils.FILENAME_MAIN_SECTION_OHE,      # 118
    # utils.FILENAME_MAIN_SECTION_EMB,      # 300
    # utils.FILENAME_SUBSECTIONS_OHE,       # 959
    # utils.FILENAME_SUBSECTIONS_EMB,       # 300
    utils.FILENAME_MAIN_SECTION_SUBSECTIONS_MEAN_EMB, # 300
    # utils.FILENAME_TITLE_EMB,             # 768
    utils.FILENAME_AUTHOR_OHE,              # 2024
]

# Brands whose rows are stacked into one dataset. Uncomment exactly one of the
# alternatives below to restrict the run to a subset.
brands = utils.ALL_BRANDS
# brands = ['ad']
# brands = ['brabantsdagblad']
# brands = ['destentor']
# brands = ['nu']
# brands = ['parool']
# brands = ['trouw']
# brands = ['volkskrant']
# brands = ['ad', 'brabantsdagblad', 'destentor'] # papers with similar data
# brands = ['parool', 'trouw', 'volkskrant']      # papers with similar data

In [None]:
# Build the dataset with a log-transformed, uncentred target.
Data = CustomDataset(brands, filenames_features, log_target=True, center_target=False)
# Notebook display: (rows, columns) of the training features.
Data.features_train.shape

In [6]:
%%script false --no-raise-error
# Constant baseline: predict 24 for every validation row, expressed on the
# same scale as the target (log(24) when the target was log-transformed).
log_twentyfour = np.log(24)
baseline_value = log_twentyfour if Data.log_target else 24
target_pred = [baseline_value] * len(Data.target_test)

# print(mean_absolute_error(Data.target_test, target_pred))
for metric in (mean_squared_error, r2_score):
    print(metric(Data.target_test, target_pred))

In [None]:
regressor = CustomRegressor(percentile=100)

# Plot how often each target value occurs in the training data the
# regressor will actually use (after any down-sampling).
ground_truth_orderd = regressor.process_data(Data)['target'].value_counts().sort_index()
plt.figure(figsize=(12, 6))
plt.scatter(ground_truth_orderd.index, ground_truth_orderd.values)
plt.show()

# Plot the sample-weight curve over the range 0..600.
if regressor.weight_fun:
    plt.figure(figsize=(12, 6))
    xgrid = np.linspace(0, 600, 10000)
    ygrid = [regressor.weight_fun(x) for x in xgrid]
    plt.plot(xgrid, ygrid)
    plt.ylim(0)
    # The original referenced `plt.show` without calling it.
    plt.show()

In [None]:
# %%script false --no-raise-error # to skip cell, simple forests 

# Train with sample weighting switched on, then print the out-of-bag score
# plus MSE and R² on the validation split.
regressor.fit(Data, weighted=True)
print('regressor:')
regressor.score(Data)

In [9]:
%%script false --no-raise-error # gridsearch

# Imported here because the notebook's top-level imports do not provide it.
from sklearn.model_selection import GridSearchCV

# Base estimator; every parameter not listed in `param_list` keeps this value.
clf = RandomForestRegressor(
    criterion='squared_error',
    oob_score=True,
    n_jobs=-1,
    n_estimators=100,
    min_samples_split=100,
    min_samples_leaf=20,
    max_depth=40,
    max_samples=.9
    )

# Parameter grid to check.
param_list = {
    # 'n_estimators': [100, 250],
    # "min_samples_split": np.arange(2, 20, 5),
    # 'min_samples_leaf': [1, 20, 50, 150],
    "max_depth": [20, 60],
    # None draws as many samples as there are rows. The integer 1 used before
    # means "one sample per tree", not "100 %".
    'max_samples': [.5, .9, None],
}

# Exhaustive search over the grid, fitted on the training split.
search = GridSearchCV(
    clf, 
    param_list,
    n_jobs=-1,
    ).fit(Data.features_train, Data.target_train['target'])

# Alternative: randomized successive-halving search. Enabling it also needs
# `from scipy.stats import randint`,
# `from sklearn.experimental import enable_halving_search_cv` and
# `from sklearn.model_selection import HalvingRandomSearchCV`.
# param_distributions = {
#     # 'n_estimators': randint(150, 300),
#     "min_samples_split": randint(2, 100),
#     'min_samples_leaf': randint(1, 100),
#     "max_depth": randint(40, 60),
#     }

# checking using random search
# search = HalvingRandomSearchCV(
#     clf, 
#     param_distributions,
#     factor=2,
#     n_candidates=300,
#     random_state=0,
#     max_resources=300,
#     resource='n_estimators',
#     n_jobs=-1,
#     ).fit(Data.features_train, Data.target_train['target'])

print(search.best_params_)
# Score of the refitted best estimator on the validation split.
search.score(Data.features_test, Data.target_test)

In [11]:
# %%script false --no-raise-error # to skip cell, simple forests 

def _evaluate_split(features, target):
    """Return (ground truth, prediction, squared error) frames for one split.

    All three frames share the target's index and use the column label 0.
    """
    truth = pd.DataFrame(target).rename(columns={'target': 0})
    prediction = pd.DataFrame(regressor.predict(features), index=truth.index)
    return truth, prediction, (prediction - truth) ** 2


# NOTE: despite their names, the `abs_dif_*` frames hold squared differences.
y_train, y_pred_train, abs_dif_train = _evaluate_split(Data.features_train, Data.target_train)
y_val, y_pred_val, abs_dif_val = _evaluate_split(Data.features_test, Data.target_test)
y_test, y_pred_test, abs_dif_test = _evaluate_split(Data.features_final_test, Data.target_final_test)



In [None]:
print(regressor.regressor.get_params())

# Mean and standard deviation of the squared errors per split.
for split_name, squared_error in (
    ('train', abs_dif_train),
    ('val', abs_dif_val),
    ('test', abs_dif_test),
):
    print(f'{split_name} \ndiff mean: {squared_error.mean()[0]}, diff std: {squared_error.std()[0]}')

# Histograms of the squared errors for the training and final-test splits.
for squared_error, title in (
    (abs_dif_train, 'Difference of the prediction and real training data'),
    (abs_dif_test, 'Difference of the prediction and real test data'),
):
    plt.hist(squared_error, bins=50, alpha=0.5, color='b')
    plt.title(title)
    plt.show()

In [None]:
%%script false --no-raise-error # checking worst prediction
# Raw article tables of all selected brands, stacked and re-indexed 0..n-1.
brand_frames = [
    pd.read_csv(f'Data/uva-relevance-windows-{brand}.csv', sep=';')
    for brand in brands
]
df = pd.concat(brand_frames).reset_index()

# Ten test rows with the SMALLEST squared error (ascending sort); the
# commented alternatives sample from the largest errors instead.
most = abs_dif_test.sort_values(by=0, ascending=True)[:10]
# most = abs_dif_test.sort_values(ascending=False, by=0)[0:1000:100]
# most = abs_dif_test.sort_values(ascending=False, by=0)[0:20000:1000]

df = df.loc[most.index]
# validation = y_test.loc[most.index].apply(lambda x: np.exp(x))
# Exponentiate the predictions (assumes a log target; TODO confirm when
# log_target is False).
y_prediction = np.exp(y_pred_test.loc[most.index]).rename(columns={0: 'prediction'})

# Square root of the squared error, stored under the column name 'RMSE'.
most = np.sqrt(most)
most_values = pd.DataFrame(most.values, index=df.index, columns=['RMSE'])
df = pd.concat([df, y_prediction, most_values], axis=1)

print(len(abs_dif_test))
df

In [None]:
n_bins = 60

# Global matplotlib styling. Everything is set through rcParams so it applies
# to every figure created below, including the first one.
plt.rcParams['figure.facecolor'] = ('white') 
plt.rcParams['axes.facecolor'] = 'white'

plt.rcParams["figure.figsize"] = (6.8,7)
SMALL_SIZE = 12
MEDIUM_SIZE = 13
BIGGER_SIZE = 18
plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title


def _plot_overlaid_histograms(predicted, actual, split_label, xlabel, bin_count):
    """Overlay prediction and ground-truth histograms on shared bin edges."""
    bins = np.histogram(np.hstack((predicted, actual)), bins=bin_count)[1]
    plt.hist(predicted, bins=bins, alpha=0.5, color='b', label=f'{split_label} prediction')
    plt.hist(actual, bins=bins, alpha=0.5, color='r', label=f'{split_label} ground truth')
    plt.legend()
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()


# Axis label for the scale the model was trained on, derived from the flags
# instead of being hard-coded.
if Data.log_target and Data.center_target:
    model_xlabel = "Centred log of relevance window in hours"
elif Data.log_target:
    model_xlabel = "Log of relevance window in hours"
elif Data.center_target:
    model_xlabel = "Centred relevance window in hours"
else:
    model_xlabel = "Relevance window in hours"

_plot_overlaid_histograms(y_pred_train, y_train, 'Train', model_xlabel, n_bins)
_plot_overlaid_histograms(y_pred_test, y_test, 'Test', model_xlabel, n_bins)

# Second pair of plots: the same data on the "other" scale. Centring is
# undone first; then a log target is mapped back to hours, and an unlogged
# target is shown on the log scale.
y_pred_train_trans = y_pred_train
y_train_trans = y_train
y_pred_test_trans = y_pred_test
y_test_trans = y_test

if Data.center_target:
    # `Data.centering_shift` is a Series keyed by 'target', while the frames
    # here use column 0. Adding the Series would misalign on the labels, so
    # add the scalar instead.
    centering_shift = float(np.ravel(Data.centering_shift)[0])
    y_pred_train_trans = y_pred_train_trans + centering_shift
    y_train_trans = y_train_trans + centering_shift
    y_pred_test_trans = y_pred_test_trans + centering_shift
    y_test_trans = y_test_trans + centering_shift
if Data.log_target:
    y_pred_train_trans = np.exp(y_pred_train_trans)
    y_train_trans = np.exp(y_train_trans)
    y_pred_test_trans = np.exp(y_pred_test_trans)
    y_test_trans = np.exp(y_test_trans)
    transformed_xlabel = "Relevance window in hours"
else:
    y_pred_train_trans = np.log(y_pred_train_trans)
    y_train_trans = np.log(y_train_trans)
    y_pred_test_trans = np.log(y_pred_test_trans)
    y_test_trans = np.log(y_test_trans)
    transformed_xlabel = "Log of relevance window in hours"

_plot_overlaid_histograms(y_pred_train_trans, y_train_trans, 'train', transformed_xlabel, n_bins)
_plot_overlaid_histograms(y_pred_test_trans, y_test_trans, 'test', transformed_xlabel, n_bins)

In [19]:
from sklearn.tree import plot_tree

def plotting_tree(tree):
    """Draw a single decision tree taken from the random forest.

    Args:
        tree: An `(index, estimator)` pair as produced by `enumerate`;
            the index is only used in the figure title.
    """
    index, estimator = tree
    plt.figure(figsize=(20, 10))
    # Feature names are the column labels of the global training features.
    column_names = Data.features_train.columns.tolist()
    plot_tree(
        estimator,
        feature_names=column_names,
        filled=True,
        rounded=True,
        fontsize=10,
    )
    plt.title(f"Decision Tree {index} from Random Forest")
    plt.show()

In [None]:
# Plot only the first tree of the forest. Slicing to one element avoids
# enumerating every estimator and still does nothing for an empty forest.
for tree in enumerate(regressor.regressor.estimators_[:1]):
    plotting_tree(tree)