In [1]:
import os
import json
import pandas as pd
import scipy as sp
from sklearn.preprocessing import LabelEncoder 
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import numpy as np
import datetime
import pickle
from pprint import pprint
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import seaborn as sns
import gc

In [2]:
class CategoricalFeature:
    """Embedding lookup table for a single categorical column.

    The table has ``vocabulary_size + 1`` rows: one per known category plus
    one extra row reserved for unknown values.
    """

    def __init__(self, name, vocabulary_size, embedding_size) -> None:
        self.name = name
        self.vocabulary_size = vocabulary_size
        # The additional row (on top of the vocabulary) is kept for unknown categories.
        self.lookup_table = nn.Embedding(vocabulary_size + 1, embedding_size)

    def to_embeddings(self, idxs):
        """Return the embedding rows for ``idxs`` (tensor or anything ``torch.tensor`` accepts).

        The lookup runs under ``torch.no_grad()``, so the result carries no gradient.
        """
        indices = idxs if torch.is_tensor(idxs) else torch.tensor(idxs)
        with torch.no_grad():
            return self.lookup_table(indices)

class FeaturesEncoder:
    """Turns DataFrame rows into dense feature tensors.

    Numeric columns are passed through as ``float32``; every categorical
    column is replaced by the embedding produced by its ``CategoricalFeature``
    lookup table. Instances are cached on disk under ``./lookup_tables`` so
    that the (randomly initialised) embedding tables stay the same between runs.
    """

    @staticmethod
    def get_instance(name: str, features_numeric, features_categoric = None):
        """Load the cached encoder for this feature set, or create and cache it.

        Args:
            name: Prefix of the cache file (e.g. ``'user'`` or ``'item'``).
            features_numeric: List of numeric column names.
            features_categoric: Optional mapping
                ``column -> (vocabulary_size, embedding_size)``.

        Returns:
            The ``FeaturesEncoder`` loaded from, or just written to, the cache.
        """
        if features_categoric is None:
            filepath = os.path.join('.', 'lookup_tables', f'{name}-{"-".join(features_numeric)}.pickle')
        else:
            filepath = os.path.join('.', 'lookup_tables', f'{name}-{"-".join(features_numeric)}-{"-".join(features_categoric)}.pickle')
        if os.path.exists(filepath):
            # NOTE(security): pickle.load can execute arbitrary code; only load cache
            # files that were produced locally by this notebook.
            # NOTE(review): the cache key contains only the feature names, not the
            # vocabulary/embedding sizes, so a stale file is reused if those change.
            with open(filepath, 'rb') as handle:
                encoder = pickle.load(handle)
            print(f'{name.capitalize()} loaded from {filepath}')
        else:
            encoder = FeaturesEncoder(features_numeric, features_categoric)
            # The cache directory does not exist on a fresh checkout; create it
            # instead of failing in open().
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'wb') as handle:
                pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)
            print(f'{name.capitalize()} initialized and saved to {filepath}')
        return encoder

    def __init__(self, features_numeric, features_categoric ) -> None:
        """Store the numeric column names and build one lookup table per categorical column.

        Args:
            features_numeric: List of numeric column names.
            features_categoric: ``None`` or a mapping
                ``column -> (vocabulary_size, embedding_size)``.
        """
        self.features_numeric = features_numeric
        if features_categoric is None:
            self.features_categoric = []
            self.features_categoric_lookup_table = dict()
        else:
            self.features_categoric = list(features_categoric.keys())
            self.features_categoric_lookup_table = {
                feature_name: CategoricalFeature(
                    name=feature_name,
                    vocabulary_size=vocabulary_size,
                    embedding_size=embedding_size,
                )
                for feature_name, (vocabulary_size, embedding_size) in features_categoric.items()
            }

    def to_emebeddings(self, df: pd.DataFrame):
        """Encode ``df`` as one 2-D float tensor: numeric columns, then each categorical embedding.

        The input frame is left untouched (earlier versions dropped the consumed
        columns in place, which made a second call on the same frame fail).

        Args:
            df: Frame containing every configured numeric and categorical column.

        Returns:
            Tensor with one row per row of ``df``; the numeric features come
            first, followed by the embeddings in ``features_categoric`` order.
        """
        numeric_features = torch.from_numpy(df[self.features_numeric].to_numpy()).float()
        categoric_features = []
        for feature in self.features_categoric:
            encoder = self.features_categoric_lookup_table[feature]
            # nn.Embedding only accepts integer index tensors (int32/int64);
            # .long() is a no-op for int64 columns and widens narrower ints.
            indices = torch.from_numpy(df[feature].to_numpy()).long()
            categoric_features.append(encoder.to_embeddings(indices))
        # concat (numeric, ...categoric)
        return torch.cat((numeric_features, *categoric_features), 1)

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    to_embeddings = to_emebeddings

In [10]:
class TrainTestMaker:
    """Loads the normalized training data and turns it into model-ready tensors.

    The data is split by time: everything before midnight three days prior to
    the latest page view is the train set, the remainder is the test set,
    which is further divided into users seen in training ("hot") and users
    that were not ("cold").
    """

    def __init__(self) -> None:
        """Read all training CSVs and load (or create) the user/item feature encoders."""
        data = TrainTestMaker.read_all_data()
        self.data: pd.DataFrame = data
        user_features_numeric, item_features_numeric, item_features_categoric = TrainTestMaker.get_features_defs(data)
        self.user_encoder = FeaturesEncoder.get_instance('user', user_features_numeric)
        self.item_encoder = FeaturesEncoder.get_instance('item', item_features_numeric, item_features_categoric)

    @staticmethod
    def read_all_data() -> pd.DataFrame:
        """Concatenate every CSV found in the sub-directories of ``normalized_train_data``.

        Files are read in sorted path order, the ``'Unnamed: 0'`` column written
        by ``to_csv`` is dropped, and the result gets a fresh 0..n-1 index.
        """
        root = 'normalized_train_data'
        paths = []
        for dirname in os.listdir(root):
            subdir = f'{root}/{dirname}'
            if not os.path.isdir(subdir):
                # Stray files (e.g. .DS_Store) would make os.listdir raise.
                continue
            paths += [f'{subdir}/{filename}' for filename in os.listdir(subdir)]
        paths.sort()
        frames = []
        for filepath in tqdm(paths, desc='Loading data'):
            frames.append(pd.read_csv(filepath).drop(columns=['Unnamed: 0']))
        return pd.concat(frames, ignore_index=True)

    @staticmethod
    def load_encoders():
        """Load the pickled per-column encoders from ``./label_encoders``.

        Returns:
            Dict mapping column name to the unpickled encoder object
            (presumably fitted sklearn ``LabelEncoder`` instances — confirm
            against the notebook that wrote them).
        """
        features = [
            'user_id_hash',
            'target_id_hash',
            'syndicator_id_hash',
            'campaign_id_hash',
            'target_item_taxonomy',
            'placement_id_hash',
            'publisher_id_hash',
            'source_id_hash',
            'source_item_type',
            'browser_platform',
            'country_code',
            'region',
        ]
        column_encoders = dict()
        for feature in tqdm(features, desc='Loading encoders'):
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # encoder files produced by a trusted run.
            with open(f'./label_encoders/{feature}.pickle', 'rb') as handle:
                column_encoders[feature] = pickle.load(handle)
        return column_encoders

    @staticmethod
    def get_features_defs(data):
        """Return ``(user_numeric, item_numeric, item_categoric)`` feature definitions.

        ``item_categoric`` maps each column to ``(vocabulary size, embedding size)``,
        where the vocabulary size is the number of distinct values in ``data``.
        """
        user_features_numeric = [
            'page_view_start_time',
            'user_recs',
            'user_clicks',
            'user_target_recs',
        ]

        item_features_numeric = [
            'page_view_start_time',
            'empiric_calibrated_recs',
            'empiric_clicks',
        ]

        # column -> embedding size
        embedding_sizes = {
            'syndicator_id_hash': 32,
            'campaign_id_hash': 32,
            'placement_id_hash': 32,
            'target_item_taxonomy': 8,
        }
        item_features_categoric = {
            # (vocabulary size, embedding size)
            column: (data[column].nunique(), embedding_size)
            for column, embedding_size in embedding_sizes.items()
        }
        return user_features_numeric, item_features_numeric, item_features_categoric

    def _get_split(self):
        """Return the train/test boundary as a millisecond timestamp.

        The boundary is midnight, three days before the latest
        ``page_view_start_time``. ``datetime.fromtimestamp`` without a timezone
        works in the machine's local time, so the exact boundary depends on
        the local timezone.
        """
        max_timestamp = self.data['page_view_start_time'].max()
        latest = datetime.datetime.fromtimestamp(max_timestamp/1000)
        three_days_ago = (latest - datetime.timedelta(days=3)).replace(hour=0, minute=0, second=0, microsecond=0)
        return int(datetime.datetime.timestamp(three_days_ago) * 1000)

    def get_trainset_df(self):
        """Return the rows strictly before the split timestamp, re-indexed from 0."""
        threshold = self._get_split()
        # reset_index returns a new frame; no in-place mutation of a filtered slice.
        return self.data[self.data['page_view_start_time'] < threshold].reset_index(drop=True)

    def df_to_embeddings(self, df, batch_size: int = 1_000_000):
        """Encode ``df`` into user/item tensors (and one-hot labels when available).

        Args:
            df: Frame holding every column required by both encoders and,
                optionally, an ``is_click`` column with 0/1 values.
            batch_size: Number of rows encoded per step; bounds the size of the
                temporary copies.

        Returns:
            Dict with ``'user'`` and ``'item'`` tensors (one row per row of
            ``df``) and, if ``is_click`` is present, a float ``'label'`` tensor
            of shape ``(len(df), 2)`` with the one-hot encoded click flag.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f'batch_size must be positive, got {batch_size}')
        user_columns = self.user_encoder.features_numeric + self.user_encoder.features_categoric
        item_columns = self.item_encoder.features_numeric + self.item_encoder.features_categoric
        user_embeddings = []
        item_embeddings = []
        # max(..., 1): an empty frame still goes through one (empty) batch so the
        # result has the right number of columns instead of torch.cat([]) failing.
        for start in range(0, max(len(df.index), 1), batch_size):
            batch = df.iloc[start:start + batch_size]
            # The encoders get their own deep copy, so the caller's frame is
            # never affected by anything they do to it.
            user_features_df = batch[user_columns].copy(deep=True)
            user_embeddings.append(self.user_encoder.to_emebeddings(user_features_df))
            del user_features_df
            item_features_df = batch[item_columns].copy(deep=True)
            item_embeddings.append(self.item_encoder.to_emebeddings(item_features_df))
            del item_features_df, batch
        data = {
            'user': torch.cat(user_embeddings),
            'item': torch.cat(item_embeddings),
        }
        del user_embeddings, item_embeddings
        if 'is_click' in df.columns:
            labels = df['is_click'].to_numpy().astype('int')
            # Build the one-hot matrix directly in float32 (the dtype of the
            # returned tensor) rather than float64 followed by a cast.
            labels_one_hot = np.zeros((labels.shape[0], 2), dtype=np.float32)
            labels_one_hot[np.arange(labels.shape[0]), labels] = 1
            del labels
            data['label'] = torch.from_numpy(labels_one_hot)
        return data

    def get_test_set_df(self):
        """Return ``(hot_users_df, cold_users_df)`` for rows at or after the split.

        "Hot" rows belong to users that also appear before the split; "cold"
        rows belong to users that do not. Both frames are re-indexed from 0.
        """
        threshold = self._get_split()
        timestamps = self.data['page_view_start_time']
        train_user_ids = self.data.loc[timestamps < threshold, 'user_id_hash']
        last_three_days = self.data[timestamps >= threshold]
        not_cold_users_mask = last_three_days['user_id_hash'].isin(train_user_ids)
        test_set_hot_users = last_three_days[not_cold_users_mask].reset_index(drop=True)
        test_set_cold_users = last_three_days[~not_cold_users_mask].reset_index(drop=True)
        return test_set_hot_users, test_set_cold_users

    def get_train_set(self, max_idx = None):
        """Return the encoded train set, optionally limited to the first ``max_idx`` rows."""
        print('Extracting Train set')
        train_set_df = self.get_trainset_df()
        print('Extraction completed')
        if max_idx is not None:
            train_set_df = train_set_df.iloc[: max_idx].copy()
        print('Converting to embeddings')
        train_set = self.df_to_embeddings(train_set_df)
        print('Convertion completed')
        return train_set

    def get_test_set(self):
        """Return the encoded ``(hot_users, cold_users)`` test sets."""
        test_set_hot_users_df, test_set_cold_users_df = self.get_test_set_df()
        test_set_hot_users = self.df_to_embeddings(test_set_hot_users_df)
        del test_set_hot_users_df
        test_set_cold_users = self.df_to_embeddings(test_set_cold_users_df)
        del test_set_cold_users_df
        return test_set_hot_users, test_set_cold_users

In [11]:
# Shared data maker used by the cells below: constructing it reads every CSV
# under normalized_train_data and loads (or creates) the cached user/item encoders.
data_maker = TrainTestMaker()
# train_set = data_maker.get_train_set(3_000_000)

Loading data:   0%|          | 0/69 [00:00<?, ?it/s]

User loaded from ./lookup_tables/user-page_view_start_time-user_recs-user_clicks-user_target_recs.pickle
Item loaded from ./lookup_tables/item-page_view_start_time-empiric_calibrated_recs-empiric_clicks-syndicator_id_hash-campaign_id_hash-placement_id_hash-target_item_taxonomy.pickle


In [12]:
class Model(nn.Module):
    """Two-tower click model.

    User and item features go through separate MLP towers; the user output is
    right-padded with ones up to the item tower width, multiplied element-wise
    with the item output and fed to a small classifier producing two scores.

    Args:
        user_dim: Width of the user feature vector (default 4: the four numeric
            user features).
        item_dim: Width of the item feature vector (default 107: three numeric
            features plus embeddings of size 32 + 32 + 32 + 8).
        hidden_dim: Width of the item tower / classifier input (default 64).

    Raises:
        ValueError: If ``user_dim`` exceeds ``hidden_dim`` (the user output
            could not be padded up to the item tower width).
    """

    def __init__(self, user_dim: int = 4, item_dim: int = 107, hidden_dim: int = 64):
        super().__init__()
        if user_dim > hidden_dim:
            raise ValueError(f'user_dim ({user_dim}) must not exceed hidden_dim ({hidden_dim})')

        self.user_tower = nn.Sequential(
            nn.Linear(user_dim, user_dim),
            nn.ReLU(),
            nn.Linear(user_dim, user_dim),
            nn.ReLU(),
            nn.Linear(user_dim, user_dim),
            nn.ReLU(),
        )

        self.item_tower = nn.Sequential(
            nn.Linear(item_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )
        # NOTE(review): the trailing ReLU clamps both output scores to >= 0 before
        # the softmax is applied by the caller; it is kept so that the behaviour
        # stays identical to already-trained checkpoints — confirm it is intended.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 2),
            nn.ReLU()
        )
        # Not applied in forward(); callers use it to turn the scores into probabilities.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the two (non-negative) class scores for a ``(users, items)`` batch pair."""
        users, items = x
        users = self.user_tower(users)
        items = self.item_tower(items)
        # Pad the user vector with ones up to the item width so the element-wise
        # product keeps the remaining item activations unchanged. The width is
        # derived from the tensors (60 with the default sizes) rather than stored
        # on the instance, so models pickled before this change still work.
        pad_width = items.shape[-1] - users.shape[-1]
        users = nn.functional.pad(users, (0, pad_width), value=1)
        aggregation = users * items
        return self.classifier(aggregation)


In [13]:
def normalize_test_set(test_path: str = './test_file.csv'):
    """Read the raw test file and replace categorical values by their encoder codes.

    Missing ``country_code`` / ``region`` values are filled with ``'Null'``,
    the ``user_id_hash`` and ``target_id_hash`` columns are dropped, and every
    remaining column that has a stored encoder is mapped to integer codes.
    Values the encoder rejects with ``ValueError`` get the code
    ``len(encoder.classes_)``, i.e. one past the last known class.

    Args:
        test_path: CSV file to normalize.

    Returns:
        The normalized ``DataFrame``.
    """
    # load_encoders is a staticmethod, so the global data_maker instance (and the
    # full training data it loads) is not needed here.
    encoders = TrainTestMaker.load_encoders()
    test_df = pd.read_csv(test_path)
    for column in ('country_code', 'region'):
        # Assign the result back: `df[col].fillna(..., inplace=True)` acts on a
        # temporary Series and leaves the frame unchanged under pandas copy-on-write.
        test_df[column] = test_df[column].fillna('Null')
    test_df = test_df.drop(columns=['user_id_hash', 'target_id_hash'])
    for column, encoder in tqdm(encoders.items(), desc='Normalizing test file'):
        if column not in test_df.columns:
            continue
        print(f'Encoding {column}')
        # Code used for values the encoder has never seen.
        unknown_code = encoder.classes_.shape[0]
        value_to_code = dict()
        # Encode each distinct value once, then map the whole column.
        for value in test_df[column].unique():
            try:
                value_to_code[value] = encoder.transform([value])[0]
            except ValueError:
                value_to_code[value] = unknown_code
        test_df[column] = test_df[column].apply(lambda value: value_to_code[value])
    return test_df
# test_df = normalize_test_set()
# test_df.to_csv('./test_file_normalized.csv')

In [14]:
class TorchDataset(Dataset):
    """Dataset view over pre-computed ``'user'`` / ``'item'`` tensors.

    ``idxs`` selects which rows of ``data`` are exposed, and in which order;
    item ``i`` of the dataset is row ``idxs[i]`` of both tensors.
    """

    def __init__(self, idxs, data):
        super().__init__()
        self.data = data
        self.idxs = idxs
        # Number of exposed samples equals the number of selected row indices.
        self.size = idxs.shape[0]

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        user_table, item_table = self.data['user'], self.data['item']
        # Translate the dataset position into a row of the underlying tensors.
        row = self.idxs[idx]
        return user_table[row], item_table[row]

class TestsetEval:
    """Runs a saved model over the normalized test file."""

    def __init__(self, model_filepath) -> None:
        """Load the model from ``model_filepath`` onto the GPU if available, else the CPU.

        NOTE(review): the file is expected to hold a whole pickled module (the
        result is used as a model directly), which requires the ``Model`` class
        to be defined before loading; newer PyTorch versions that default
        ``torch.load`` to ``weights_only=True`` may refuse such files — confirm
        against the installed version. Only load checkpoints from trusted sources.
        """
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = torch.load(model_filepath, map_location=self.device)
        model = model.to(self.device)
        model.eval()
        self.model = model

    @staticmethod
    def load_normalized_test_set():
        """Read ``./test_file_normalized.csv`` and drop the index column written by ``to_csv``."""
        test_df = pd.read_csv('./test_file_normalized.csv')
        return test_df.drop(columns=['Unnamed: 0'])

    def predict(self, batch_size: int = 64):
        """Score every row of the normalized test set.

        Relies on the module-level ``data_maker`` to turn the frame into
        user/item tensors.

        Args:
            batch_size: Number of rows per forward pass.

        Returns:
            Tuple ``(predictions, probabilities)`` of numpy arrays: the argmax
            class per row and the softmax output per row (two columns).
        """
        print('Loading normalized testset')
        test_df = TestsetEval.load_normalized_test_set()
        print('Generating embeddings')
        dataset = data_maker.df_to_embeddings(test_df)
        print('Embeddings generated')
        # Identity index: evaluate every row, in file order.
        test_dataset = TorchDataset(torch.arange(dataset['user'].size(0)), dataset)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=1, shuffle=False)
        self.model.eval()
        predicted_classes = []
        class_probabilities = []
        with torch.no_grad():
            for users, items in tqdm(test_loader, total=len(test_loader), desc='Prediction'):
                users, items = users.to(self.device), items.to(self.device)
                scores = self.model((users, items))
                # The model returns raw scores; its softmax module turns them into probabilities.
                batch_probabilities = self.model.softmax(scores)
                predicted_classes.append(batch_probabilities.argmax(dim=1).cpu())
                class_probabilities.append(batch_probabilities.cpu())
        return torch.cat(predicted_classes).numpy(), torch.cat(class_probabilities).numpy()
        
# Load the final checkpoint and score the normalized test file.
# `predictions` holds the argmax class per row, `probabilities` the softmax output per row.
testset_eval = TestsetEval('./models/torch-models-2024-02-16T17-36-50/model-final.pt')
predictions, probabilities = testset_eval.predict()

Loading normalized testset
Generating embeddings
Embeddings generated


Prediction:   0%|          | 0/6946 [00:00<?, ?it/s]

In [19]:
def report_submission(probs=None, output_path='my_submission.csv'):
    """Write the submission CSV and return its first 15 rows.

    The file has two columns: ``Id`` (row number, starting at 0) and
    ``Predicted`` (the second column of ``probs``, i.e. the score of class 1).

    Args:
        probs: 2-D array of per-class probabilities. When omitted, the
            module-level ``probabilities`` produced by the prediction cell is used.
        output_path: Where to write the CSV.

    Returns:
        ``DataFrame`` with the first 15 rows, read back from the written file.
    """
    if probs is None:
        # Backward-compatible default: fall back to the notebook-level result.
        probs = probabilities
    click_probability = np.asarray(probs)[:, 1]
    submission = pd.DataFrame({
        'Id': np.arange(len(click_probability)),
        'Predicted': click_probability,
    })
    submission.to_csv(output_path, index=False)
    # Read the file back so the preview shows exactly what was written.
    return pd.read_csv(output_path).head(15)
# Write my_submission.csv from the class-1 probabilities and display its first 15 rows.
report_submission()

array([0., 0.], dtype=float32)