In [None]:
import os
import itertools

import pandas as pd
import numpy as np


from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from rkcompetition.utils.data_preprocessing import *

In [None]:
def get_dict_combinations(dictionary):
    """Expand a dict of candidate values into every single-choice combination.

    Parameters
    ----------
    dictionary : dict
        Maps each key to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dict per element of the Cartesian product of the value
        iterables, each with the same keys as ``dictionary``. The order
        follows ``itertools.product`` (the last key varies fastest). An empty
        ``dictionary`` yields ``[{}]``.
    """
    # Use the argument itself; the previous version read the global `params`
    # and therefore ignored whatever was passed in.
    keys = list(dictionary)
    return [
        dict(zip(keys, combination))
        for combination in itertools.product(*dictionary.values())
    ]


def finetune_embeddings(df, params):
    """Cross-validate a classifier for every combination of clustering parameters.

    For each combination generated from ``params``, ``df`` is passed through
    ``transform_event_to_cluster``, the result is left-joined with the
    module-level ``train`` frame, and a min-max-scaled, class-balanced
    logistic regression is cross-validated on the joined data.

    Parameters
    ----------
    df
        First positional argument for ``transform_event_to_cluster``. The
        index of the transformed output is matched against
        ``train['contract_id']``.
    params : dict
        Maps keyword-argument names of ``transform_event_to_cluster`` to
        iterables of candidate values; every combination is evaluated.

    Returns
    -------
    pandas.DataFrame
        One row per combination, holding the parameter values and ``f1``
        (``GridSearchCV.best_score_`` with ``scoring='f1_macro'``).

    Notes
    -----
    Reads the global ``train``. It must have a ``contract_id`` column, and the
    merged frame must contain a ``blocked`` column (presumably supplied by
    ``train`` -- confirm).
    """
    result_columns = ['embedding_components', 'n_clusters', 'outlier_strength',
                      'min_number_of_clicks_for_user', 'perplexity', 'f1']
    # Keep any additional parameter names as extra columns after the standard ones.
    result_columns += [key for key in params if key not in result_columns]

    # Collect one record per combination and build the frame once at the end.
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []

    for param in get_dict_combinations(params):
        transformed_data = transform_event_to_cluster(df, **param)
        # merge data to obtain dataset
        transformed_data = transformed_data.merge(train, left_index=True, right_on='contract_id', how='left')
        # After the left merge, rows with no match in `train` have NaN in
        # `blocked`; NaN >= 0 is False, so only labelled rows survive.
        transformed_data = transformed_data.loc[transformed_data.blocked >= 0]
        transformed_data = transformed_data.drop(['contract_id'], axis=1)
        # Presumably so that all feature names share one type for scikit-learn.
        transformed_data.columns = [str(col) for col in transformed_data.columns]
        # dataset
        y = transformed_data.pop('blocked')
        x = transformed_data

        lr = LogisticRegression(max_iter=10000, class_weight='balanced', C=0.5)
        scaler = MinMaxScaler()
        pipe = Pipeline(steps=[("scaler", scaler), ("logistic", lr)])

        # The parameter grid is empty, so GridSearchCV serves only as a
        # cross-validation harness for the single fixed pipeline.
        clf = GridSearchCV(pipe, {}, scoring='f1_macro')
        clf.fit(x, y)

        record = dict(param)
        record['f1'] = clf.best_score_
        records.append(record)

    return pd.DataFrame(records, columns=result_columns)

In [None]:
# Load all raw inputs in one call; read_raw_data returns six objects that are
# unpacked here into notebook-level names used by the cells below.
(
    inner_actions,
    outer_actions,
    type_contract,
    train,
    sample,
    outer_actions_in_train,
) = read_raw_data('../data/raw/')

In [None]:
# Keep only the outer actions whose event_type also appears in
# outer_actions_in_train, then reshape the filtered frame with long_to_wide.
outer_actions = outer_actions.loc[
    outer_actions['event_type'].isin(outer_actions_in_train['event_type'])
]
outer_actions_long = long_to_wide(outer_actions)


In [None]:
# Candidate values for each tunable argument; finetune_embeddings evaluates
# every combination of them.
params = dict(
    embedding_components=np.arange(2, 6),
    n_clusters=np.arange(10, 30, 5),
    outlier_strength=[0, 0.1, 0.15],
    min_number_of_clicks_for_user=[100, 150, 200],
    perplexity=[3, 5, 7, 10],
)

optimization = 'inner'

# NOTE(review): `df` and `second_df` are chosen here but are not passed to the
# call below, which always receives `outer_actions_long` -- confirm intent.
if optimization != 'outer':
    df, second_df = inner_actions, None
else:
    df, second_df = outer_actions_in_train, outer_actions

finetune_embeddings(outer_actions_long, params)