In [21]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

In [22]:
def clicks_weigher(clicks_df, max_clicks_per_session = 100):
    """
    (chain, user) -> interaction weight

    Counts the clicks of every (user_id, chain_id) pair and divides the counts
    by the largest one, so the most-clicked pair gets weight 1.

    Parameters
    ----------
    clicks_df : pandas.DataFrame
        One row per click. Must contain the columns 'user_id', 'session_id'
        and 'chain_id'; any other column is ignored. The frame is not modified.
    max_clicks_per_session : int
        Sessions with more clicks than this are considered fraudulent and all
        of their clicks are discarded before counting.

    Returns
    -------
    pandas.DataFrame
        Columns 'user_id', 'chain_id', 'weight'; one row per pair, ordered by
        the groupby keys.

    Raises
    ------
    RuntimeError
        If a resulting weight falls outside [0, 1].
    """
    print('Clicks weighter: use user clicks per chain as weight')
    clicks = clicks_df[['user_id', 'session_id', 'chain_id']]

    # erase sessions with fraud click count
    # value_counts() ignores missing session ids, so clicks without a session
    # never match `valid_sessions` and are discarded as well.
    clicks_per_session = clicks['session_id'].value_counts()
    valid_sessions = clicks_per_session.index[clicks_per_session <= max_clicks_per_session]
    clicks = clicks[clicks['session_id'].isin(valid_sessions)]

    # create table (user_id, chain_id) -> weight
    # size() counts rows per pair directly; no helper column has to be written
    # into a sliced frame (which is what triggers SettingWithCopyWarning).
    weights = (
        clicks.groupby(['user_id', 'chain_id'])
        .size()
        .reset_index(name='weight')
    )
    weights['weight'] = weights['weight'] / weights['weight'].max()

    if weights['weight'].min() < 0 or weights['weight'].max() > 1:
        raise RuntimeError("Invalid input: clicks weight must be in [0, 1]")
    return weights


def orders_weigher(orders_df, max_orders_per_chain = 50, success_status = 11):
    """
    (chain, user) -> interaction weight

    Counts the successful orders of every (user_id, chain_id) pair and divides
    the counts by the largest remaining one.

    Parameters
    ----------
    orders_df : pandas.DataFrame
        One row per order. Must contain the columns 'user_id', 'status_id' and
        'chain_id'. The frame is not modified and its index may contain
        repeated labels.
    max_orders_per_chain : int
        Pairs with more successful orders than this are removed from the result.
    success_status : int
        Value of 'status_id' that marks an order as successful.

    Returns
    -------
    pandas.DataFrame
        Columns 'user_id', 'chain_id', 'weight'; one row per kept pair.

    Raises
    ------
    RuntimeError
        If a resulting weight falls outside [0, 1].
    """
    print('Orders weighter: use successful user orders per chain as weight')
    # TODO: consider other statuses in weight function!

    # Select rows with a boolean mask instead of drop(<index labels>): dropping
    # by label removes every row that shares a label with an unsuccessful
    # order, which silently loses successful orders when the index is not unique.
    is_success = orders_df['status_id'] == success_status
    successful = orders_df.loc[is_success, ['user_id', 'chain_id']]

    weights = (
        successful.groupby(['user_id', 'chain_id'])
        .size()
        .reset_index(name='weight')
    )

    # Pairs above the order cap and rows with a negative user id are excluded
    # before normalising, so the maximum is taken over kept pairs only.
    keep = (weights['weight'] <= max_orders_per_chain) & (weights['user_id'] >= 0)
    weights = weights[keep].copy()
    weights['weight'] = weights['weight'] / weights['weight'].max()

    if weights['weight'].min() < 0 or weights['weight'].max() > 1:
        raise RuntimeError("Invalid input: orders weight must be in [0, 1]")
    return weights


class InteractionTable:
    
    """
    weight = alpha * click_weight + (1 - alpha) * orders_weight
    alpha in [0, 1], click_weight in [0, 1], orders_weight in [0, 1]
    so final weight in (0, 1]

    Note: when alpha is exactly 0 or 1 one of the two sources is scaled to 0,
    so pairs seen only by that source end up with weight 0.

    Attributes set by the constructor:
      alpha                     -- the mixing coefficient
      interaction_df            -- scaled click rows followed by scaled order
                                   rows (columns as returned by the weighers);
                                   a pair present in both sources has two rows
      chain_index, user_index   -- id -> 0-based position in sorted id order
      sparse_interaction_matrix -- chain x user scipy CSR matrix of weights
    """
    def __init__(self, clicks_getter, orders_getter, clicks_weigher, orders_weigher, alpha):
        """
        clicks_getter / orders_getter: zero-argument callables returning a DataFrame.
        clicks_weigher / orders_weigher: callables mapping that DataFrame to a frame
        with 'user_id', 'chain_id' and 'weight' columns.
        alpha: share of the click weight in the final weight, in [0, 1].

        Raises RuntimeError if alpha is outside [0, 1] (NaN included).
        """
        # Written as a chained comparison so that NaN is rejected as well;
        # `alpha < 0 or alpha > 1` is False for NaN.
        if not (0 <= alpha <= 1):
            raise RuntimeError("Invalid input: alpha must be in [0, 1]")
            
        self.alpha = alpha
        clicks_df = self.load(clicks_getter, clicks_weigher, 'Clicks')
        clicks_df = clicks_df.assign(weight=clicks_df['weight'] * self.alpha)
        
        orders_df = self.load(orders_getter, orders_weigher, 'Orders')
        orders_df = orders_df.assign(weight=orders_df['weight'] * (1 - self.alpha))
        
        self.interaction_df = pd.concat([clicks_df, orders_df], ignore_index=True)
        self.chain_index = self.get_uniqs_index(self.interaction_df.chain_id)
        self.user_index = self.get_uniqs_index(self.interaction_df.user_id)
        self.sparse_interaction_matrix = self.get_sparse_interaction_matrix(self.interaction_df)
    
    def load(self, getter, weigher, label):
        """
        Fetch a raw frame with `getter`, make sure it has a 'user_id' column
        (derived from 'customer_id' when absent), print its size before and
        after weighting, and return the result of `weigher`.
        """
        df = getter()
        if 'user_id' not in df.columns:
            # assign() builds a new frame, so the object handed out by the
            # getter is left untouched (no side effect on cached data, no
            # SettingWithCopyWarning when the getter returns a slice).
            df = df.assign(user_id=df['customer_id'].astype('int64'))
        print(f'{label} df loaded: size={len(df)},',
              f' uniq_users={len(df.user_id.unique())},',
              f' uniq_chains={len(df.chain_id.unique())}')
        df = weigher(df)
        print(f'{label} df weighted: size={len(df)},',
              f'uniq_users={len(df.user_id.unique())},',
              f'uniq_chains={len(df.chain_id.unique())}')
        return df
        
    def get_sparse_interaction_matrix(self, df):
        """
        https://stackoverflow.com/questions/31661604/efficiently-create-sparse-pivot-tables-in-pandas
        chain-user pivot sparse matrix: rows are chains, columns are users,
        both in sorted id order (the same order get_uniqs_index uses).
        Rows of `df` that share a (chain, user) pair are summed by csr_matrix.
        """
        chain_c = CategoricalDtype(sorted(df.chain_id.unique()), ordered=True)
        user_c = CategoricalDtype(sorted(df.user_id.unique()), ordered=True)
        
        row = df.chain_id.astype(chain_c).cat.codes
        col = df.user_id.astype(user_c).cat.codes
        # a code of -1 would mean an id that is not among the categories
        assert row.min() >= 0 and col.min() >= 0
        
        sparse_matrix = csr_matrix(
            (df.weight, (row, col)),
            shape=(chain_c.categories.size, user_c.categories.size),
        )
        return sparse_matrix
    
    def get_uniqs_index(self, df_column):
        """
        mapping ids (user or chain) -> uniq id starting from 0
        (positions follow the sorted order of the distinct ids)
        """
        uniqs = sorted(df_column.unique())
        return {value: position for position, value in enumerate(uniqs)}

In [23]:
# TODO: replace with filtered datasets

def get_clicks(path='../data/clicks/click.pkl', limit=100):
    """
    Load the clicks frame from a pickle file.

    path:  location of the pickle file.
    limit: number of leading rows to return; None returns the whole frame.
    """
    # NOTE: unpickling can execute arbitrary code -- only point this at trusted files.
    clicks = pd.read_pickle(path)
    if limit is None:
        return clicks
    return clicks.head(limit)

def get_orders(path='../data/orders/orders.pkl', limit=100):
    """
    Load the orders frame from a pickle file.

    path:  location of the pickle file.
    limit: number of leading rows to return; None returns the whole frame.
    """
    # NOTE: unpickling can execute arbitrary code -- only point this at trusted files.
    orders = pd.read_pickle(path)
    if limit is None:
        return orders
    return orders.head(limit)

In [25]:
# Build the interaction table, giving clicks and orders equal influence.
interactions = InteractionTable(
    clicks_getter=get_clicks,
    orders_getter=get_orders,
    clicks_weigher=clicks_weigher,
    orders_weigher=orders_weigher,
    alpha=0.5,
)

Clicks df loaded: size=100,  uniq_users=49,  uniq_chains=83
Clicks weighter: use user clicks per chain as weight
Clicks df weighted: size=88, uniq_users=49, uniq_chains=83
Orders df loaded: size=100,  uniq_users=100,  uniq_chains=78
Orders weighter: use successful user orders per chain as weight
Orders df weighted: size=100, uniq_users=100, uniq_chains=78


In [26]:
# Show the repr of the chain x user CSR matrix (shape, dtype, stored-element count).
interactions.sparse_interaction_matrix

<155x149 sparse matrix of type '<class 'numpy.float64'>'
	with 188 stored elements in Compressed Sparse Row format>

In [20]:
# Show the user_id -> 0-based position mapping (positions follow sorted user ids).
# NOTE(review): this displays the whole dict, one entry per distinct user.
interactions.user_index

{15855: 0,
 23861: 1,
 66488: 2,
 181963: 3,
 291982: 4,
 299414: 5,
 313305: 6,
 313577: 7,
 349556: 8,
 372465: 9,
 418672: 10,
 481760: 11,
 511049: 12,
 522380: 13,
 567284: 14,
 592923: 15,
 763243: 16,
 908906: 17,
 997069: 18,
 1090032: 19,
 1127443: 20,
 1127918: 21,
 1150303: 22,
 1222606: 23,
 1278734: 24,
 1314993: 25,
 1321269: 26,
 1390611: 27,
 1408463: 28,
 1523036: 29,
 1529256: 30,
 1607567: 31,
 1611397: 32,
 1620004: 33,
 1704053: 34,
 1750546: 35,
 1801957: 36,
 1823083: 37,
 1865693: 38,
 1870957: 39,
 1986981: 40,
 2072901: 41,
 2088451: 42,
 2132931: 43,
 2222406: 44,
 2241515: 45,
 2254256: 46,
 2254551: 47,
 2329998: 48,
 2330749: 49,
 2388618: 50,
 2425938: 51,
 2464236: 52,
 2514428: 53,
 2543320: 54,
 2551572: 55,
 2592128: 56,
 2667232: 57,
 2675714: 58,
 2699630: 59,
 2715642: 60,
 2723474: 61,
 2733365: 62,
 2735757: 63,
 2752760: 64,
 2772791: 65,
 2801407: 66,
 2816615: 67,
 2870674: 68,
 2887180: 69,
 2888318: 70,
 2897592: 71,
 2899448: 72,
 2900808: 