In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import pymoo
from utils import generate_data

from datetime import datetime, timedelta


DB_DATETIME = '%Y-%m-%d %H:%M:%S'

In [2]:
%load_ext cython

In [20]:
%%cython -a
from tqdm import tqdm
from datetime import datetime, timedelta
import numpy as np
DB_DATETIME = '%Y-%m-%d %H:%M:%S'
def cython_simulate_filtering(data):
    
    #cdef int total_completes = total_revenue = total_clicks = total_terms = total_nqs = 0
    
    cdef int total_completes = 0
    cdef int total_revenue = 0
    cdef int total_clicks = 0
    cdef int total_terms = 0
    cdef int total_nqs = 0
    
    # window to look at for nqs
    time_window = timedelta(days=7)
    
    # window to block user
    block_window = timedelta(days=7)
    
    for group_user in tqdm(list(data.keys())):    
        
        block_time = None
        is_filtered = False
        click_cache = {}
        
        for click in data[group_user]:
            
            click_time = datetime.strptime(click['click_timestamp'], DB_DATETIME)
            
            if is_filtered:
                
                if block_time + block_window < click_time:
                    is_filtered = False
                    block_time = None
                    
                else:
                    continue
                    
            total_clicks += 1
            
            # record metrics
            if click['result'] == 2:
                total_completes += 1
                total_revenue += click['cpi_cents']
                
            if click['term_timestamp'] != 0:
                total_terms += 1
                
            if click['result'] == 3:
                total_nqs += 1
                click_cache[click_time] = 1
                
            else:
                click_cache[click_time] = 0
              
            
            # remove clicks from cache when beyond time window
            to_delete = []
            for time in click_cache.keys():
                if click_time - time_window > time:
                    to_delete.append(time)
            
            for time in to_delete:
                del click_cache[time]

            nq_count = sum(list(click_cache.values()))
            click_count = len(click_cache)
            nq_percent = round(100*nq_count/click_count)
            
            if nq_count > 10 and nq_percent > 3:
                is_filtered = True
                block_time = click_time
            
    print('total revenue: ${}'.format(round(total_revenue/100, 2)))
    print('total completes: {}'.format(total_completes))
    print('total clicks: {}'.format(total_clicks))
    print('epc: {}%'.format(round(100*total_completes/total_clicks, 2)))
    print('total nqs: {}'.format(total_nqs))
    print('nq rate: {}%'.format(round(100*total_nqs/total_clicks, 2)))

In [10]:
#df_small = pd.read_csv('./data/nqs-2021-10-03.csv')
#df_big = pd.read_csv('./data/nq-dataset-2021.csv')

data = generate_data(path='./data/nq-dataset-2021.csv', user_pct=0.2)

(17398595, 6)
(209872,)


In [23]:
# define filter sim
def simulate_filtering(data, x=[7, 7, 5, 20], out={}):
    
    # define simulation a params
    days_to_look = x[0]
    days_to_block = x[1]
    min_click_to_block = x[3]
    min_nq_pct_to_block = x[4]
    
    # define metrics variables
    total_completes = 0
    total_revenue = 0
    total_clicks = 0
    total_terms = 0
    total_nqs = 0
    
    # window to look at for nqs
    time_window = timedelta(days=days_to_look)
    
    # window to block user
    block_window = timedelta(days=days_to_block)
    
    for group_user in tqdm(list(data.keys())):    
        
        block_time = None
        is_filtered = False
        click_cache = {}
        
        for click in data[group_user]:
            
            click_time = datetime.strptime(click['click_timestamp'], DB_DATETIME)
            
            if is_filtered:
                
                if block_time + block_window < click_time:
                    is_filtered = False
                    block_time = None
                    
                else:
                    continue
                    
            total_clicks += 1
            
            # record metrics
            if click['result'] == 2:
                total_completes += 1
                total_revenue += click['cpi_cents']
                
            if click['term_timestamp'] != 0:
                total_terms += 1
                
            if click['result'] == 3:
                total_nqs += 1
                click_cache[click_time] = 1
                
            else:
                click_cache[click_time] = 0
              
            
            # remove clicks from cache when beyond time window
            to_delete = []
            for time in click_cache.keys():
                if click_time - time_window > time:
                    to_delete.append(time)
            
            for time in to_delete:
                del click_cache[time]

            nq_count = sum(click_cache.values())
            click_count = len(click_cache)
            nq_percent = round(100*nq_count/click_count)
            
            if nq_percent > min_nq_pct_to_block and click_count > min_click_to_block:
                is_filtered = True
                block_time = click_time
            
    print('total revenue: ${}'.format(round(total_revenue/100, 2)))
    print('total completes: {}'.format(total_completes))
    print('total clicks: {}'.format(total_clicks))
    print('epc: {}%'.format(round(100*total_completes/total_clicks, 2)))
    print('total nqs: {}'.format(total_nqs))
    print('nq rate: {}%'.format(round(100*total_nqs/total_clicks, 2)))
    
    ir = round(total_completes/total_clicks, 4)
    
    out["F"] = [revenue, ir, total_nqs]
    out["G"] = [g1, g2]
    

In [12]:
simulate_filtering(data)

  0%|          | 0/209872 [00:00<?, ?it/s]

total revenue: $897338.67
total completes: 576632
total clicks: 2744714
epc: 21.01%
total nqs: 674236
nq rate: 24.56%


In [None]:
%timeit simulate_filtering(data)

In [21]:
cython_simulate_filtering(data)

100%|██████████| 209872/209872 [00:33<00:00, 6239.33it/s] 

total revenue: $897338.67
total completes: 576632
total clicks: 2744714
epc: 21.01%
total nqs: 674236
nq rate: 24.56%





In [81]:
from pymoo.core.problem import ElementwiseProblem


# define problem
class FilteringNQs(ElementwiseProblem):

    def __init__(self):
        super().__init__(n_var=2,
                         n_obj=2,
                         n_constr=2,
                         xl=np.array([-2,-2]),
                         xu=np.array([2,2])
                         data = None
                        )
        
        if data == None
            self.data = generate_data(path='./data/nq-dataset-2021.csv', user_pct=0.2)
        
        else:
            self.data = data

    def _evaluate(self, x, out, *args, **kwargs):
        

        out["F"] = [f1, f2]
        out["G"] = [g1, g2]


problem = FilteringNQs()

NameError: name 'MyProblem' is not defined

In [29]:
import numpy as np
from pymoo.core.problem import ElementwiseProblem

class MyProblem(ElementwiseProblem):

    def __init__(self):
        super().__init__(n_var=2,
                         n_obj=2,
                         n_constr=2,
                         xl=np.array([-2,-2]),
                         xu=np.array([2,2]))
    
        print('hello!!')
        
    def _evaluate(self, x, out, *args, **kwargs):
        f1 = 100 * (x[0]**2 + x[1]**2)
        f2 = (x[0]-1)**2 + x[1]**2

        g1 = 2*(x[0]-0.1) * (x[0]-0.9) / 0.18
        g2 = - 20*(x[0]-0.4) * (x[0]-0.6) / 4.8

        out["F"] = [f1, f2]
        out["G"] = [g1, g2]


problem = MyProblem()

hello!!


In [30]:
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.factory import get_sampling, get_crossover, get_mutation

algorithm = NSGA2(
    pop_size=40,
    n_offsprings=10,
    sampling=get_sampling("real_random"),
    crossover=get_crossover("real_sbx", prob=0.9, eta=15),
    mutation=get_mutation("real_pm", eta=20),
    eliminate_duplicates=True
)

In [31]:
from pymoo.factory import get_termination

termination = get_termination("n_gen", 40)

In [32]:
from pymoo.optimize import minimize

res = minimize(problem,
               algorithm,
               termination,
               seed=1,
               save_history=True,
               verbose=True)

X = res.X
F = res.F

n_gen |  n_eval |   cv (min)   |   cv (avg)   |  n_nds  |     eps      |  indicator  
    1 |      40 |  0.00000E+00 |  2.36399E+01 |       1 |            - |            -
    2 |      50 |  0.00000E+00 |  1.15486E+01 |       1 |  0.00000E+00 |            f
    3 |      60 |  0.00000E+00 |  5.277918607 |       1 |  0.00000E+00 |            f
    4 |      70 |  0.00000E+00 |  2.406068542 |       2 |  1.000000000 |        ideal
    5 |      80 |  0.00000E+00 |  0.908316880 |       3 |  0.869706146 |        ideal
    6 |      90 |  0.00000E+00 |  0.264746300 |       3 |  0.00000E+00 |            f
    7 |     100 |  0.00000E+00 |  0.054063822 |       4 |  0.023775686 |        ideal
    8 |     110 |  0.00000E+00 |  0.003060876 |       5 |  0.127815454 |        ideal
    9 |     120 |  0.00000E+00 |  0.00000E+00 |       6 |  0.085921728 |        ideal
   10 |     130 |  0.00000E+00 |  0.00000E+00 |       7 |  0.015715204 |            f
   11 |     140 |  0.00000E+00 |  0.00000E+00 |       