In [1]:
import pandas as pd
import numpy as np

## Pandas groupby operations

These can be applied to a pandas.groupby object using the following syntax:

        df.groupby(by=column_name).apply(lambda g: divide(g, num_col, denom_col))

In [2]:
def divide(g, num_col, denom_col):
    return df.loc[g.index, num_col].sum()/df.loc[g.index, denom_col].sum().astype(float) 

def wavg(g, weight_col):
    return np.average(g, weights = df.loc[g.index, weight_col])

def pct_won(g):
    return divide(g, 'num_won', 'num_bids')*100
    
def w_avg_bid(g):
    return wavg(g, 'num_bids')

def w_avg_paid(g):
    return wavg(g, 'num_won')

def ctr(g):
    return divide(g, 'clicks', 'imps')*100

def ecpa(g):
    # total revenue divided by # cons
    return divide(g, 'revenue', 'cons')

def revenue(g):
    return df.loc[g.index, 'eCPM'].unique()[0]*df.loc[g.index, 'num_won'].sum()/1000
    
def margin(g):
    # revenue - total spend divided by revenue
    return 100*(df.loc[g.index, 'revenue'].sum() - df.loc[g.index, 'total_spend'].sum())/df.loc[g.index, 'revenue'].sum().astype(float)

def convert_to_minutes(txt):
    if 'day' in txt:
        split = [item for item in txt.split(' ') if item != '']
        start = int(split[0])*24*60
        end = int(split[1])*24*60
        return str(start) + ' ' + str(end) + ' ' + split[2].replace('day', 'minute')
    return txt



## Read data

In [3]:
df = pd.read_csv('../data/sample_data.csv', sep = ',', header = 0, index_col = 0)
df.head()

Unnamed: 0,site_id,strategy_id,list_type,line_id,adv_id,adv_vertical,name,goal,price,limit,...,win_rate_site,win_rate_strat,cvr_strat,cvr,line_cvr,hist_zscore,overlap,target_variable,win_rate_site_table,win_rate_strat_table
0,82932,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.423778,0.111431,0.0,0.001197,0.0,2.708366,0.001066,0,0.450094,0.249479
1,90474,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.16301,0.111431,0.0,0.001239,0.0,1.188635,0.000703,0,0.15805,0.249479
2,92345,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.318358,0.111431,0.0,0.000729,0.0,1.503285,0.000873,0,0.360591,0.249479
3,92415,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.133199,0.111431,0.0,0.005894,0.0,35.153628,0.004614,0,0.113717,0.249479
4,92425,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.37931,0.111431,0.0,0.0,0.0,-0.091378,0.000344,0,0.019308,0.249479


## Drop DF rows according to a condition

In [4]:
# drop all records that are not associated with Ian or Nicole
Ian_and_Nicole = df.drop(df[ ~ (df['name'].str.contains('Ian') | df['name'].str.contains('Nicole'))].index)
Ian_and_Nicole['name'].unique()

array(['Nicole', 'Ian'], dtype=object)

## Sort a dataframe

In [5]:
df.sort_values(by = ['line_id', 'strategy_id', 'site_id'], ascending = False, axis = 0, inplace=True)
df.reset_index(drop = True, inplace = True)
df.head()

Unnamed: 0,site_id,strategy_id,list_type,line_id,adv_id,adv_vertical,name,goal,price,limit,...,win_rate_site,win_rate_strat,cvr_strat,cvr,line_cvr,hist_zscore,overlap,target_variable,win_rate_site_table,win_rate_strat_table
0,1114119077,359117,testing,29695,658.0,Automotive,Edward,35.0,3.24,75000.0,...,0.35206,0.146937,0.000235,0.002625,0.000157,16.512526,0.001697,0,0.361192,0.156831
1,1104262560,359117,testing,29695,658.0,Automotive,Edward,35.0,3.24,75000.0,...,,0.146937,0.000235,0.001369,0.000157,12.733262,0.030301,0,0.195153,0.156831
2,1102384171,359117,testing,29695,658.0,Automotive,Edward,35.0,3.24,75000.0,...,0.526733,0.146937,0.000235,0.002047,0.000157,32.661064,0.014129,0,0.499383,0.156831
3,1099671672,359117,testing,29695,658.0,Automotive,Edward,35.0,3.24,75000.0,...,0.065434,0.146937,0.000235,0.001479,0.000157,4.129372,0.003913,0,0.05528,0.156831
4,1097198698,359117,testing,29695,658.0,Automotive,Edward,35.0,3.24,75000.0,...,0.026784,0.146937,0.000235,0.003135,0.000157,91.39897,0.163626,0,0.033759,0.156831


## Fill NA values

With the most commonly occurring value:

In [6]:
df['win_rate_site'].fillna(df['win_rate_site'].value_counts().idxmax(), inplace = True)

## Create a column conditional on the values of another column

In [8]:
import pandas_helper as ph
group_names = ['Low', 'Okay', 'Good', 'Great']
pd.pct_rank_qcut(df['cvr'], 4)

ImportError: No module named pandas_helper

In [None]:
# for a string condition (results in True/False)
df['auto'] = df['adv_vertical'].str.contains('*Auto*')

# for a continuous value
df['test'] = df['']

# Apply a dictionary of operations to different columns

In [None]:
apply_dict = {'num_bids': sum, 'num_won':sum, 'pct_won': pct_won, 'avg_bid':w_avg_bid, 'avg_paid': w_avg_paid, 'imps':sum, 
              'clicks': sum, 'CTR': ctr, 'cons':sum, 'vts':sum, 'eCPM':np.average, 'eCPA':ecpa, 'revenue':revenue, 
             'total_spend':sum, 'profit':sum, 'margin': margin}

df['strategy_name'] = df['strategy_name'].str.replace(test_tag, '')
df['strategy_name'] = df['strategy_name'].str.replace(control_tag, '')
df['strategy_name'] = df['strategy_name'].str.replace(' \(copy\)', '')
df['strategy_name'] = df['strategy_name'].map(lambda l: convert_to_minutes(l))

results = df.groupby(by=['test', 'strategy_id', 'strategy_name'], as_index = False).agg(apply_dict)

# Save results

In [None]:
results.to_csv('demand-rtb-report_output.csv', sep = ',', header = True, index = False)

results