In [3]:
import pandas as pd
# Load train_subset.csv in full (no row limit is applied by this call).
# NOTE(review): the file is presumably the first 10,000 rows of the ~6 GB
# Avazu training set -- confirm against how the subset was produced.
df = pd.read_csv('data/avazu/train_subset.csv')

# Count how many distinct device_id values appear in the data
len(pd.unique(df['device_id']))

906

In [4]:
def click_counting(x, bin_column):
    """Count clicks and non-clicks for every distinct value of ``bin_column``.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain a ``'click'`` column and the column named by
        ``bin_column``.
    bin_column : str
        Name of the categorical column whose values are being counted.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of ``bin_column`` with three int64
        columns: ``clicks`` (rows where ``click > 0``), ``no_clicks`` (rows
        where ``click < 1``) and ``total_clicks`` (their sum).
    """
    clicks = x[x['click'] > 0][bin_column].value_counts().rename('clicks')
    no_clicks = x[x['click'] < 1][bin_column].value_counts().rename('no_clicks')

    # A value seen only with clicks (or only without) is missing from the
    # other Series and shows up as NaN after alignment. Fill with the integer
    # 0 (not the string '0') and cast once, so the returned columns are real
    # int64 rather than object columns mixing numbers and strings.
    counts = pd.DataFrame([clicks, no_clicks]).T.fillna(0).astype('int64')
    counts['total_clicks'] = counts['clicks'] + counts['no_clicks']
    return counts

def bin_counting(counts):
    """Derive bin-counting features from per-category click counts.

    Parameters
    ----------
    counts : pandas.DataFrame
        Must contain ``clicks``, ``no_clicks`` and ``total_clicks`` columns
        whose values can be cast to int64. The frame is not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``counts`` : a copy of the input with three added columns --
        ``N+`` (clicks / total_clicks), ``N-`` (no_clicks / total_clicks)
        and ``log_N+``.
        ``bin_counts`` : only the ``N+``, ``N-`` and ``log_N+`` columns.

    Notes
    -----
    Despite its name, ``log_N+`` holds the plain ratio ``N+ / N-``; no
    logarithm is applied. It is ``inf`` for rows where ``N-`` is 0 and
    ``N+`` is positive.
    """
    # Work on a copy so the caller's frame is left untouched and re-running
    # the calling cell always starts from the same input.
    counts = counts.copy()

    # Cast once up front; the inputs may arrive as object-typed columns.
    clicks = counts['clicks'].astype('int64')
    no_clicks = counts['no_clicks'].astype('int64')
    total = counts['total_clicks'].astype('int64')

    counts['N+'] = clicks.divide(total)
    counts['N-'] = no_clicks.divide(total)
    counts['log_N+'] = counts['N+'].divide(counts['N-'])

    # Filter here for callers that only want the bin-counting properties.
    bin_counts = counts.filter(items=['N+', 'N-', 'log_N+'])
    return counts, bin_counts

# Bin-counting example on the device_id column
bin_column = 'device_id'
device_clicks = click_counting(
    x=df.filter(items=[bin_column, 'click']),
    bin_column=bin_column,
)
device_all, device_bin_counts = bin_counting(counts=device_clicks)

device_all

Unnamed: 0,clicks,no_clicks,total_clicks,N+,N-,log_N+
a99f214a,1279,5878,7157,0.178706,0.821294,0.217591
25635c83,2,0,2,1.000000,0.000000,inf
c357dbff,2,12,14,0.142857,0.857143,0.166667
e62f1261,2,1,3,0.666667,0.333333,2.000000
135f7d9a,2,0,2,1.000000,0.000000,inf
9af87478,2,0,2,1.000000,0.000000,inf
79fc2cba,1,0,1,1.000000,0.000000,inf
7f8c00b4,1,0,1,1.000000,0.000000,inf
42545ab6,1,0,1,1.000000,0.000000,inf
c608e669,1,0,1,1.000000,0.000000,inf


In [6]:
# Show only the bin-counting feature columns (N+, N-, log_N+) per device_id
device_bin_counts

Unnamed: 0,N+,N-,log_N+
a99f214a,0.178706,0.821294,0.217591
25635c83,1.000000,0.000000,inf
c357dbff,0.142857,0.857143,0.166667
e62f1261,0.666667,0.333333,2.000000
135f7d9a,1.000000,0.000000,inf
9af87478,1.000000,0.000000,inf
79fc2cba,1.000000,0.000000,inf
7f8c00b4,1.000000,0.000000,inf
42545ab6,1.000000,0.000000,inf
c608e669,1.000000,0.000000,inf


In [7]:
# Number of rows in the bin-count table (one row per device_id)
device_bin_counts.shape[0]

906

In [8]:
# The four device_id rows with the largest total_clicks
(
    device_all
    .sort_values(by='total_clicks', ascending=False)
    .head(4)
)

Unnamed: 0,clicks,no_clicks,total_clicks,N+,N-,log_N+
a99f214a,1279,5878,7157,0.178706,0.821294,0.217591
c357dbff,2,12,14,0.142857,0.857143,0.166667
a167aa83,0,7,7,0.0,1.0,0.0
d2bbb640,0,6,6,0.0,1.0,0.0
