In [1]:
import pandas as pd
import numpy as np
import math
import json
%matplotlib inline

# Load the three raw datasets (newline-delimited JSON, one record per line),
# unpacking them in the same order as the paths listed below.
portfolio, profile, transcript = [
    pd.read_json(json_path, orient='records', lines=True)
    for json_path in (
        'data/portfolio.json',
        'data/profile.json',
        'data/transcript.json',
    )
]

In [16]:
# Keep only the purchase events and the four columns used downstream,
# renumbering the rows from zero.
transactions = transcript.loc[
    transcript['event'] == "transaction",
    ['person', 'event', 'value', 'time'],
].reset_index(drop=True)

# Each `value` cell is a dict-like payload; replace it with its 'amount' entry.
amount = transactions['value'].apply(lambda payload: payload['amount'])
transactions['value'] = amount

# Persist the flattened transactions and display them.
transactions.to_csv("data/transaction.csv")
transactions

Unnamed: 0,person,event,value,time
0,02c083884c7d45b39cc68e1314fec56c,transaction,0.83,0
1,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,34.56,0
2,54890f68699049c2a04d415abc25e717,transaction,13.23,0
3,b2f1cd155b864803ad8334cdf13c4bd2,transaction,19.51,0
4,fe97aa22dd3e48c8b143116a8403dd52,transaction,18.97,0
...,...,...,...,...
138948,b3a1272bc9904337b331bf348c3e8c17,transaction,1.59,714
138949,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,9.53,714
138950,a00058cf10334a308c68e7631c529907,transaction,3.61,714
138951,76ddbd6576844afe811f1a3c0fbb5bec,transaction,3.53,714


In [5]:
# Load the offer events from CSV, using the first CSV column as the row index.
# NOTE(review): data/offers.csv is not written by any cell visible in this
# notebook — presumably produced by an earlier or removed cell; confirm.
offers = pd.read_csv("data/offers.csv", index_col=0)
# Preview the first rows.
offers.head()

Unnamed: 0,person,event,offer_id,time,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,
1,a03223e636434f42ac4c3df47e8bac43,offer received,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,
2,e2127556f4f64592b11af22de27a7932,offer received,2906b810c7d4411798c6938adc9daaa5,0,
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,fafdcd668e3743c1bb461111dcafc2a4,0,
4,68617ca6246f4fbc85e91a2a49552598,offer received,4d5c57ea9a6940dd891ad53e9dbe8da0,0,


In [6]:
# Summary statistics for the numeric columns of the offer events.
offers.describe()

Unnamed: 0,time,reward
count,167581.0,33579.0
mean,353.778412,4.904137
std,198.301287,2.886647
min,0.0,2.0
25%,168.0,2.0
50%,408.0,5.0
75%,510.0,5.0
max,714.0,10.0


In [7]:
# Restrict the offer log to "received" and "viewed" events and keep only the
# identifying columns.
offers = offers.loc[
    offers['event'].isin(["offer received", "offer viewed"]),
    ['person', 'event', 'offer_id', 'time'],
]

In [21]:
# Cutoff on the `time` column separating the training window from the
# held-out window; defined once so all four splits below stay in sync.
# NOTE(review): the unit of `time` is not shown here (presumably hours) — confirm.
SPLIT_TIME = 504

# Offer events strictly before the cutoff are used for training; events at or
# after the cutoff are held out.
train_offers = offers[offers.time < SPLIT_TIME]
test_offers = offers[offers.time >= SPLIT_TIME]

# Total transaction value per person inside each window. A person with no
# transaction in a window has no entry in that window's Series.
train_transactions = transactions[transactions.time < SPLIT_TIME].groupby("person")["value"].sum()
test_transactions = transactions[transactions.time >= SPLIT_TIME].groupby("person")["value"].sum()

In [22]:
# Display the per-person transaction totals for the training window (time < 504).
train_transactions

person
0009655768c64bdeb2e877511632db8f     30.73
00116118485d4dfda04fdbaba9a87b5c      4.09
0011e0d4e6b944f998e987f904e8c1e5     25.42
0020c2b971eb4e9188eac86d93036a77     98.33
0020ccbbb6d84e358d3414a3ff76cffd    136.26
                                     ...  
fff3ba4757bd42088c044ca26d73817a    148.84
fff7576017104bcc8677a8d63322b5e1     16.05
fff8957ea8b240a6b5e634b6ee8eafcf     11.26
fffad4f4828548d1b5583907f2e9906b     50.64
ffff82501cea40309d5fdd7edcca4a07    175.02
Name: value, Length: 16058, dtype: float64

In [23]:
# Display the per-person transaction totals for the held-out window (time >= 504).
test_transactions

person
0009655768c64bdeb2e877511632db8f     96.87
0011e0d4e6b944f998e987f904e8c1e5     54.04
0020c2b971eb4e9188eac86d93036a77     98.53
0020ccbbb6d84e358d3414a3ff76cffd     17.79
003d66b6608740288d6cc97a6903f4f0     20.95
                                     ...  
fff3ba4757bd42088c044ca26d73817a    432.14
fff7576017104bcc8677a8d63322b5e1     13.89
fff8957ea8b240a6b5e634b6ee8eafcf      0.89
fffad4f4828548d1b5583907f2e9906b     38.19
ffff82501cea40309d5fdd7edcca4a07     51.05
Name: value, Length: 15261, dtype: float64

In [24]:
# Count events per (person, offer_id) pair, one column per event type.
a = pd.pivot_table(train_offers, index=['person', 'offer_id'], columns='event', aggfunc='size')
print(len(a))

# Keep only the pairs that have at least one "offer received" event.
a = a.loc[a['offer received'].gt(0)]
print(len(a))

# Collapse the view counts into a 0/1 flag (missing counts become 0) and wrap
# the result in a single-column frame.
av = a['offer viewed'].fillna(0).clip(lower=0, upper=1).to_frame()
print(len(av))

45375
45375
45375


In [25]:
def get_view_probability(df):
    """Flag, per (person, offer_id) pair, whether a received offer was viewed.

    Parameters
    ----------
    df : pandas.DataFrame
        Offer events with at least the columns ``person``, ``offer_id`` and
        ``event``. Only ``'offer received'`` and ``'offer viewed'`` events
        influence the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(person, offer_id)`` with a single ``'offer viewed'``
        column holding 1 when the pair has at least one view event in ``df``
        and 0 otherwise. Pairs without an ``'offer received'`` event in
        ``df`` are dropped. Despite the function name, the values are 0/1
        indicators rather than fractional probabilities.
    """
    # Number of events of each type per (person, offer_id) pair.
    offer_view_counts = df.pivot_table(index=['person', 'offer_id'], columns='event', aggfunc='size')

    # The pivot only creates columns for event types present in ``df``, so a
    # frame without any view (or receive) events would otherwise raise
    # KeyError below. Reindexing guarantees both columns exist (NaN-filled).
    offer_view_counts = offer_view_counts.reindex(columns=['offer received', 'offer viewed'])

    # Only pairs that were actually received within ``df`` are kept; a NaN
    # count compares as False and is therefore dropped as well.
    offer_view_counts = offer_view_counts[offer_view_counts['offer received'] > 0]

    # No recorded view -> 0; repeated views collapse to 1.
    view_prob = offer_view_counts['offer viewed'].fillna(0)
    view_prob = view_prob.clip(0, 1)
    view_prob = pd.DataFrame(view_prob)
    return view_prob

# Preview the 0/1 viewed flags computed for the training-window offers.
get_view_probability(train_offers)

Unnamed: 0_level_0,Unnamed: 1_level_0,offer viewed
person,offer_id,Unnamed: 2_level_1
0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,1.0
0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,1.0
0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,1.0
00116118485d4dfda04fdbaba9a87b5c,f19421c1d4aa40978ebb69ca19b0e20d,1.0
0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,1.0
...,...,...
fffad4f4828548d1b5583907f2e9906b,5a8bc65990b245e5a138643cd4eb9837,1.0
fffad4f4828548d1b5583907f2e9906b,f19421c1d4aa40978ebb69ca19b0e20d,1.0
ffff82501cea40309d5fdd7edcca4a07,0b1e1539f2cc45b7b9fa7c272da2e1d7,1.0
ffff82501cea40309d5fdd7edcca4a07,2906b810c7d4411798c6938adc9daaa5,1.0


In [26]:
# Build the viewed-flag table for each window, training first, then held-out.
train_set, test_set = map(get_view_probability, (train_offers, test_offers))

In [144]:
# Persist the viewed-flag tables; with the default to_csv settings the
# (person, offer_id) index is written as the leading CSV columns.
train_set.to_csv("data/train.csv")
test_set.to_csv("data/test.csv")

In [27]:
# Distinct persons in each window, taken from the first index level
# (the index entries are (person, offer_id) pairs).
train_persons = set(train_set.index.get_level_values(0))
test_persons = set(test_set.index.get_level_values(0))

# Persons that appear in both windows.
intersects = train_persons.intersection(test_persons)
len(train_persons), len(test_persons), len(intersects)

(16926, 15919, 15851)

In [29]:
# Recompute the time-based split (same 504 cutoff as the earlier split cell):
# rows with time < 504 form the training window, the rest are held out.
train_offers = offers.loc[offers['time'].lt(504)]
test_offers = offers.loc[offers['time'].ge(504)]

# Per-person total transaction value inside each window.
train_transactions = (
    transactions.loc[transactions['time'].lt(504)]
    .groupby("person")["value"]
    .sum()
)
test_transactions = (
    transactions.loc[transactions['time'].ge(504)]
    .groupby("person")["value"]
    .sum()
)

In [30]:
# Persist the per-person transaction totals for each window.
train_transactions.to_csv("data/train_transactions.csv")
# NOTE(review): the file name below is spelled "test_trasactions" (missing an
# "n"), unlike "train_transactions" above. Anything that reads this file must
# use the same spelling — confirm downstream usage before renaming.
test_transactions.to_csv("data/test_trasactions.csv")