# Generate Train-Test Dataset
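Build the train/test datasets used for model training: offer and transaction events are split by `time` (train: `time < 504`, test: `time >= 504`) and the resulting tables are saved as CSV files under `data/`.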

In [2]:
import pandas as pd
import numpy as np
import math
import json
%matplotlib inline

# Load the three raw JSON-lines inputs (one record per line) into DataFrames.
portfolio, profile, transcript = (
    pd.read_json(json_path, orient='records', lines=True)
    for json_path in (
        'data/portfolio.json',
        'data/profile.json',
        'data/transcript.json',
    )
)

In [7]:
def get_view_probability(df):
    """Build a 0/1 "offer viewed" label for every (person, offer_id) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Offer events with at least the columns ``person``, ``offer_id`` and
        ``event``. Only the event values ``'offer received'`` and
        ``'offer viewed'`` are used.

    Returns
    -------
    pandas.DataFrame
        Indexed by (person, offer_id) with a single column ``'offer viewed'``.
        The value is 1 if the pair has at least one view event and 0
        otherwise. Pairs with no ``'offer received'`` event in ``df`` are
        dropped.
    """
    # Count events per (person, offer_id); one column per event type.
    event_counts = df.pivot_table(index=['person', 'offer_id'], columns='event', aggfunc='size')

    # If no row in `df` carries a given event type, the pivot has no column for
    # it and the lookups below would raise KeyError. Reindexing guarantees both
    # columns exist (filled with NaN when absent).
    event_counts = event_counts.reindex(columns=['offer received', 'offer viewed'])

    # Keep only pairs that actually received the offer (NaN > 0 is False).
    received = event_counts[event_counts['offer received'] > 0]

    # Missing views count as 0; repeated views collapse to 1.
    viewed = received['offer viewed'].fillna(0).clip(0, 1)
    return pd.DataFrame(viewed)

In [12]:
def prepare_train_test(transcript, profile, portfolio, split_time=504, data_dir="data"):
    """Split offers and transactions by time and save the train/test CSV files.

    Events with ``time < split_time`` go to the train split and events with
    ``time >= split_time`` go to the test split.

    Parameters
    ----------
    transcript, profile, portfolio
        Accepted for interface compatibility but not used by this function;
        the inputs are read from ``{data_dir}/transactions.csv`` and
        ``{data_dir}/offers.csv`` instead.
    split_time : int or float, default 504
        Threshold on the ``time`` column separating train from test.
    data_dir : str, default "data"
        Directory holding the input CSV files and receiving the output files.

    Returns
    -------
    tuple
        ``(train_set, test_set, train_transactions, test_transactions)``:
        the first two are the DataFrames returned by ``get_view_probability``,
        the last two are per-person sums of ``amount``.

    Side effects
    ------------
    Writes ``train.csv``, ``test.csv``, ``train_transactions.csv`` and
    ``test_transactions.csv`` into ``data_dir``.
    """
    transactions = pd.read_csv(f"{data_dir}/transactions.csv", index_col=0)
    offers = pd.read_csv(f"{data_dir}/offers.csv", index_col=0)

    # Only receipt and view events are needed to build the viewed/not-viewed label.
    is_label_event = offers.event.isin(["offer received", "offer viewed"])
    offers = offers[is_label_event][['person', 'event', 'offer_id', 'time']]

    # Compute the time masks once instead of repeating the threshold.
    # NOTE(review): an offer received before split_time but viewed after it is
    # labelled 0 in train, and its view is dropped from test (no receipt there)
    # -- confirm this boundary behaviour is intended.
    offers_in_train = offers.time < split_time
    transactions_in_train = transactions.time < split_time

    train_offers = offers[offers_in_train]
    test_offers = offers[~offers_in_train]
    train_transactions = transactions[transactions_in_train].groupby("person")["amount"].sum()
    test_transactions = transactions[~transactions_in_train].groupby("person")["amount"].sum()

    train_set = get_view_probability(train_offers)
    test_set = get_view_probability(test_offers)

    # save X(user, portfolio), y(viewed - 1 or 0) train/test set
    train_set.to_csv(f"{data_dir}/train.csv")
    test_set.to_csv(f"{data_dir}/test.csv")

    # save person-total transaction dataset for train/test
    train_transactions.to_csv(f"{data_dir}/train_transactions.csv")
    test_transactions.to_csv(f"{data_dir}/test_transactions.csv")

    return train_set, test_set, train_transactions, test_transactions

# Bind the returned splits to names so that later cells can use them after a
# fresh "Restart & Run All" instead of relying on leftover kernel state.
train_set, test_set, train_transactions, test_transactions = prepare_train_test(transcript, profile, portfolio)

# Display the four results as the cell output.
train_set, test_set, train_transactions, test_transactions

(                                                                   offer viewed
 person                           offer_id                                      
 0009655768c64bdeb2e877511632db8f 3f207df678b143eea3cee63160fa8bed           1.0
                                  5a8bc65990b245e5a138643cd4eb9837           1.0
                                  f19421c1d4aa40978ebb69ca19b0e20d           1.0
 00116118485d4dfda04fdbaba9a87b5c f19421c1d4aa40978ebb69ca19b0e20d           1.0
 0011e0d4e6b944f998e987f904e8c1e5 0b1e1539f2cc45b7b9fa7c272da2e1d7           1.0
 ...                                                                         ...
 fffad4f4828548d1b5583907f2e9906b 5a8bc65990b245e5a138643cd4eb9837           1.0
                                  f19421c1d4aa40978ebb69ca19b0e20d           1.0
 ffff82501cea40309d5fdd7edcca4a07 0b1e1539f2cc45b7b9fa7c272da2e1d7           1.0
                                  2906b810c7d4411798c6938adc9daaa5           1.0
                            

In [30]:
# Persist the per-person transaction totals for each split. The test file name
# is spelled "test_transactions.csv" so it matches the path written inside
# prepare_train_test (the previous "test_trasactions.csv" was a typo that
# produced a stray, differently named file).
train_transactions.to_csv("data/train_transactions.csv")
test_transactions.to_csv("data/test_transactions.csv")