# It's time to know how the datasets are related between them

In [47]:
import pandas as pd
from tqdm import tqdm
import os

In [2]:
# read in the json files
portfolio = pd.read_json('clean_data/portfolio.json')
profile = pd.read_json('clean_data/profile.json')
offer_completed = pd.read_json('clean_data/offers_completed.json')
transactions = pd.read_json('clean_data/transactions.json')
offers_viewed = pd.read_json('clean_data/offers_viewed.json')
offers_received = pd.read_json('clean_data/offers_received.json')

In [52]:
"""
This process takes a while, so I'm gonna store the output dataframe into a file called offers.json.
TODO: Use multiprocessing to make this faster
"""
# Declaring the offers file path
offers_path = 'clean_data/offers.json'

# Checking if the file already exists
if os.path.exists(offers_path):
    # Loading the existing json file
    offers = pd.read_json(offers_path)
else:
    # Defining the important columns from the portfolio dataframe
    portfolio_columns = ['reward', 'difficulty', 'duration', 'bogo', 'discount', 
                         'informational', 'social', 'mobile', 'email', 'web']
    # Declaring a list where all the new rows will be append
    new_rows = []
    print("Starting offers formating.... ({})".format(offers_received.shape[0]))
    
    # Iterating over the received offers
    for index, row in tqdm(offers_received.iterrows()):
        # Getting the important portfolio fields related to the offer
        new_row = portfolio[portfolio.id==row.offer_id].iloc[0][portfolio_columns].to_dict()
        
        # Getting the peron, start_time and due_time
        new_row['person'] = row.person
        new_row['start_time'] = row.time
        new_row['due_time'] = new_row.get('start_time') + (new_row.get('duration')*24)

        # Checking which of the offers were seen
        viewed = offers_viewed.loc[
            (offers_viewed.person==row.person) 
            & (offers_viewed.offer_id==row.offer_id) 
            & (
                (offers_viewed.time>=new_row.get('start_time'))
                | (offers_viewed.time<new_row.get('due_time'))
            )
        ]
        if not viewed.empty:
            new_row['viewed_time'] = viewed.reset_index().iloc[0].time

        # Checking which of the offers were completed
        completed = offer_completed.loc[
            (offer_completed.person==row.person) 
            & (offer_completed.offer_id==row.offer_id) 
            & (
                (offer_completed.time>=new_row.get('start_time'))
                | (offer_completed.time<new_row.get('due_time'))
            )
        ]
        if not completed.empty:
            new_row['completed_time'] = completed.reset_index().iloc[0].time
            
        # Appending the new row obtained
        new_rows.append(new_row)
    
    # Creating a dataframe with the bunch of new rows created
    offers = pd.DataFrame(new_rows)
    
    # Saving the dataframe into a json
    offers.to_json(offers_path)

8it [00:00, 77.08it/s]

Starting offers formating.... (76277)


76277it [15:08, 83.98it/s]


In [54]:
offers.head()

Unnamed: 0,reward,difficulty,duration,bogo,discount,informational,social,mobile,email,web,person,start_time,due_time,viewed_time,completed_time
0,5,5,7,1,0,0,0,1,1,1,78afa995795e4d85b5d9ceeca43f5fef,0,168,6.0,132.0
1,5,20,10,0,1,0,0,0,1,1,a03223e636434f42ac4c3df47e8bac43,0,240,6.0,
2,2,10,7,0,1,0,0,1,1,1,e2127556f4f64592b11af22de27a7932,0,168,18.0,
3,2,10,10,0,1,0,1,1,1,1,8ec6ce2a7e7949b1bf142def7d0e0586,0,240,12.0,
4,10,10,5,1,0,0,1,1,1,1,68617ca6246f4fbc85e91a2a49552598,0,120,84.0,


# Data conclusions

In [None]:
users = set(profile.id)
print("Number of users: ", len(users))

In [None]:
users_with_offers = set(offers_received.person)
print("Number of users with at least 1 offer: {} ({}% of all users)".format(len(users_with_offers), round(len(users_with_offers)*100/len(users), 2)))
print("Number of offers:", len(offers_received))

In [None]:
users_offer_viewed = set(offers_viewed.person)
print("Number of users who have known they had had at least one offer: {} ({}% of users with offers)".format(
    len(users_offer_viewed), round(len(users_offer_viewed)*100/len(users_with_offers), 2)
))
print("Number of offers viewed: {} ({}% of all offers sent)".format(
    len(offers_viewed), round(len(offers_viewed)*100/len(offers_received), 2)
))

In [None]:
users_offer_completed = set(offer_completed.person)
users_offer_completed_knowledge = users_offer_completed & users_offer_viewed
users_offer_completed_no_knowledge = users_offer_completed - users_offer_viewed
print("Number of users who completed at least one offer with knlowledge about their offers: {} ({}% of users with offers)".format(
    len(users_offer_completed_knowledge), round(len(users_offer_completed_knowledge)*100/len(users_with_offers), 2)
))

print("Number of users who completed at least one offer without knowing about their offers: {} ({}% of users with offers)".format(
    len(users_offer_completed_no_knowledge), round(len(users_offer_completed_no_knowledge)*100/len(users_with_offers), 2)
))
print("Number of offers completed: {} ({}% of all offers sent)".format(
    len(offer_completed), round(len(offer_completed)*100/len(offers_received), 2)
))