# Creating household-centered data views

The purpose of this notebook is to examine the relational tables provided by dunnhumby and compile data views that contain all information related to a single household.  The ultimate goal is to be able to determine the correct number of features to use later with both clustering and classification algorithms so that my initial data wrangling is undertaken with a clear purpose and reasoning behind it. 

The instructions for setting up a db and accessing these csv files using sqlite were found [here](https://mungingdata.com/sqlite/create-database-load-csv-python/)

In [1]:
import sqlite3
import pandas as pd
from pathlib import Path
from collections import defaultdict

In [2]:
# Create the sqlite database file (if it does not exist yet), then open a connection to it
# and a cursor that the query cells below use to run raw SQL.
Path('trial.db').touch()
conn = sqlite3.connect('trial.db')
c = conn.cursor()

In [3]:
# Dead code kept for reference only: the CREATE TABLE statements below are wrapped in a string
# literal, so nothing in this cell executes.
# NOTE(review): the loading cells call to_sql(..., if_exists='replace'), so the tables are rebuilt
# from the CSV headers rather than from these schemas; e.g. the TRANSACTION_DATA column order listed
# here differs from the CSV column order shown by transactions.head() further down.
"""
# Creates a table for household demographics
c.execute('''CREATE TABLE HH_DEMOGRAPHIC (HOUSEHOLD_KEY int, AGE_DESC text, MARITAL_STATUS_CODE text, 
INCOME_DESC text, HOMEOWNER_DESC text, HH_COMP_DESC text, HOUSEHOLD_SIZE_DESC text, KID_CATEGORY_DESC text)''')
# Creates a table for transactional data
c.execute('''CREATE TABLE TRANSACTION_DATA (HOUSEHOLD_KEY int, BASKET_ID int, DAY int, PRODUCT_ID int, QUANTITY int, 
SALES_VALUE real, STORE_ID int, COUPON_MATCH_DISC real, COUPON_DISC real, RETAIL_DISC real, TRANS_TIME real, 
WEEK_NO real)''')
# Creates a table for connecting campaigns to houeholds
c.execute('''CREATE TABLE CAMPAIGN_TABLE (HOUSEHOLD_KEY int, CAMPAIGN int, DESCRIPTION text)''')
# Creates a table for campaign descriptions
c.execute('''CREATE TABLE CAMPAIGN_DESC (CAMPAIGN int, DESCRIPTION text, START_DAY int, END_DAY int)''')
# Creates a table for product information
c.execute('''CREATE TABLE PRODUCT (PRODUCT_ID int, COMMODITY_DESC text, SUB_COMMODITY_DESC text, MANUFACTURER int, 
DEPARTMENT text, BRAND text, CURR_SIZE_OF_PRODUCT text)''')
# Creates a table for display data
c.execute('''CREATE TABLE DISPLAY_DATA (PRODUCT_ID int, STORE_ID int, WEEK_NO int, DISPLAY text, MAILER text)''')
# creates a table for coupon redemption data
c.execute('''CREATE TABLE REDEEMED (HOUSEHOLD_KEY int, DAY int, COUPON_UPC int, PRODUCT_ID int)''')
# Creates a table linking coupons & campaigns to products
c.execute('''CREATE TABLE COUPON (CAMPAIGN int, COUPON_UPC int, PRODUCT_ID int)''')""";

## This section loads each csv into its own sqlite table, replacing the table if it already exists

In [4]:
# Household demographics: read the csv, then (re)write it as the HH_DEMOGRAPHIC table
hh_demo = pd.read_csv('../Data/hh_demographic.csv')

hh_demo.to_sql(name='HH_DEMOGRAPHIC', con=conn, if_exists='replace', index=False)

In [5]:
# Transaction line items: read the csv, then (re)write it as the TRANSACTION_DATA table
transactions = pd.read_csv('../Data/transaction_data.csv')

transactions.to_sql(name='TRANSACTION_DATA', con=conn, if_exists='replace', index=False)

In [6]:
# Household-to-campaign links: read the csv, then (re)write it as the CAMPAIGN_TABLE table
campaign_table = pd.read_csv('../Data/campaign_table.csv')

campaign_table.to_sql(name='CAMPAIGN_TABLE', con=conn, if_exists='replace', index=False)

In [7]:
# Campaign descriptions: read the csv, then (re)write it as the CAMPAIGN_DESC table
campaign_desc = pd.read_csv('../Data/campaign_desc.csv')

campaign_desc.to_sql(name='CAMPAIGN_DESC', con=conn, if_exists='replace', index=False)

In [8]:
# Product catalogue: read the csv, then (re)write it as the PRODUCT table
products = pd.read_csv('../Data/product.csv')

products.to_sql(name='PRODUCT', con=conn, if_exists='replace', index=False)

In [9]:
# In-store display / mailer placement data: read the csv, then (re)write it as the DISPLAY_DATA table
# NOTE(review): the name `display` shadows IPython's display() helper for the rest of the session;
# consider renaming once nothing else depends on this variable.
display = pd.read_csv('../Data/causal_data.csv')

display.to_sql(name='DISPLAY_DATA', con=conn, if_exists='replace', index=False)

In [10]:
# Coupon redemptions: read the csv, then (re)write it as the REDEEMED table
redeem = pd.read_csv('../Data/coupon_redempt.csv')

redeem.to_sql(name='REDEEMED', con=conn, if_exists='replace', index=False)

In [11]:
# Coupon / campaign / product links: read the csv, then (re)write it as the COUPON table
coupon = pd.read_csv('../Data/coupon.csv')

coupon.to_sql(name='COUPON', con=conn, if_exists='replace', index=False)

### This section runs sqlite3 queries whose results (lists of tuples) contain all information about Household 1

In [12]:
# Redeemed_1: every coupon redemption row recorded for household 1 (list of tuples)
Redeemed_1 = c.execute('''SELECT *
FROM REDEEMED
WHERE HOUSEHOLD_KEY = 1''').fetchall()

In [13]:
# Campaign_1: campaigns sent to household 1, joined to the days each campaign ran
Campaign_1 = c.execute('''
Select CT.HOUSEHOLD_KEY, CT.CAMPAIGN, CT.DESCRIPTION, CD.START_DAY, CD.END_DAY
FROM CAMPAIGN_TABLE AS CT
LEFT JOIN CAMPAIGN_DESC AS CD ON CD.CAMPAIGN = CT.CAMPAIGN
WHERE CT.HOUSEHOLD_KEY = 1;''').fetchall()

In [14]:
# Transactions_1: every transaction row for household 1 (products, discounts, store,
# basket and day/week/time of purchase), in the table's own column order
Transactions_1 = c.execute('''
SELECT *
FROM TRANSACTION_DATA
WHERE HOUSEHOLD_KEY=1
''').fetchall()

In [15]:
# Products_1: descriptive product attributes for each transaction row of household 1
# (one result row per transaction row, so a product bought repeatedly appears repeatedly)
Products_1 = c.execute('''
SELECT T.PRODUCT_ID, P.COMMODITY_DESC, P.SUB_COMMODITY_DESC, P.MANUFACTURER, P.DEPARTMENT, P.BRAND
FROM TRANSACTION_DATA AS T 
LEFT JOIN PRODUCT AS P 
ON P.PRODUCT_ID = T.PRODUCT_ID
WHERE HOUSEHOLD_KEY = 1''').fetchall()

In [16]:
# Display_1: display / mailer placement for products household 1 bought, matched on
# product, store and week.
# NOTE(review): the WHERE clause filters on D.* columns, so transactions without a matching
# DISPLAY_DATA row are dropped and the LEFT JOIN effectively behaves like an inner join.
Display_1 = c.execute('''
SELECT T.HOUSEHOLD_KEY, D.PRODUCT_ID, D.STORE_ID, D.WEEK_NO, D.DISPLAY, D.MAILER
FROM TRANSACTION_DATA AS T
LEFT JOIN DISPLAY_DATA AS D ON D.PRODUCT_ID = T.PRODUCT_ID AND D.STORE_ID = T.STORE_ID AND T.WEEK_NO = D.WEEK_NO
WHERE T.HOUSEHOLD_KEY = 1 AND D.DISPLAY <> 0 AND D.MAILER <> 0''').fetchall()

In [17]:
# Household_1: the demographic row(s) stored for household 1
Household_1 = c.execute('''
SELECT * 
FROM HH_DEMOGRAPHIC
WHERE HOUSEHOLD_KEY = 1''').fetchall()

In [18]:
# Scratch aggregate: number of distinct products bought by household 1 that have a matching
# DISPLAY_DATA row (same product, store and week) with non-zero display and mailer codes
DUMMY = c.execute('''
SELECT COUNT(distinct D.product_id)
FROM TRANSACTION_DATA AS T
LEFT JOIN DISPLAY_DATA AS D ON D.PRODUCT_ID = T.PRODUCT_ID AND D.STORE_ID = T.STORE_ID AND T.WEEK_NO = D.WEEK_NO
WHERE T.HOUSEHOLD_KEY = 1 AND D.DISPLAY <> 0 AND D.MAILER <> 0''').fetchall()
DUMMY

[(40,)]

# Functions

This section is for writing functions that will create dictionaries of the above queries that can be loaded into a single row of a data frame and associated with a household ID

In [19]:
# Function that creates a dictionary that shows which coupons were redeemed on which days
def Redeem_dict(k):
    """Group the 3rd element of every tuple in k under that tuple's 2nd element.

    Returns a dictionary whose keys are the distinct 2nd values (in first-seen order) and whose
    values are lists holding the 3rd value of every tuple that shares that key, in input order.
    An empty input yields an empty dictionary."""
    grouped = {}
    for row in k:
        # setdefault creates the list the first time a key is seen
        grouped.setdefault(row[1], []).append(row[2])
    return grouped

In [20]:
# Function to create a dictionary with all the household demographic information
def h_demo(k):
    """Label the first seven fields of a household's demographic row.

    k is a list of tuples; only the first tuple is used and its elements at positions 0-6 are
    stored, in order, under the keys 'age', 'marital_status', 'income_group',
    'homeowner_group', 'household', 'household_count' and 'kids'. Any further elements are
    ignored.

    Returns an empty dictionary when k is empty (no demographic row for the household), in
    line with how the other builders in this notebook behave for empty input.
    Raises IndexError if the first tuple has fewer than seven elements."""
    labels = ('age', 'marital_status', 'income_group', 'homeowner_group',
              'household', 'household_count', 'kids')
    if not k:
        # No demographic row was returned for this household
        return {}
    row = k[0]
    return {label: row[position] for position, label in enumerate(labels)}

In [21]:
# Generalized function to create a nested dictionary storing the coupons & days they were redeemed on 
# for each marketing campaign
def Redeemed_full(k):
    """Break a list of tuples down into a nested dictionary keyed by each tuple's last value.

    For every distinct last value (the campaign, per the calling cells) the result holds
    {'Days': [...], 'Coupons': [...]}, two parallel lists in which position n of 'Days' (2nd
    tuple value) and position n of 'Coupons' (3rd tuple value) come from the same input tuple.

    Only exact repeats of a (day, coupon) pair within the same campaign are skipped. The earlier
    check ("day not seen AND coupon not seen") silently dropped a redemption whenever either its
    day or its coupon had already appeared for that campaign, e.g. a second coupon redeemed on
    the same day."""
    main_dict = {}
    seen = set()  # (campaign, day, coupon) triples already recorded
    for row in k:
        campaign, day, coupon = row[-1], row[1], row[2]
        if (campaign, day, coupon) in seen:
            continue
        seen.add((campaign, day, coupon))
        entry = main_dict.setdefault(campaign, {'Days': [], 'Coupons': []})
        # append to both lists together so they stay aligned
        entry['Days'].append(day)
        entry['Coupons'].append(coupon)
    return main_dict

In [22]:
# Function to create a dictionary with the information on marketing campaigns sent to a given household and the days
# in which it was active
def Campaign_full(k):
    """Build a dictionary describing each campaign found in a list of tuples.

    Keys are the 2nd value of each tuple; each value is a dictionary holding the tuple's 3rd,
    4th and 5th values under 'Type', 'Start_day' and 'End_day'. When the same key occurs more
    than once, the last tuple wins."""
    return {
        row[1]: {'Type': row[2], 'Start_day': row[3], 'End_day': row[4]}
        for row in k
    }

In [23]:
# This function creates a dictionary that can be used to store all information about a given product identified 
# by it's product ID.  This dictionary will be a nested dictionary wherein the primary keys are the product ID, 
# and the secondary dictionary's key/value pairs describe the nature of that product
def Product_full(k):
    """Build a nested dictionary of product attributes from a list of tuples.

    The outer key is the 1st value of each tuple (the product ID). The inner dictionary stores
    the tuple's 5th value as 'Dept', 2nd as 'Category', 3rd as 'Sub-category', last as 'Brand'
    and 4th as 'Manufacturer_ID'. Repeated product IDs are overwritten by the latest tuple."""
    return {
        row[0]: {
            'Dept': row[4],
            'Category': row[1],
            'Sub-category': row[2],
            'Brand': row[-1],
            'Manufacturer_ID': row[3],
        }
        for row in k
    }

In [24]:
# This function creates a series of nested dictionaries with information on where products were displayed in a 
# given store and the position in which a given product was displayed in  either the store or in a marketing mailer
def Display_full(k):
    """Nest display / mailer codes by store, then product, then week.

    For each tuple the 3rd value (store) is the outer key, the 2nd value (product) the next
    level and the 4th value (week) the innermost key, whose value is
    {'Display': <5th value>, 'Mailer': <last value>}. The first tuple seen for a given
    store/product/week combination wins; later repeats are ignored."""
    stores = {}
    for row in k:
        # create the store and product levels on demand
        weeks = stores.setdefault(row[2], {}).setdefault(row[1], {})
        if row[3] not in weeks:
            weeks[row[3]] = {'Display': row[4], 'Mailer': row[-1]}
    return stores

In [25]:
# This function will create a dictionary of all transactions for a household, it's primary keys will be the basket ID
# for each transaction, where it's secondary keys will have time & store information as well as a secondary dictionary
# of the products purchased with the product info assigned to the product ID
def Transactions_full(k):
    """Group transaction tuples into a nested dictionary keyed by basket ID (2nd tuple value).

    Each basket dictionary holds 'Store', 'Week', 'Day' and 'Time' (tuple positions 6, 9, 2
    and 8 of the first row seen for that basket) plus one entry per product ID (position 3)
    containing 'Sale_value', 'Quantity', 'Coupon_disc', 'Loyalty_disc' and
    'Manufacturer_rebate' (positions 5, 4, 10, 7 and 11). A product that appears again in the
    same basket is ignored; only its first row is kept."""
    def line_item(row):
        # per-product figures taken from a single transaction row
        return {
            'Sale_value': row[5],
            'Quantity': row[4],
            'Coupon_disc': row[10],
            'Loyalty_disc': row[7],
            'Manufacturer_rebate': row[11],
        }

    baskets = {}
    for row in k:
        basket_id, product_id = row[1], row[3]
        if basket_id not in baskets:
            # first row of a basket also supplies the basket-level fields
            baskets[basket_id] = {'Store': row[6], 'Week': row[9], 'Day': row[2], 'Time': row[8]}
            baskets[basket_id][product_id] = line_item(row)
        elif product_id not in baskets[basket_id]:
            baskets[basket_id][product_id] = line_item(row)
    return baskets

# Assembling the first household row

In [26]:
# One record for household 1: the household key plus one nested dictionary per data view
HH1 = dict(
    Household=1,
    Demographic=h_demo(Household_1),
    Day_redeemed=Redeem_dict(Redeemed_1),
    Campaign_redeemed=Redeemed_full(Redeemed_1),
    Campaigns=Campaign_full(Campaign_1),
    Products=Product_full(Products_1),
    Product_display=Display_full(Display_1),
    Transactions=Transactions_full(Transactions_1),
)

In [27]:
# Build the one-row frame directly from the record. DataFrame.append returned a new frame that
# was never assigned (so H_1 stayed empty), and the method was removed in pandas 2.0.
H_1 = pd.DataFrame([HH1])
H_1

Unnamed: 0,Household,Demographic,Day_redeemed,Campaign_redeemed,Campaigns,Products,Product_display,Transactions
0,1,"{'age': '65+', 'marital_status': 'A', 'income_...","{421: [10000085364, 51700010076], 427: [542000...","{8: {'Days': [421, 427], 'Coupons': [100000853...","{8: {'Type': 'TypeA', 'Start_day': 412, 'End_d...","{825123: {'Dept': 'GROCERY', 'Category': 'SALD...","{436: {9527290: {10: {'Display': '1', 'Mailer'...","{27601281299: {'Store': 436, 'Week': 8, 'Day':..."


# Creating multiple other household views

Creating a row for Household 2497

In [28]:
# Raw result sets for household 2497; each query result is a list of tuples
# Demographic info
Household_2497 = c.execute('''
SELECT * 
FROM HH_DEMOGRAPHIC
WHERE HOUSEHOLD_KEY = 2497''').fetchall()
# Coupon redemption info
Redeemed_2497 = c.execute('''SELECT *
FROM REDEEMED
WHERE HOUSEHOLD_KEY = 2497''').fetchall()
# Campaign info (campaigns sent to the household plus the days each one ran)
Campaign_2497 = c.execute('''
Select CT.HOUSEHOLD_KEY, CT.CAMPAIGN, CT.DESCRIPTION, CD.START_DAY, CD.END_DAY
FROM CAMPAIGN_TABLE AS CT
LEFT JOIN CAMPAIGN_DESC AS CD ON CD.CAMPAIGN = CT.CAMPAIGN
WHERE CT.HOUSEHOLD_KEY = 2497;''').fetchall()
# Product info (one row per transaction row)
Product_2497 = c.execute('''
SELECT T.PRODUCT_ID, P.COMMODITY_DESC, P.SUB_COMMODITY_DESC, P.MANUFACTURER, P.DEPARTMENT, P.BRAND
FROM TRANSACTION_DATA AS T 
LEFT JOIN PRODUCT AS P 
ON P.PRODUCT_ID = T.PRODUCT_ID
WHERE HOUSEHOLD_KEY = 2497''').fetchall()
# Display info (only transactions with a matching DISPLAY_DATA row survive the WHERE filter)
Display_2497 = c.execute('''
SELECT T.HOUSEHOLD_KEY, D.PRODUCT_ID, D.STORE_ID, D.WEEK_NO, D.DISPLAY, D.MAILER
FROM TRANSACTION_DATA AS T
LEFT JOIN DISPLAY_DATA AS D ON D.PRODUCT_ID = T.PRODUCT_ID AND D.STORE_ID = T.STORE_ID AND T.WEEK_NO = D.WEEK_NO
WHERE T.HOUSEHOLD_KEY = 2497 AND D.DISPLAY <> 0 AND D.MAILER <> 0''').fetchall()
# Transaction info
Transaction_2497 = c.execute('''
SELECT *
FROM TRANSACTION_DATA
WHERE HOUSEHOLD_KEY=2497
''').fetchall()

In [29]:
# Creating the dictionary for Household 2497
HH2497 = {'Household': 2497, 'Demographic':h_demo(Household_2497), 'Day_redeemed':Redeem_dict(Redeemed_2497), 
       'Campaign_redeemed':Redeemed_full(Redeemed_2497), 'Campaigns':Campaign_full(Campaign_2497), 
      'Products':Product_full(Product_2497), 'Product_display':Display_full(Display_2497), 
       'Transactions':Transactions_full(Transaction_2497)}
# Creating the single row dataframe. Built directly from the record: DataFrame.append returned a
# new frame that was never assigned (H_2497 stayed empty) and was removed in pandas 2.0.
H_2497 = pd.DataFrame([HH2497])
H_2497

Unnamed: 0,Household,Demographic,Day_redeemed,Campaign_redeemed,Campaigns,Products,Product_display,Transactions
0,2497,"{'age': '45-54', 'marital_status': 'U', 'incom...",{},{},"{8: {'Type': 'TypeA', 'Start_day': 412, 'End_d...","{838220: {'Dept': 'DRUG GM', 'Category': 'CIGA...","{339: {5569230: {12: {'Display': '7', 'Mailer'...","{27913035576: {'Store': 339, 'Week': 12, 'Day'..."


Creating a row for Household 1796

In [30]:
# Raw result sets for household 1796; each query result is a list of tuples
# Demographic info
Household_1796 = c.execute('''
SELECT * 
FROM HH_DEMOGRAPHIC
WHERE HOUSEHOLD_KEY = 1796''').fetchall()
# Coupon redemption info
Redeemed_1796 = c.execute('''SELECT *
FROM REDEEMED
WHERE HOUSEHOLD_KEY = 1796''').fetchall()
# Campaign info (campaigns sent to the household plus the days each one ran)
Campaign_1796 = c.execute('''
Select CT.HOUSEHOLD_KEY, CT.CAMPAIGN, CT.DESCRIPTION, CD.START_DAY, CD.END_DAY
FROM CAMPAIGN_TABLE AS CT
LEFT JOIN CAMPAIGN_DESC AS CD ON CD.CAMPAIGN = CT.CAMPAIGN
WHERE CT.HOUSEHOLD_KEY = 1796;''').fetchall()
# Product info (one row per transaction row)
Product_1796 = c.execute('''
SELECT T.PRODUCT_ID, P.COMMODITY_DESC, P.SUB_COMMODITY_DESC, P.MANUFACTURER, P.DEPARTMENT, P.BRAND
FROM TRANSACTION_DATA AS T 
LEFT JOIN PRODUCT AS P 
ON P.PRODUCT_ID = T.PRODUCT_ID
WHERE HOUSEHOLD_KEY = 1796''').fetchall()
# Display info (only transactions with a matching DISPLAY_DATA row survive the WHERE filter)
Display_1796 = c.execute('''
SELECT T.HOUSEHOLD_KEY, D.PRODUCT_ID, D.STORE_ID, D.WEEK_NO, D.DISPLAY, D.MAILER
FROM TRANSACTION_DATA AS T
LEFT JOIN DISPLAY_DATA AS D ON D.PRODUCT_ID = T.PRODUCT_ID AND D.STORE_ID = T.STORE_ID AND T.WEEK_NO = D.WEEK_NO
WHERE T.HOUSEHOLD_KEY = 1796 AND D.DISPLAY <> 0 AND D.MAILER <> 0''').fetchall()
# Transaction info
Transaction_1796 = c.execute('''
SELECT *
FROM TRANSACTION_DATA
WHERE HOUSEHOLD_KEY=1796
''').fetchall()

In [31]:
# Creating the dictionary for Household 1796
HH1796 = {'Household': 1796, 'Demographic':h_demo(Household_1796), 'Day_redeemed':Redeem_dict(Redeemed_1796), 
       'Campaign_redeemed':Redeemed_full(Redeemed_1796), 'Campaigns':Campaign_full(Campaign_1796), 
      'Products':Product_full(Product_1796), 'Product_display':Display_full(Display_1796), 
       'Transactions':Transactions_full(Transaction_1796)}
# Creating the single row dataframe. Built directly from the record: DataFrame.append returned a
# new frame that was never assigned (H_1796 stayed empty) and was removed in pandas 2.0.
H_1796 = pd.DataFrame([HH1796])
H_1796

Unnamed: 0,Household,Demographic,Day_redeemed,Campaign_redeemed,Campaigns,Products,Product_display,Transactions
0,1796,"{'age': '19-24', 'marital_status': 'U', 'incom...",{},{},"{18: {'Type': 'TypeA', 'Start_day': 587, 'End_...","{918046: {'Dept': 'DRUG GM', 'Category': 'CIGA...","{343: {913785: {15: {'Display': 'A', 'Mailer':...","{27829385838: {'Store': 343, 'Week': 11, 'Day'..."


Creating a row for Household 13

In [32]:
# Raw result sets for household 13; each query result is a list of tuples
# Demographic info
Household_13 = c.execute('''
SELECT * 
FROM HH_DEMOGRAPHIC
WHERE HOUSEHOLD_KEY = 13''').fetchall()
# Coupon redemption info
Redeemed_13 = c.execute('''SELECT *
FROM REDEEMED
WHERE HOUSEHOLD_KEY = 13''').fetchall()
# Campaign info (campaigns sent to the household plus the days each one ran)
Campaign_13 = c.execute('''
Select CT.HOUSEHOLD_KEY, CT.CAMPAIGN, CT.DESCRIPTION, CD.START_DAY, CD.END_DAY
FROM CAMPAIGN_TABLE AS CT
LEFT JOIN CAMPAIGN_DESC AS CD ON CD.CAMPAIGN = CT.CAMPAIGN
WHERE CT.HOUSEHOLD_KEY = 13;''').fetchall()
# Product info (one row per transaction row)
Product_13 = c.execute('''
SELECT T.PRODUCT_ID, P.COMMODITY_DESC, P.SUB_COMMODITY_DESC, P.MANUFACTURER, P.DEPARTMENT, P.BRAND
FROM TRANSACTION_DATA AS T 
LEFT JOIN PRODUCT AS P 
ON P.PRODUCT_ID = T.PRODUCT_ID
WHERE HOUSEHOLD_KEY = 13''').fetchall()
# Display info (only transactions with a matching DISPLAY_DATA row survive the WHERE filter)
Display_13 = c.execute('''
SELECT T.HOUSEHOLD_KEY, D.PRODUCT_ID, D.STORE_ID, D.WEEK_NO, D.DISPLAY, D.MAILER
FROM TRANSACTION_DATA AS T
LEFT JOIN DISPLAY_DATA AS D ON D.PRODUCT_ID = T.PRODUCT_ID AND D.STORE_ID = T.STORE_ID AND T.WEEK_NO = D.WEEK_NO
WHERE T.HOUSEHOLD_KEY = 13 AND D.DISPLAY <> 0 AND D.MAILER <> 0''').fetchall()
# Transaction info
Transaction_13 = c.execute('''
SELECT *
FROM TRANSACTION_DATA
WHERE HOUSEHOLD_KEY=13
''').fetchall()

In [33]:
# Creating the dictionary for Household 13
HH13 = {'Household': 13, 'Demographic':h_demo(Household_13), 'Day_redeemed':Redeem_dict(Redeemed_13), 
       'Campaign_redeemed':Redeemed_full(Redeemed_13), 'Campaigns':Campaign_full(Campaign_13), 
      'Products':Product_full(Product_13), 'Product_display':Display_full(Display_13), 
       'Transactions':Transactions_full(Transaction_13)}
# Creating the single row dataframe. Built directly from the record: DataFrame.append returned a
# new frame that was never assigned (H_13 stayed empty) and was removed in pandas 2.0.
H_13 = pd.DataFrame([HH13])
H_13

Unnamed: 0,Household,Demographic,Day_redeemed,Campaign_redeemed,Campaigns,Products,Product_display,Transactions
0,13,"{'age': '25-34', 'marital_status': 'U', 'incom...","{396: [53700048182], 424: [10000085364], 434: ...","{5: {'Days': [396], 'Coupons': [53700048182]},...","{8: {'Type': 'TypeA', 'Start_day': 412, 'End_d...","{942385: {'Dept': 'DRUG GM', 'Category': 'SPRI...","{323: {5569230: {15: {'Display': '2', 'Mailer'...","{28235291311: {'Store': 323, 'Week': 15, 'Day'..."


Creating a row for Household 2494

In [34]:
# Raw result sets for household 2494; each query result is a list of tuples
# Demographic info
Household_2494 = c.execute('''
SELECT * 
FROM HH_DEMOGRAPHIC
WHERE HOUSEHOLD_KEY = 2494''').fetchall()
# Coupon redemption info
Redeemed_2494 = c.execute('''SELECT *
FROM REDEEMED
WHERE HOUSEHOLD_KEY = 2494''').fetchall()
# Campaign info (campaigns sent to the household plus the days each one ran)
Campaign_2494 = c.execute('''
Select CT.HOUSEHOLD_KEY, CT.CAMPAIGN, CT.DESCRIPTION, CD.START_DAY, CD.END_DAY
FROM CAMPAIGN_TABLE AS CT
LEFT JOIN CAMPAIGN_DESC AS CD ON CD.CAMPAIGN = CT.CAMPAIGN
WHERE CT.HOUSEHOLD_KEY = 2494;''').fetchall()
# Product info (one row per transaction row)
Product_2494 = c.execute('''
SELECT T.PRODUCT_ID, P.COMMODITY_DESC, P.SUB_COMMODITY_DESC, P.MANUFACTURER, P.DEPARTMENT, P.BRAND
FROM TRANSACTION_DATA AS T 
LEFT JOIN PRODUCT AS P 
ON P.PRODUCT_ID = T.PRODUCT_ID
WHERE HOUSEHOLD_KEY = 2494''').fetchall()
# Display info (only transactions with a matching DISPLAY_DATA row survive the WHERE filter)
Display_2494 = c.execute('''
SELECT T.HOUSEHOLD_KEY, D.PRODUCT_ID, D.STORE_ID, D.WEEK_NO, D.DISPLAY, D.MAILER
FROM TRANSACTION_DATA AS T
LEFT JOIN DISPLAY_DATA AS D ON D.PRODUCT_ID = T.PRODUCT_ID AND D.STORE_ID = T.STORE_ID AND T.WEEK_NO = D.WEEK_NO
WHERE T.HOUSEHOLD_KEY = 2494 AND D.DISPLAY <> 0 AND D.MAILER <> 0''').fetchall()
# Transaction info
Transaction_2494 = c.execute('''
SELECT *
FROM TRANSACTION_DATA
WHERE HOUSEHOLD_KEY=2494
''').fetchall()

In [35]:
# Creating the dictionary for Household 2494
HH2494 = {'Household': 2494, 'Demographic':h_demo(Household_2494), 'Day_redeemed':Redeem_dict(Redeemed_2494), 
       'Campaign_redeemed':Redeemed_full(Redeemed_2494), 'Campaigns':Campaign_full(Campaign_2494), 
      'Products':Product_full(Product_2494), 'Product_display':Display_full(Display_2494), 
       'Transactions':Transactions_full(Transaction_2494)}
# Creating the single row dataframe. Built directly from the record: DataFrame.append returned a
# new frame that was never assigned (H_2494 stayed empty) and was removed in pandas 2.0.
H_2494 = pd.DataFrame([HH2494])
H_2494

Unnamed: 0,Household,Demographic,Day_redeemed,Campaign_redeemed,Campaigns,Products,Product_display,Transactions
0,2494,"{'age': '35-44', 'marital_status': 'U', 'incom...","{633: [51111120001, 54105800093], 635: [100000...","{18: {'Days': [633, 635], 'Coupons': [51111120...","{18: {'Type': 'TypeA', 'Start_day': 587, 'End_...","{825550: {'Dept': 'GROCERY', 'Category': 'DRY ...","{333: {849843: {18: {'Display': '2', 'Mailer':...","{28345214231: {'Store': 333, 'Week': 17, 'Day'..."


Creating a row for Household 1001

In [36]:
# Raw result sets for household 1001; each query result is a list of tuples
# Demographic info
Household_1001 = c.execute('''
SELECT * 
FROM HH_DEMOGRAPHIC
WHERE HOUSEHOLD_KEY = 1001''').fetchall()
# Coupon redemption info
Redeemed_1001 = c.execute('''SELECT *
FROM REDEEMED
WHERE HOUSEHOLD_KEY = 1001''').fetchall()
# Campaign info (campaigns sent to the household plus the days each one ran)
Campaign_1001 = c.execute('''
Select CT.HOUSEHOLD_KEY, CT.CAMPAIGN, CT.DESCRIPTION, CD.START_DAY, CD.END_DAY
FROM CAMPAIGN_TABLE AS CT
LEFT JOIN CAMPAIGN_DESC AS CD ON CD.CAMPAIGN = CT.CAMPAIGN
WHERE CT.HOUSEHOLD_KEY = 1001;''').fetchall()
# Product info (one row per transaction row)
Product_1001 = c.execute('''
SELECT T.PRODUCT_ID, P.COMMODITY_DESC, P.SUB_COMMODITY_DESC, P.MANUFACTURER, P.DEPARTMENT, P.BRAND
FROM TRANSACTION_DATA AS T 
LEFT JOIN PRODUCT AS P 
ON P.PRODUCT_ID = T.PRODUCT_ID
WHERE HOUSEHOLD_KEY = 1001''').fetchall()
# Display info (only transactions with a matching DISPLAY_DATA row survive the WHERE filter)
Display_1001 = c.execute('''
SELECT T.HOUSEHOLD_KEY, D.PRODUCT_ID, D.STORE_ID, D.WEEK_NO, D.DISPLAY, D.MAILER
FROM TRANSACTION_DATA AS T
LEFT JOIN DISPLAY_DATA AS D ON D.PRODUCT_ID = T.PRODUCT_ID AND D.STORE_ID = T.STORE_ID AND T.WEEK_NO = D.WEEK_NO
WHERE T.HOUSEHOLD_KEY = 1001 AND D.DISPLAY <> 0 AND D.MAILER <> 0''').fetchall()
# Transaction info
Transaction_1001 = c.execute('''
SELECT *
FROM TRANSACTION_DATA
WHERE HOUSEHOLD_KEY=1001
''').fetchall()

In [37]:
# Creating the dictionary for Household 1001
HH1001 = {'Household': 1001, 'Demographic':h_demo(Household_1001), 'Day_redeemed':Redeem_dict(Redeemed_1001), 
       'Campaign_redeemed':Redeemed_full(Redeemed_1001), 'Campaigns':Campaign_full(Campaign_1001), 
      'Products':Product_full(Product_1001), 'Product_display':Display_full(Display_1001), 
       'Transactions':Transactions_full(Transaction_1001)}
# Creating the single row dataframe. Built directly from the record: DataFrame.append returned a
# new frame that was never assigned (H_1001 stayed empty) and was removed in pandas 2.0.
H_1001 = pd.DataFrame([HH1001])
H_1001

Unnamed: 0,Household,Demographic,Day_redeemed,Campaign_redeemed,Campaigns,Products,Product_display,Transactions
0,1001,"{'age': '45-54', 'marital_status': 'U', 'incom...",{},{},"{30: {'Type': 'TypeA', 'Start_day': 323, 'End_...","{823862: {'Dept': 'DELI', 'Category': 'CHICKEN...","{31782: {1029743: {11: {'Display': '3', 'Maile...","{27164961199: {'Store': 31782, 'Week': 3, 'Day..."


Creating a row for Household 766

In [38]:
# Raw result sets for household 766; each query result is a list of tuples
# Demographic info
Household_766 = c.execute('''
SELECT * 
FROM HH_DEMOGRAPHIC
WHERE HOUSEHOLD_KEY = 766''').fetchall()
# Coupon redemption info
Redeemed_766 = c.execute('''SELECT *
FROM REDEEMED
WHERE HOUSEHOLD_KEY = 766''').fetchall()
# Campaign info (campaigns sent to the household plus the days each one ran)
Campaign_766 = c.execute('''
Select CT.HOUSEHOLD_KEY, CT.CAMPAIGN, CT.DESCRIPTION, CD.START_DAY, CD.END_DAY
FROM CAMPAIGN_TABLE AS CT
LEFT JOIN CAMPAIGN_DESC AS CD ON CD.CAMPAIGN = CT.CAMPAIGN
WHERE CT.HOUSEHOLD_KEY = 766;''').fetchall()
# Product info (one row per transaction row)
Product_766 = c.execute('''
SELECT T.PRODUCT_ID, P.COMMODITY_DESC, P.SUB_COMMODITY_DESC, P.MANUFACTURER, P.DEPARTMENT, P.BRAND
FROM TRANSACTION_DATA AS T 
LEFT JOIN PRODUCT AS P 
ON P.PRODUCT_ID = T.PRODUCT_ID
WHERE HOUSEHOLD_KEY = 766''').fetchall()
# Display info (only transactions with a matching DISPLAY_DATA row survive the WHERE filter)
Display_766 = c.execute('''
SELECT T.HOUSEHOLD_KEY, D.PRODUCT_ID, D.STORE_ID, D.WEEK_NO, D.DISPLAY, D.MAILER
FROM TRANSACTION_DATA AS T
LEFT JOIN DISPLAY_DATA AS D ON D.PRODUCT_ID = T.PRODUCT_ID AND D.STORE_ID = T.STORE_ID AND T.WEEK_NO = D.WEEK_NO
WHERE T.HOUSEHOLD_KEY = 766 AND D.DISPLAY <> 0 AND D.MAILER <> 0''').fetchall()
# Transaction info
Transaction_766 = c.execute('''
SELECT *
FROM TRANSACTION_DATA
WHERE HOUSEHOLD_KEY=766
''').fetchall()

In [39]:
# Creating the dictionary for Household 766
HH766 = {'Household': 766, 'Demographic':h_demo(Household_766), 'Day_redeemed':Redeem_dict(Redeemed_766), 
       'Campaign_redeemed':Redeemed_full(Redeemed_766), 'Campaigns':Campaign_full(Campaign_766), 
      'Products':Product_full(Product_766), 'Product_display':Display_full(Display_766), 
       'Transactions':Transactions_full(Transaction_766)}
# Creating the single row dataframe. Built directly from the record: DataFrame.append returned a
# new frame that was never assigned (H_766 stayed empty) and was removed in pandas 2.0.
H_766 = pd.DataFrame([HH766])
H_766

Unnamed: 0,Household,Demographic,Day_redeemed,Campaign_redeemed,Campaigns,Products,Product_display,Transactions
0,766,"{'age': '45-54', 'marital_status': 'A', 'incom...",{568: [53500000076]},"{16: {'Days': [568], 'Coupons': [53500000076]}}","{8: {'Type': 'TypeA', 'Start_day': 412, 'End_d...","{821324: {'Dept': 'GROCERY', 'Category': 'BAKI...","{322: {923149: {17: {'Display': '3', 'Mailer':...","{28345350537: {'Store': 322, 'Week': 17, 'Day'..."


At this point I've created a view of each household with a fixed number of features that will allow for the storage of all information related to each household in the data set regardless of the number of transactions, campaigns, or coupon redemptions associated with that household.

In [40]:
# Preview the first rows of the raw coupon frame (columns: COUPON_UPC, PRODUCT_ID, CAMPAIGN)
coupon.head()

Unnamed: 0,COUPON_UPC,PRODUCT_ID,CAMPAIGN
0,10000089061,27160,4
1,10000089064,27754,9
2,10000089073,28897,12
3,51800009050,28919,28
4,52100000076,28929,25


In [45]:
# Preview the first rows of the raw campaign_table frame (columns: DESCRIPTION, household_key, CAMPAIGN)
campaign_table.head()

Unnamed: 0,DESCRIPTION,household_key,CAMPAIGN
0,TypeA,17,26
1,TypeA,27,26
2,TypeA,212,26
3,TypeA,208,26
4,TypeA,192,26


In [46]:
# Preview the first rows of the raw transactions frame. This column order is what the
# TRANSACTION_DATA table was written with, so it is the order `SELECT *` returns and the order
# Transactions_full indexes by position.
transactions.head()

Unnamed: 0,household_key,BASKET_ID,DAY,PRODUCT_ID,QUANTITY,SALES_VALUE,STORE_ID,RETAIL_DISC,TRANS_TIME,WEEK_NO,COUPON_DISC,COUPON_MATCH_DISC
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [48]:
# Preview the first rows of the raw products frame (PRODUCT_ID, MANUFACTURER, DEPARTMENT, BRAND,
# COMMODITY_DESC, SUB_COMMODITY_DESC, CURR_SIZE_OF_PRODUCT)
products.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ
