In [2]:
# mount google drive if running in colab
try:
  from google.colab import drive
  drive.mount('/content/drive/')
  import sys
  sys.path.append('/content/drive/My Drive/Colab Notebooks/Starbucks_Udacity/src/utilities')
  %cd /content/drive/My Drive/Colab Notebooks/Starbucks_Udacity/notebooks/exploratory
except:
  pass

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/
/content/drive/My Drive/Colab Notebooks/Starbucks_Udacity/notebooks/exploratory


In [None]:
import numpy as np
import pandas as pd
import os
import joblib
import helper

In [None]:
# Raise the display limits so frames with up to 500 columns/rows are shown without truncation
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Load the three raw tables. lines=True reads each file as one JSON record per line;
# the paths are relative to the current working directory.
portfolio = pd.read_json('../../data/raw/portfolio.json', orient='records', lines=True)
profile = pd.read_json('../../data/raw/profile.json', orient='records', lines=True)
transcript = pd.read_json('../../data/raw/transcript.json', orient='records', lines=True)

## 1. Examining and reorganising portfolio data

In [None]:
portfolio.head()

In [None]:
# input dataframe and column names to return unique items per column
def uniques(df, column_names):
    """Print the distinct values of each requested column, sorted in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column_names : str or list of str
        Column label(s) to report on. A single label may be passed as a plain string.

    Returns
    -------
    None
        One line per column is printed as ``"<column>: [<values>]"``.
    """
    # A bare string would select a Series (which has no .columns), so wrap it in a list
    if isinstance(column_names, str):
        column_names = [column_names]

    for col in df[column_names].columns:
        unique_values = sorted(df[col].unique(), reverse=True)
        print("{}: {}".format(col, unique_values))

In [None]:
uniques(portfolio, ['difficulty', 'duration', 'offer_type', 'reward'])
print("channel: '[email, mobile, social, web]'")

In [None]:
# One-hot encode the channel lists; the email flag is dropped because, per the original
# note, it is common to all offers and so carries no information
portw = (
    portfolio
    .join(portfolio['channels'].str.join('|').str.get_dummies())
    .drop(columns=['channels', 'email'])
)

In [None]:
# Put the identifier first, followed by the offer terms, the offer type and the channel flags
portw = portw.loc[:, ['id', 'difficulty', 'reward', 'duration',
                      'offer_type', 'mobile', 'web', 'social']]

In [None]:
# One-hot encode offer_type, then discard the original text column
portw = portw.join(pd.get_dummies(portw['offer_type'])).drop(columns='offer_type')

In [None]:
# Order offers by their expected effect: difficulty, then reward, then duration (largest first)
portw = portw.sort_values(by=['difficulty', 'reward', 'duration'], ascending=False)
portw

In [None]:
# mapping offer id to simpler format (letters a-j, following the sort order above)
# NOTE: `id` shadows the Python builtin, but the name is kept because a later cell reuses
# this list of raw offer ids to relabel the transcript. Run this cell only once per kernel:
# after the mapping, portw['id'] holds letters and the raw ids can no longer be captured.
id = list(portw['id'])
portw['id'] = portw['id'].map(dict(zip(id, 'abcdefghij')))
# Keep the renumbered index instead of only displaying it, as wrangle_portfolio does
portw = portw.reset_index(drop=True)
portw

* ### Script to wrangle raw portfolio data

In [None]:
def wrangle_portfolio(portfolio, save=None):
    '''
    Wrangle and preprocess portfolio data into usable format

    Parameters
    ----------
    portfolio : pandas.DataFrame
        Raw offer table with the columns `id`, `difficulty`, `reward`, `duration`,
        `offer_type` and `channels`. `channels` is expected to hold a list of channel
        names per row, and the channels must include email, mobile, web and social.
    save : optional
        When truthy, the result is also written to ../../data/interim/portw1.csv
        (the directory is created if needed).

    Returns
    -------
    pandas.DataFrame
        One row per offer, sorted by difficulty, reward and duration (descending), with
        one-hot channel and offer-type columns and `id` relabelled to letters starting at 'a'.
    '''
    dirName = '../../data/interim'

    # One-hot encode the channel lists; email is dropped together with the raw column
    portw = portfolio.join(portfolio['channels'].str.join('|').str.get_dummies())
    portw = portw.drop(columns=['channels', 'email'])
    portw = portw[['id', 'difficulty', 'reward', 'duration', 'offer_type', 'mobile', 'web', 'social']]

    # One-hot encode the offer type and drop the original text column
    portw = portw.join(pd.get_dummies(portw['offer_type'])).drop(columns='offer_type')
    portw = portw.sort_values(['difficulty', 'reward', 'duration'], ascending=False)

    # Relabel the raw ids as letters in sorted order. Only ten letters are available,
    # so any offer beyond the tenth would be mapped to NaN.
    offer_letters = dict(zip(portw['id'], 'abcdefghij'))
    portw['id'] = portw['id'].map(offer_letters)
    portw = portw.reset_index(drop=True)

    if save:
        # makedirs also creates missing parent folders (os.mkdir would raise instead)
        if not os.path.isdir(dirName):
            os.makedirs(dirName, exist_ok=True)
            print("Directory " , dirName ,  " Created ")

        portw.to_csv(dirName + '/portw1.csv', index=False)
        print('saved as {}'.format(dirName + '/portw1.csv'))

    return portw

In [None]:
wrangle_portfolio(portfolio, save=True)

## 2. Examining and reorganising profile and transcript data

In [None]:
profile.head()

In [None]:
# One-hot encode gender; a missing gender gets no dummy column, so those rows are all
# zeros and act as the baseline
profilec = profile.join(pd.get_dummies(profile['gender'])).drop(columns='gender')

In [None]:
# Parse the YYYYMMDD membership date into a proper datetime column
profilec['became_member_on'] = pd.to_datetime(profilec['became_member_on'], format='%Y%m%d')

In [None]:
# Keep the identifier first, then the demographics, then the gender flags
profilec = profilec.loc[:, ['id', 'age', 'income', 'became_member_on', 'F', 'M', 'O']]

In [None]:
# Rename id to person so it lines up with the key column used in the transcript
profilec = profilec.rename(columns={'id': 'person'})

In [None]:
profilec.head(5)

In [None]:
# Attach each customer's profile to their transcript events.
# The default inner join keeps only events whose person appears in profilec.
tranc = pd.merge(transcript, profilec, on='person')
tranc.head()

In [None]:
# Expand the per-row value dicts into their own columns (one per key), then drop the raw column
tranc = tranc.join(pd.DataFrame(tranc['value'].tolist())).drop(columns='value')

In [None]:
tranc.head()

In [None]:
# Blank out missing values in both spellings of the offer id column so they can be concatenated
tranc = tranc.fillna({'offer id': "", 'offer_id': ""})

In [None]:
# The raw data spells the key both as 'offer id' and 'offer_id'; gluing the two
# (blank-filled) columns together yields a single offer id per row
tranc['offer_id'] = tranc['offer id'].map(str).str.cat(tranc['offer_id'].map(str))

In [None]:
# The space-separated spelling is now redundant
tranc = tranc.drop(columns='offer id')

In [None]:
tranc.head()

In [None]:
# Relabel raw offer ids with the same letters used in portw.
# `id` is the list of raw offer ids captured in the portfolio section above;
# values that are not in that list become NaN.
tranc['offer_id'] = tranc['offer_id'].map(dict(zip(id, 'abcdefghij')))

In [None]:
# Give the columns clearer titles; `reward` becomes `rewarded` so it will not clash with
# the portfolio's own `reward` column when the two tables are merged
tranc = tranc.rename(columns={
    'offer_id': 'id',
    'reward': 'rewarded',
    'became_member_on': 'signed_up',
})

In [None]:
tranc.head()

In [None]:
# Attach the offer attributes; a left join keeps the rows that have no matching offer id
tranc = pd.merge(tranc, portw, how='left', on='id')

In [None]:
tranc.head(10)

In [None]:
# Filling all NaNs as zeros
# This is a blanket fill over every column, so the numeric 0 is also written into any
# non-numeric column that has gaps. Zero incomes are turned back into NaN in a later cell.
tranc = tranc.fillna(value=0)

In [None]:
# Replacing zero income back to NaN - XGBoost algorithm will be able to handle nans.
# Assign the result back to the column: an inplace replace on `tranc.income` works on a
# temporary Series and is not guaranteed to update tranc (it does not under Copy-on-Write).
tranc['income'] = tranc['income'].replace({0: np.nan})

In [None]:
# Age of 118 is likely an error where birthdate has been set to 1900, therefore replace age of 118 with NaN.
# Assign the result back to the column: an inplace replace on `tranc.age` works on a
# temporary Series and is not guaranteed to update tranc (it does not under Copy-on-Write).
tranc['age'] = tranc['age'].replace({118: np.nan})

In [None]:
tranc.head(10)

In [None]:
# Running total of amount per person, accumulated in the current row order
tranc['cum_amount'] = tranc.groupby('person')['amount'].cumsum()

In [None]:
tranc.head()

In [None]:
# Store event as an ordered categorical: received < viewed < completed < transaction
tranc['event'] = pd.Categorical(
    tranc['event'],
    categories=['offer received', 'offer viewed', 'offer completed', 'transaction'],
    ordered=True,
)

In [None]:
# Build a per-customer offer key by appending the offer label to the person id
# (a first attempt at a unique offer_id)
tranc['offer_id'] = tranc['person'] + tranc['id'].astype(str)

In [None]:
# Arrange columns as: keys, event info, customer attributes, then transaction/offer details
tranc = tranc.loc[:, [
    'offer_id', 'person', 'event', 'time', 'age', 'income', 'signed_up', 'F', 'M', 'O',
    'amount', 'id', 'rewarded', 'difficulty', 'reward', 'duration', 'mobile', 'web',
    'social', 'bogo', 'discount', 'informational', 'cum_amount',
]]

In [None]:
# Combine offer_id with the event name to find instances of multiple similar offers
tranc['offer_multi'] = tranc['offer_id'] + tranc['event'].astype(str)

In [None]:
# Count each offer_multi key and look at the ones containing "offer" - this confirmed that
# multiple offers of the same type can be applied to the same customer
multi_counts = tranc['offer_multi'].value_counts()
multi_counts[multi_counts.index.str.contains('offer')].head()

In [None]:
# Adding a numerical suffix to distinguish a repeated similar offer for the same person.
# cumcount() numbers the rows of each offer_multi group 0, 1, 2, ... in order of appearance
# and keeps the original row index, so the result lines up with tranc on every pandas
# version (groupby.apply returns a group-keyed index on newer releases).
# Idea adapted from:
# https://stackoverflow.com/questions/27806825/how-to-modify-duplicated-rows-in-python-pandas
tranc['offer_multi_correction'] = (
    tranc['offer_id'] + (tranc.groupby('offer_multi').cumcount() + 1).astype(str)
)

In [None]:
# Replace offer_id with the suffixed, now-unique version
tranc['offer_id'] = tranc['offer_multi_correction']

In [None]:
# The two helper columns have served their purpose
tranc = tranc.drop(columns=['offer_multi', 'offer_multi_correction'])

In [None]:
# joined = whole days between each sign-up date and the latest sign-up date in the data
# (zero for the most recent member, negative for everyone earlier)
tranc['joined'] = tranc['signed_up'].sub(tranc['signed_up'].max()).dt.days

In [None]:
tranc.head(100)

In [None]:
# Add one indicator column per event category
tranc = tranc.join(pd.get_dummies(tranc['event']))

In [None]:
# converting id to categorical
# Rows without an offer hold the integer 0 after the blanket fillna, which does not match
# the string category '0' and would silently become NaN; cast to str first so that those
# rows land in the '0' category as intended.
tranc['id'] = pd.Categorical(
    tranc['id'].astype(str),
    categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', '0'],
    ordered=True,
)

In [None]:
tranc.head(10)

* ### Script to wrangle raw profile and transcript data

In [None]:
def wrangle_portfolio(portfolio, profile, transcript, save=None):
    '''
    Wrangle and preprocess profile and transcript data into usable format

    NOTE(review): this definition reuses the name of the portfolio-only function defined
    earlier in the notebook and replaces it once this cell has run. The name is kept so
    existing calls keep working; consider giving it a distinct name.

    Parameters
    ----------
    portfolio : pandas.DataFrame
        Raw offer table with `id`, `difficulty`, `reward`, `duration`, `offer_type` and
        `channels`. `channels` is expected to hold a list of channel names per row, and
        the channels must include email, mobile, web and social.
    profile : pandas.DataFrame
        Raw customer table with `id`, `gender`, `age`, `income` and `became_member_on`
        (YYYYMMDD). The genders F, M and O must all occur.
    transcript : pandas.DataFrame
        Raw event log with `person`, `event`, `time` and `value`. `value` holds one dict
        per row whose keys become the columns 'offer id', 'offer_id', 'amount' and 'reward'.
    save : str or bool, optional
        When truthy, the offer table is written to ../../data/interim/portw.csv and the
        result is dumped with joblib into the same folder. A string is used as the joblib
        file name; any other truthy value falls back to 'tranc.joblib'.

    Returns
    -------
    pandas.DataFrame
        One row per transcript event, enriched with customer and offer attributes.
    '''
    interim_dir = '../../data/interim'

    # ---- offers: one-hot channels and offer type, relabel ids as letters ----
    portw = portfolio.join(portfolio['channels'].str.join('|').str.get_dummies())
    portw = portw.drop(columns=['channels', 'email'])
    portw = portw[['id', 'difficulty', 'reward', 'duration', 'offer_type', 'mobile', 'web', 'social']]
    portw = portw.join(pd.get_dummies(portw['offer_type'])).drop(columns='offer_type')
    portw = portw.sort_values(['difficulty', 'reward', 'duration'], ascending=False)
    # Only ten letters are available, so any offer beyond the tenth would map to NaN
    offer_letters = dict(zip(portw['id'], 'abcdefghij'))
    portw['id'] = portw['id'].map(offer_letters)
    portw = portw.reset_index(drop=True)

    if save:
        # makedirs also creates missing parent folders (os.mkdir would raise instead)
        if not os.path.isdir(interim_dir):
            os.makedirs(interim_dir, exist_ok=True)
            print("Directory " , interim_dir ,  " Created ")

        portw.to_csv(interim_dir + '/portw.csv', index=False)
        print('saved as {}'.format(interim_dir + '/portw.csv'))

    # ---- customers: gender flags (missing gender = all zeros), parsed sign-up date ----
    profilec = profile.join(pd.get_dummies(profile['gender'])).drop(columns='gender')
    profilec['became_member_on'] = pd.to_datetime(profilec['became_member_on'], format='%Y%m%d')
    profilec = profilec[['id', 'age', 'income', 'became_member_on', 'F', 'M', 'O']]
    profilec = profilec.rename(columns={'id': 'person'})

    # ---- events: attach customers, expand the value dicts into columns ----
    tranc = transcript.merge(profilec, on='person')
    tranc = tranc.join(pd.DataFrame(list(tranc['value']))).drop(columns='value')

    # The raw data spells the key both as 'offer id' and 'offer_id'; merge them into one column
    tranc['offer id'] = tranc['offer id'].fillna(value="")
    tranc['offer_id'] = tranc['offer_id'].fillna(value="")
    tranc['offer_id'] = tranc['offer id'].map(str) + tranc['offer_id'].map(str)
    tranc = tranc.drop(columns='offer id')

    # Relabel with the same letters as portw; rows without an offer become NaN here
    tranc['offer_id'] = tranc['offer_id'].map(offer_letters)
    tranc = tranc.rename(columns={'offer_id': 'id', 'reward': 'rewarded', 'became_member_on': 'signed_up'})
    tranc = tranc.merge(portw, how='left', on='id')

    # Blanket zero fill, then restore NaN where zero is not a real value.
    # Results are assigned back because an inplace replace on `tranc.income` / `tranc.age`
    # works on a temporary Series and is not guaranteed to update the frame.
    tranc = tranc.fillna(value=0)
    tranc['income'] = tranc['income'].replace({0: np.nan})
    # 118 is treated as a placeholder for an unknown age
    tranc['age'] = tranc['age'].replace({118: np.nan})

    tranc['cum_amount'] = tranc.groupby('person')['amount'].cumsum()
    tranc['event'] = pd.Categorical(
        tranc['event'],
        categories=['offer received', 'offer viewed', 'offer completed', 'transaction'],
        ordered=True,
    )
    tranc['offer_id'] = tranc['person'] + tranc['id'].astype(str)
    # .copy() makes the column subset an independent frame before new columns are added
    tranc = tranc[['offer_id', 'person', 'event', 'time', 'age', 'income', 'signed_up', 'F', 'M', 'O',
                   'amount', 'id', 'rewarded', 'difficulty', 'reward', 'duration', 'mobile', 'web',
                   'social', 'bogo', 'discount', 'informational', 'cum_amount']].copy()

    # Make offer_id unique: number repeated (person, offer, event) combinations 1, 2, ...
    # cumcount() keeps the original row index, so it aligns on every pandas version
    # (groupby.apply returns a group-keyed index on newer releases).
    offer_multi = tranc['offer_id'] + tranc['event'].astype(str)
    occurrence = offer_multi.groupby(offer_multi).cumcount() + 1
    tranc['offer_id'] = tranc['offer_id'] + occurrence.astype(str)

    # Whole days relative to the latest sign-up date in the data (zero or negative)
    tranc['joined'] = (tranc['signed_up'] - tranc['signed_up'].max()).dt.days

    # One indicator column per event category, with plain string column labels
    event_flags = pd.get_dummies(tranc['event'])
    event_flags.columns = event_flags.columns.astype(str)
    tranc = tranc.join(event_flags)

    # Rows without an offer hold the integer 0 after the zero fill; cast to str so they
    # match the '0' category instead of silently turning into NaN
    tranc['id'] = pd.Categorical(
        tranc['id'].astype(str),
        categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', '0'],
        ordered=True,
    )

    if save:
        # The interim directory already exists: it was created before portw.csv was written
        file_name = save if isinstance(save, str) else 'tranc.joblib'
        joblib.dump(tranc, interim_dir + '/' + file_name, compress=True)
        print('saved as {}'.format(interim_dir + '/' + file_name))

    return tranc

In [None]:
tranc = wrangle_portfolio(portfolio, profile, transcript, save='tranc.joblib')

In [None]:
tranc