# Loading Datasets

In [580]:
import pandas as pd
import json
import copy
import numpy as np

import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots

In [581]:
portfolio_df = pd.read_csv('../Datasets/portfolio.csv', index_col=0)
profile_df = pd.read_csv('../Datasets/profile.csv', index_col=0)
transcript_df = pd.read_csv('../Datasets/transcript.csv', index_col=0)

In [582]:
portfolio_df.info()
portfolio_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reward      10 non-null     int64 
 1   channels    10 non-null     object
 2   difficulty  10 non-null     int64 
 3   duration    10 non-null     int64 
 4   offer_type  10 non-null     object
 5   id          10 non-null     object
dtypes: int64(3), object(3)
memory usage: 560.0+ bytes


Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"['email', 'mobile', 'social']",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"['web', 'email', 'mobile', 'social']",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"['web', 'email', 'mobile']",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"['web', 'email', 'mobile']",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"['web', 'email']",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [583]:
# columns gender and income has null values, we will see how to handle them later
profile_df.info()
profile_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               17000 non-null  int64  
 2   id                17000 non-null  object 
 3   became_member_on  17000 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 796.9+ KB


Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [584]:
transcript_df.info()
transcript_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   person  306534 non-null  object
 1   event   306534 non-null  object
 2   value   306534 non-null  object
 3   time    306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 11.7+ MB


Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


# Merging datasets

In [585]:
# checking for duplicated profile information
print("Are there any duplicated profiles in profile dataframe?", profile_df.duplicated('id').any())

Are there any duplicated profiles in profile dataframe? False


In [586]:
transcript_profile_df = pd.merge(transcript_df, profile_df, left_on='person', right_on='id', how='outer')

In [587]:
value_count_for_value = {}

for item in transcript_profile_df['value']:
    data = json.loads(item.replace("'", '"'))
    for key in data.keys():
        if key in value_count_for_value:
            value_count_for_value[key] += 1
        else:
            value_count_for_value[key] = 1

print("Types of values in value column of dataframe", value_count_for_value)

"""
Looking at a csv dataset record, "12672,fe97aa22dd3e48c8b143116a8403dd52,offer completed,"{'offer_id': 'fafdcd668e3743c1bb461111dcafc2a4', 'reward': 2}",0"
difference between 'offer_id' and 'offer id' is that when there is a offer completed, it will be 'offer_id' and a reward, instead of 'offer id'.
Let us validate that below
"""

mapping_flag = True
for index, record in transcript_profile_df.iterrows():
    value = record['value']
    event = record['event']
    if 'offer_id' in value:
        if event == 'offer completed':
            pass
        else:
            print("Invalidated. 'offer_id' is not a 1-1 mapping with offer completed event.", record)
            mapping_flag = False
            break

if mapping_flag:
    print("Validated. 'offer_id' is a 1-1 mapping with offer completed event.")

Types of values in value column of dataframe {'offer id': 134002, 'amount': 138953, 'offer_id': 33579, 'reward': 33579}
Validated. 'offer_id' is a 1-1 mapping with offer completed event.


In [588]:
def extract_values(x):
    data = json.loads(x.replace("'", '"'))
    
    offer_id = data.get('offer id')
    transaction_amount = data.get('amount')
    reward_awarded = data.get('reward')

    if reward_awarded:
        offer_id = data.get('offer_id')

    return offer_id, transaction_amount, reward_awarded

transcript_profile_df[['offer_id', 'transaction_amount', 'reward_awarded']] = transcript_profile_df['value'].apply(lambda x: extract_values(x)).apply(pd.Series)

In [589]:
full_df = pd.merge(transcript_profile_df, portfolio_df, left_on='offer_id', right_on='id', how='outer')

# Data Cleaning and Transformation

In [590]:
#dropping columns because id_x is person id and id_y is offer_id
pre_clean_df = copy.deepcopy(full_df)
pre_clean_df = pre_clean_df.drop(['id_x', 'id_y', 'value'], axis=1)

# converting became_member_on to date time format
pre_clean_df['became_member_on'] = pd.to_datetime(pre_clean_df['became_member_on'], format='%Y%m%d')
profile_df['became_member_on'] = pd.to_datetime(profile_df['became_member_on'], format='%Y%m%d')

In [591]:
pre_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306534 entries, 0 to 306533
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   person              306534 non-null  object        
 1   event               306534 non-null  object        
 2   time                306534 non-null  int64         
 3   gender              272762 non-null  object        
 4   age                 306534 non-null  int64         
 5   became_member_on    306534 non-null  datetime64[ns]
 6   income              272762 non-null  float64       
 7   offer_id            167581 non-null  object        
 8   transaction_amount  138953 non-null  float64       
 9   reward_awarded      33579 non-null   float64       
 10  reward              167581 non-null  float64       
 11  channels            167581 non-null  object        
 12  difficulty          167581 non-null  float64       
 13  duration            167581 no

In [592]:
#handling null in gender and income columns

#is gender null when income is null?
gender_null_df = pre_clean_df[pre_clean_df['gender'].isnull()]
both_null = gender_null_df['income'].isnull().all()

print("All income is null when gender is null?", both_null)
print("How many null rows?", len(gender_null_df))
print("How many percentage of null gender and income contributes to all transactions?", round(len(gender_null_df)/len(pre_clean_df)*100, 2))
print("How many percentage of customers out of ALL customers did not indicate gender?", round(len(profile_df[profile_df["gender"].isnull()])/len(profile_df)*100, 2))

All income is null when gender is null? True
How many null rows? 33772
How many percentage of null gender and income contributes to all transactions? 11.02
How many percentage of customers out of ALL customers did not indicate gender? 12.79


Since income and gender are crucial in developing an accurate profile of a person, we could either:
1. Remove every row with income and gender is null since 11% of transactions removed should not cause much lost in information
2. Remove gender and income columns entirely and replace them with binary variables that indicates if the user provided these information (eg. 0 if not provided)
3. Create 2 datasets, one with and one without gender and income and conduct separate analysis on each of them

There is no additional information that would allow us to accurately determine the income of those who did not indicate it, hence it would be best to remove
the income column. As for gender, we can create a new variable "Unknown" for those who do not indicate it. However, since gender is null when income is null,
we will remove all rows where gender and income is null.

In [593]:
pre_clean_df = pre_clean_df[pre_clean_df['income'].notna()]
cleaned_profile_df = profile_df[profile_df['income'].notna()]

# changing string list to a list
portfolio_df['channels'] = portfolio_df['channels'].apply(lambda x: eval(x))

# giving a easily identifiable name for each offer type, instead of using offer id. Offer name will now represent details of the offer in this order:
# offer_type, duration, difficulty, reward
portfolio_df['new_id'] = portfolio_df.apply(lambda row: f"{row['offer_type']}_{row['duration']}_{row['difficulty']}_{row['reward']}", axis=1)

cleaned_portfolio_df = copy.deepcopy(portfolio_df)

In [594]:
# using new naming convnetion for offer type naming
merged_df = pd.merge(pre_clean_df, cleaned_portfolio_df[['id', 'new_id']], left_on='offer_id', right_on='id', how='left')
merged_df.drop(['offer_id', 'id'], axis=1, inplace=True)

cleaned_portfolio_df['id'] = cleaned_portfolio_df['new_id']
cleaned_portfolio_df.drop(['new_id'], axis=1, inplace=True)

merged_df.rename(columns={'new_id': 'offer_id'}, inplace=True)
pre_clean_df = merged_df 
pre_clean_df.head()


Unnamed: 0,person,event,time,gender,age,became_member_on,income,transaction_amount,reward_awarded,reward,channels,difficulty,duration,offer_type,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,F,75,2017-05-09,100000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
1,78afa995795e4d85b5d9ceeca43f5fef,offer viewed,6,F,75,2017-05-09,100000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
2,78afa995795e4d85b5d9ceeca43f5fef,offer completed,132,F,75,2017-05-09,100000.0,,5.0,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
3,e2127556f4f64592b11af22de27a7932,offer received,408,M,68,2018-04-26,70000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
4,e2127556f4f64592b11af22de27a7932,offer viewed,420,M,68,2018-04-26,70000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5


In [595]:
#new cleaned df

cleaned_df = copy.deepcopy(pre_clean_df)

# Exploratory Data Analysis

In [596]:
cleaned_profile_df.head()

Unnamed: 0,gender,age,id,became_member_on,income
1,F,55,0610b486422d4921ae7d2bf64640c50b,2017-07-15,112000.0
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,100000.0
5,M,68,e2127556f4f64592b11af22de27a7932,2018-04-26,70000.0
8,M,65,389bc3fa690240e798340f5a15918d5c,2018-02-09,53000.0
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,51000.0


In [597]:
cleaned_df.head()

Unnamed: 0,person,event,time,gender,age,became_member_on,income,transaction_amount,reward_awarded,reward,channels,difficulty,duration,offer_type,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,F,75,2017-05-09,100000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
1,78afa995795e4d85b5d9ceeca43f5fef,offer viewed,6,F,75,2017-05-09,100000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
2,78afa995795e4d85b5d9ceeca43f5fef,offer completed,132,F,75,2017-05-09,100000.0,,5.0,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
3,e2127556f4f64592b11af22de27a7932,offer received,408,M,68,2018-04-26,70000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
4,e2127556f4f64592b11af22de27a7932,offer viewed,420,M,68,2018-04-26,70000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5


In [598]:
cleaned_portfolio_df

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,bogo_7_10_10
1,10,"[web, email, mobile, social]",10,5,bogo,bogo_5_10_10
2,0,"[web, email, mobile]",0,4,informational,informational_4_0_0
3,5,"[web, email, mobile]",5,7,bogo,bogo_7_5_5
4,5,"[web, email]",20,10,discount,discount_10_20_5
5,3,"[web, email, mobile, social]",7,7,discount,discount_7_7_3
6,2,"[web, email, mobile, social]",10,10,discount,discount_10_10_2
7,0,"[email, mobile, social]",0,3,informational,informational_3_0_0
8,5,"[web, email, mobile, social]",5,5,bogo,bogo_5_5_5
9,2,"[web, email, mobile]",10,7,discount,discount_7_10_2


In [599]:
# template for graphs

graph_custom_template = dict(
    layout=go.Layout(
        title_font=dict(size=24),
        xaxis=dict(
            title_font=dict(size=18),
            showgrid=True,
            gridwidth=1,
            showline=True,
            linecolor='black',
            linewidth=2,
            gridcolor='Grey'
        ),
        yaxis=dict(
            title_font=dict(size=18),
            showgrid=True,
            gridwidth=1,
            showline=True,
            linecolor='black',
            linewidth=2,
            gridcolor='Grey'
        )
    )
)

## Univariate & Bivariate Analysis

### Gender Analysis

In [600]:
# gender distribution plot
gender_counts = cleaned_profile_df['gender'].value_counts()
gender_data = go.Pie(labels=gender_counts.index, values=gender_counts.values)

fig = go.Figure(data=[gender_data])
fig.update_layout(title_text='Gender Distribution', template=graph_custom_template, width = 400, height=400)
fig.show()

**Insights:**

Purchase amount during the period are mainly by male and females with males contributing to 20%~ more purchases than females. A small percentaage of 1.5%~ have others as their gender.

### Age Analysis

In [601]:
# age distribution plot
fig = go.Figure(data=[go.Histogram(
    x=cleaned_profile_df['age'],
    marker=dict(
        color='rgba(0, 123, 255, 0.7)',
        line=dict(
            color='rgba(0, 123, 255, 1)',
            width=2  
        )
    )
)])

fig.update_layout(
    title_text='Age Distribution',
    xaxis_title_text='Age', 
    yaxis_title_text='Count',
    template=graph_custom_template,
    bargap=0.2,
    bargroupgap=0.1
)

fig.show()

bin_size = 10
bins = np.arange(10, cleaned_profile_df['age'].max() + bin_size, bin_size)
hist, bin_edges = np.histogram(cleaned_profile_df['age'], bins=bins)

percentages = (hist / hist.sum()) * 100
bin_labels = [f"{edge} - {edge+bin_size}" for edge in bins[:-1]]


fig = go.Figure(data=go.Bar(
    x=bin_labels,
    y=percentages,
    text=np.round(percentages, 2),
    textposition='outside',
    marker=dict(
        color='rgba(0, 123, 255, 0.7)',
        line=dict(
            color='rgba(0, 123, 255, 1)',
            width=2  
        )
    )
))

fig.update_layout(
    title_text='Age Distribution with Percentage Contribution',
    xaxis_title_text='Age Bins',
    yaxis_title_text='Percentage of Total',
    template=graph_custom_template,
)

fig.show()


**Insights:**

Customers aged 10 to 70 make up 80% Starbucks customer base.
Customers aged 50 to 60 make up 23.89% of Starbucks customer base which is the most, compared to other age groups. This is followed by age group 60 to 70 which makes up 20.18% and 40 to 50, which makes up 15.58%.

### Income Analysis

In [602]:
fig = go.Figure(data=[go.Histogram(
    x=cleaned_profile_df['income'],
    marker=dict(
        color='rgba(0, 123, 255, 0.7)',
        line=dict(
            color='rgba(0, 123, 255, 1)',
            width=2  
        )
    )
)])

fig.update_layout(
    title_text='Income Distribution',
    xaxis_title_text='Income', 
    yaxis_title_text='Count',  
    template=graph_custom_template,
    bargap=0.2,
    bargroupgap=0.1
)

fig.show()

**Insights:**

The most common income range among the members is between $50k and $70k as seen from the tallest bars being within that income range. As income increases beyond the $70k mark, the number of membes within each income range decreases.

Lowest income members: On the left of the graph, there's a sudden jump in member Count starting from around $30k, which suggests that there are relatively fewer members with income lower than $30k.

### Age VS Income Analysis

In [603]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Histogram2dContour(
    x=cleaned_profile_df['age'], 
    y=cleaned_profile_df['income'],
    colorscale='sunsetdark',
    contours=dict(
        showlabels=True,
        labelfont=dict(
            size=12,
            color='black',
        )
    )
))

fig.update_layout(
    title_text='Income-Age Density Distribution',
    xaxis_title_text='Age', 
    yaxis_title_text='Income',
    template=graph_custom_template
)

fig.show()


**Insights on the relationship between Income and Age:**

The highest data concentration seems to be within the age range of roughly 45 to 65. This is indicated by the densest contour lines labelled with higher numbers like 120 to 160, which that the region enclosed by the line contains 120 to 160 data points respectively.

The common income range for this densest age group is around 60k to 80k as this is where the contours are closest together. As the age increases beyond 65 or decreases below 45, the density of data points decreases, as indicated by the wider spacing between contour lines.

Younger Age Group: There's a lower density of data points those under 45, which is indicated by the more sparse contour lines in the left area.

High-Income Earners: There are individuals in the higher income brackets (above 100k), especially in the age range of 55 to 65.

There appears to be little relationship between age and income as there no specific trends to the contours. However, the highest income levels do seem to tbe clustered around the middle aged group.

### Age VS Gender Analysis

In [604]:
fig1 = go.Figure()
fig1.add_trace(go.Box(x=cleaned_profile_df['gender'], y=cleaned_profile_df['age'], boxmean='sd'))
fig1.update_layout(
    title='Age Distribution by Gender',
    xaxis_title='Gender',
    yaxis_title='Age',
    template= graph_custom_template
)
fig1.show()

avg_age_by_gender = cleaned_profile_df.groupby('gender')['age'].mean().reset_index()
print("Average Age by Gender")
print(avg_age_by_gender)

Average Age by Gender
  gender        age
0      F  57.544950
1      M  52.116690
2      O  54.400943


**Insights:**

The average ages of female, male and other members are 57, 52 and 54 respectively. The spread of male members are wider than others and females which indicate a wider variety of male ages. All 3 genders follow a somewhat normal distribution as seen by their first and third quartile and median centered symmetrically around the mean.

There is also a presence of a outlier in the female gender of age 101. However, it does not skew the distribution by much.

### Membership Duration Analysis

In [605]:
cleaned_profile_df['became_member_on'] = pd.to_datetime(cleaned_profile_df['became_member_on'])
cleaned_profile_df['membership_year'] = cleaned_profile_df['became_member_on'].dt.year

# Membership trend over time
memberships_by_year = cleaned_profile_df.groupby('membership_year').size()
fig1 = go.Figure(data=[
    go.Bar(x=memberships_by_year.index, y=memberships_by_year.values)
])
fig1.update_layout(title='New Memberships per Year',
                   xaxis_title='Year',
                   yaxis_title='Count of New Memberships',
                   template= graph_custom_template)
fig1.show()

# Box plot for income by membership year
fig2 = go.Figure()
for year in sorted(cleaned_profile_df['membership_year'].unique()):
    fig2.add_trace(go.Box(
        y=cleaned_profile_df[cleaned_profile_df['membership_year'] == year]['income'],
        name=str(year),
        boxmean='sd'
    ))
fig2.update_layout(title='Income Distribution by Membership Year',
                   xaxis_title='Year',
                   yaxis_title='Income',
                   template=graph_custom_template)
fig2.show()

# Box plot for age by membership year
fig3 = go.Figure()
for year in sorted(cleaned_profile_df['membership_year'].unique()):
    fig3.add_trace(go.Box(
        y=cleaned_profile_df[cleaned_profile_df['membership_year'] == year]['age'],
        name=str(year),
        boxmean='sd'
    ))
fig3.update_layout(title='Age Distribution by Membership Year',
                   xaxis_title='Year',
                   yaxis_title='Age',
                   template=graph_custom_template)
fig3.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



**Insights:**

The customers in this pool mainly signed up for a membership during 2016 to 2018. Before 2016, memebership sign ups were very low. This could mean that membership sign ups were somehow encouraged or pushed for after 2015.

Even though the number of sign ups increase by 3.5 times as compared from 2017 to 20115, the income and age distribution of the customers does not seem to vary much throughout the years. This means that the number of sign ups does not affect the demographics of Starbucks members as it seems that Starbucks has been true to its image in attracting a certain demographic of customers to sign up for their memberships.

### Offer Analysis

In [606]:
print("Distribution of Offer Types")
offer_type_counts = portfolio_df['offer_type'].value_counts()
print(offer_type_counts)

# Scatter Plot for Difficulty vs. Reward by Offer Type
fig2 = go.Figure()
for offer_type in portfolio_df['offer_type'].unique():
    df_subset = portfolio_df[portfolio_df['offer_type'] == offer_type]
    fig2.add_trace(go.Scatter(
        x=df_subset['difficulty'],
        y=df_subset['reward'],
        mode='markers',
        name=offer_type,
        marker=dict(
            size=12
        )
    ))

fig2.update_layout(
    title='Difficulty vs. Reward by Offer Type',
    xaxis_title='Difficulty',
    yaxis_title='Reward',
    legend_title='Offer Type',
    template=graph_custom_template
)
fig2.show()

Distribution of Offer Types
bogo             4
discount         4
informational    2
Name: offer_type, dtype: int64


**Insights:**

For informational offers, it is treated as 0 difficulty and 0 reward by Starbucks.
For buy one get one offers, the difficulty of the ofer is proportional to the rewards given.
For discount offers, the difficulty of the offer does not determine how much rewards the member will get when completing the offer.

In [607]:
channel_counts_per_offer_type = pd.DataFrame(index=cleaned_portfolio_df['offer_type'].unique())

for index, row in cleaned_portfolio_df.iterrows():
    offer_type = row['offer_type']
    for channel in row['channels']:
        if channel in channel_counts_per_offer_type.columns:
            channel_counts_per_offer_type.loc[offer_type, channel] = (
                channel_counts_per_offer_type.loc[offer_type, channel] + 1
            ) if pd.notna(channel_counts_per_offer_type.loc[offer_type, channel]) else 1
        else:
            channel_counts_per_offer_type[channel] = pd.NA
            channel_counts_per_offer_type.loc[offer_type, channel] = 1

channel_counts_per_offer_type.fillna(0, inplace=True)
channel_counts_per_offer_type = channel_counts_per_offer_type.astype(int)
channel_counts_per_offer_type.reset_index(inplace=True)
channel_counts_per_offer_type.rename(columns={'index': 'offer_type'}, inplace=True)

print("Total count of channels used by each offer:")
print(channel_counts_per_offer_type)

Total count of channels used by each offer:
      offer_type  email  mobile  social  web
0           bogo      4       4       3    3
1  informational      2       2       1    1
2       discount      4       3       2    4


**Insights:**

Distribution of Offer Types:
1. bogo:             4
2. discount:         4
3. informational:    2

Offer type bogo and discount mainly utilize email, mobile and web channels to reach out to customers. Whereas informational offer type mainly use email and mobile to reach out to customers.

In [608]:
fig3 = go.Figure()
for offer_type in portfolio_df['offer_type'].unique():
    df_subset = portfolio_df[portfolio_df['offer_type'] == offer_type]
    fig3.add_trace(go.Scatter3d(
        x=df_subset['difficulty'],
        y=df_subset['reward'],
        z=df_subset['duration'],
        mode='markers',
        name=offer_type,
        marker=dict(
            size=5, 
            opacity=0.8
        )
    ))

fig3.update_layout(
    title='Relationship between Duration, Reward, and Difficulty by Offer Type',
    scene=dict(
        xaxis_title='Difficulty',
        yaxis_title='Reward',
        zaxis_title='Duration (days)'
    ),
    legend_title='Offer Type',
    template=graph_custom_template,
    height= 600
)

fig3.show()


![alt text](image.png)
![alt text](image-1.png)

**Insights:**

Offers with higher difficulties like discount, gets a longer duration for completion.

Given the same difficulty and duration, bogo gives the best reward to difficult ratio and longest duration to difficulty ratio.

## Transactions Analysis

In [609]:
event_counts = cleaned_df['event'].value_counts()

fig = go.Figure(data=[go.Bar(
    x=event_counts.index,
    y=event_counts.values,
    text=event_counts.values,
    textposition='auto'
)])

fig.update_layout(
    title='Event Value Counts',
    xaxis_title='Event',
    yaxis_title='Count',
    template=graph_custom_template
)

fig.show()


## Offer Analysis

### What is the breakdown of offers being given out (received), which offers are being viewed the most and how many percent of each offers are completed?

In [644]:
offer_received = cleaned_df[cleaned_df['event'] == 'offer received']
offer_viewed = cleaned_df[cleaned_df['event'] == 'offer viewed']
offer_completed = cleaned_df[cleaned_df['event'] == 'offer completed']

offer_received_counts = offer_received['offer_type'].value_counts().sort_index()
print(offer_received_counts)
offer_viewed_counts = offer_viewed['offer_type'].value_counts().sort_index()
print(offer_viewed_counts)
offer_completed_counts = offer_completed['offer_type'].value_counts().sort_index()
print(offer_completed_counts)

viewed_rates = (offer_viewed_counts / offer_received_counts * 100).reset_index()
viewed_rates.columns = ['offer_type', 'view_rate']

completed_rates = (offer_completed_counts / offer_received_counts * 100).reset_index()
completed_rates.columns = ['offer_type', 'completion_rate']

for offer_type in offer_received_counts.index:
    view_rate = viewed_rates.loc[viewed_rates['offer_type'] == offer_type, 'view_rate'].values[0]
    completion_rate = completed_rates.loc[completed_rates['offer_type'] == offer_type, 'completion_rate'].values[0]
    if pd.isna(completion_rate):
        completion_rate = 0
    print(f"Offer type {offer_type} has a view rate of {view_rate:.2f}% and a completion rate of {completion_rate:.2f}%.")

fig = go.Figure(data=[
    go.Bar(name='Offer Received', x=offer_received_counts.index, y=offer_received_counts.values),
    go.Bar(name='Offer Viewed', x=offer_viewed_counts.index, y=offer_viewed_counts.reindex(offer_received_counts.index, fill_value=0).values),
    go.Bar(name='Offer Completed', x=offer_completed_counts.index, y=offer_completed_counts.reindex(offer_received_counts.index, fill_value=0).values)
])

fig.update_layout(
    title='Comparison of Offer Events by Offer Type',
    xaxis_title='Offer Type',
    yaxis_title='Count',
    barmode='group',
    template=graph_custom_template
)
fig.show()

bogo             26537
discount         26664
informational    13300
Name: offer_type, dtype: int64
bogo             22039
discount         18461
informational     9360
Name: offer_type, dtype: int64
bogo        15258
discount    17186
Name: offer_type, dtype: int64
Offer type bogo has a view rate of 83.05% and a completion rate of 57.50%.
Offer type discount has a view rate of 69.24% and a completion rate of 64.45%.
Offer type informational has a view rate of 70.38% and a completion rate of 0.00%.


**Insights:**

There is no significant difference between the viewing rates of each offer. However, bogo offers do tend to receive a higher view rate.

### Are transactions mappable to the a offer completed? So that we can know how much was spent when an offer is completed?

In [611]:
transactions_cleaned_df = cleaned_df[cleaned_df['event'] == 'transaction']
offers_completed_cleaned_df = cleaned_df[cleaned_df['event'] == 'offer completed']

matched_events = pd.merge(
    transactions_cleaned_df,
    offers_completed_cleaned_df,
    on=['person', 'time'],
    how='inner',
    suffixes=('_transaction', '_offer_completed')
)

if len(matched_events) == 32444:
    print(f"There are {len(matched_events)} events where a transaction matches an 'offer completed' event for the same person at the same time. \n\
This means that every offer completed has a transaction related to it. Hence we can find the amount spent for that offer completion")
else:
    print("Some offer completed event does not have a mapped transaction.")


There are 32444 events where a transaction matches an 'offer completed' event for the same person at the same time. 
This means that every offer completed has a transaction related to it. Hence we can find the amount spent for that offer completion


### How many transactions are made without being shown an offer?

In [612]:
transactions_cleaned_df

Unnamed: 0,person,event,time,gender,age,became_member_on,income,transaction_amount,reward_awarded,reward,channels,difficulty,duration,offer_type,offer_id
14372,78afa995795e4d85b5d9ceeca43f5fef,transaction,132,F,75,2017-05-09,100000.0,19.89,,,,,,,
14373,78afa995795e4d85b5d9ceeca43f5fef,transaction,144,F,75,2017-05-09,100000.0,17.78,,,,,,,
14374,78afa995795e4d85b5d9ceeca43f5fef,transaction,222,F,75,2017-05-09,100000.0,19.67,,,,,,,
14375,78afa995795e4d85b5d9ceeca43f5fef,transaction,240,F,75,2017-05-09,100000.0,29.72,,,,,,,
14376,78afa995795e4d85b5d9ceeca43f5fef,transaction,378,F,75,2017-05-09,100000.0,23.93,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138324,47683732768a4f7db7abb710ca22e66e,transaction,684,M,24,2017-11-08,56000.0,7.10,,,,,,,
138325,3873fe915496482eb589fa316ae7b0db,transaction,510,M,58,2017-09-05,57000.0,1.57,,,,,,,
138326,3873fe915496482eb589fa316ae7b0db,transaction,588,M,58,2017-09-05,57000.0,1.21,,,,,,,
138327,3873fe915496482eb589fa316ae7b0db,transaction,612,M,58,2017-09-05,57000.0,2.65,,,,,,,


In [613]:
offer_received

Unnamed: 0,person,event,time,gender,age,became_member_on,income,transaction_amount,reward_awarded,reward,channels,difficulty,duration,offer_type,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,F,75,2017-05-09,100000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
3,e2127556f4f64592b11af22de27a7932,offer received,408,M,68,2018-04-26,70000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
6,389bc3fa690240e798340f5a15918d5c,offer received,168,M,65,2018-02-09,53000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
8,389bc3fa690240e798340f5a15918d5c,offer received,408,M,65,2018-02-09,53000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
11,d058f73bf8674a26a95227db098147b1,offer received,504,F,56,2018-04-28,88000.0,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,bogo_7_5_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272751,1044403864ed41af9cc17577c185abc4,offer received,576,M,33,2015-08-16,39000.0,,,3.0,"['web', 'email', 'mobile', 'social']",7.0,7.0,discount,discount_7_7_3
272753,cc7322218ad24de3aedc3123ec1c9e79,offer received,408,F,52,2016-04-19,86000.0,,,3.0,"['web', 'email', 'mobile', 'social']",7.0,7.0,discount,discount_7_7_3
272755,e61a1530bbd2415597b8a326dc466962,offer received,504,M,56,2017-09-16,43000.0,,,3.0,"['web', 'email', 'mobile', 'social']",7.0,7.0,discount,discount_7_7_3
272757,25d81869d43045c09ebf32a6696908e6,offer received,408,M,75,2017-05-29,116000.0,,,3.0,"['web', 'email', 'mobile', 'social']",7.0,7.0,discount,discount_7_7_3


In [614]:
offer_received = cleaned_df[(cleaned_df['event'] == 'offer recevied')]

received_unmatched_transactions = pd.merge(
    transactions_cleaned_df,
    offer_received,
    on='person',
    how='left',
    indicator=True
)

transactions_without_offer_receoved = received_unmatched_transactions[received_unmatched_transactions['_merge'] == 'left_only']

print(f"Number of transactions made without receiving any offers: {len(transactions_without_offer_receoved)}")
print("How many unique person make-up these transactions?", transactions_without_offer_receoved['person'].nunique())

offer_viewed = cleaned_df[(cleaned_df['event'] == 'offer viewed')]

unmatched_transactions = pd.merge(
    transactions_cleaned_df,
    offer_viewed,
    on='person',
    how='left',
    indicator=True
)

transactions_without_offer = unmatched_transactions[unmatched_transactions['_merge'] == 'left_only']

print(f"Number of transactions made without viewing any offers: {len(transactions_without_offer)}")

merged_transactions = pd.merge(
    transactions_cleaned_df,
    offer_viewed,
    on='person',
    how='left',
    suffixes=('_transaction', '_offer_viewed'),
    indicator=True
)

transactions_without_offer_view = merged_transactions[
    (merged_transactions['time_transaction'] > merged_transactions['time_offer_viewed']) |
    (pd.isna(merged_transactions['time_offer_viewed']))
]

transactions_without_offer_view =  transactions_without_offer_view[transactions_without_offer_view['_merge'] == 'left_only']

num_transactions_without_offer_view = transactions_without_offer_view['time_transaction'].count()

print(f"Number of transactions made without any offer being viewed first: {num_transactions_without_offer_view}")

Number of transactions made without receiving any offers: 123957
How many unique person make-up these transactions? 14492


Number of transactions made without viewing any offers: 570
Number of transactions made without any offer being viewed first: 570


**Insights:**

Out of 123957 transactions, only 570 transactions are made without being shown any offer and without any offer being viewed first. 
Transactions make up almost 45% of the all the dataset. 

1. Transactions volume is 4 times the number of offers completed.
2. Transaction volume is 2.5 times the number of offers viewed. This means that after viewing an offer, a customer is likely to return again to make another purchase without the use of the offer.
3. This also means that many customers made purchases becaues they are driven to do so by any offer shown to them.
4. Every customer was shown or viewed an offer before they made a transaction.

### Relationship between offer received, offer viewed, offer completed, transaction

In [615]:
cleaned_df['day'] = (cleaned_df['time'] // 24) + 1 

event_counts_by_day = cleaned_df.groupby(['event', 'day']).size().reset_index(name='event_count')

fig = go.Figure()
event_types = event_counts_by_day['event'].unique()
for event in event_types:
    df_subset = event_counts_by_day[event_counts_by_day['event'] == event]
    fig.add_trace(go.Scatter(
        x=df_subset['day'],
        y=df_subset['event_count'],
        mode='lines+markers',
        name=event
    ))

fig.update_layout(
    title='Daily Count of Events',
    xaxis=dict(title='Day'),
    yaxis=dict(title='Event Count'),
    legend_title_text='Event Type',
    template=graph_custom_template
)

fig.show()


offers_events = cleaned_df[cleaned_df['event'].isin(['offer received', 'offer viewed'])]
events_counts_by_day = offers_events.groupby(['day', 'event']).size().unstack()
events_counts_by_day['offer_view_rate'] = (events_counts_by_day['offer viewed'] / events_counts_by_day['offer received']) * 100
events_counts_by_day.reset_index(inplace=True)
events_view_rate = copy.deepcopy(events_counts_by_day)

print(events_view_rate[pd.isna(events_view_rate['offer_view_rate']) == False])



event  day  offer received  offer viewed  offer_view_rate
0        1         11023.0        4722.0        42.837703
7        8         11061.0        4918.0        44.462526
14      15         11080.0        4849.0        43.763538
17      18         11124.0        5096.0        45.810859
21      22         11048.0        4957.0        44.867849
24      25         11165.0        4975.0        44.558889


In [616]:
print("Number of members", len(cleaned_profile_df))

Number of members 14825


**Insights:**

There is no particular trend or interval in which offers are dished out by Starbucks. From the graph, we can see that:
1. Whenever Starbucks gives out a bunch of offers at one shot, it is around 11k offers each time.
2. Not everyone receives an offer when Starbucks gives them out, as shown by only around 11k offers given out when there are 14825 members.
3. Whenever Starbucks gives out the offers, the offer view rate hovers between 42% to 45%, which is a very small range. (This is assuming that all offers viewed on that day were offers that were sent out that day as we do not have enough data to map which offer was viewed and at what time.)

### What are the times taken to view and compelete each offer? What are the factors affecting time taken to view and complete each offer and how much effect these factors have?

In [617]:
person_offer_metrics

{'78afa995795e4d85b5d9ceeca43f5fef': {('bogo_7_5_5',
   0,
   0): {'offer_type': 'bogo', 'received_time': 0, 'view_time': 6, 'complete_time': 132, 'time_to_view': 6, 'time_to_complete': 126},
  ('informational_3_0_0', 168, 138329): {'offer_type': 'informational',
   'received_time': 168,
   'view_time': 216,
   'complete_time': None,
   'time_to_view': 48,
   'time_to_complete': None},
  ('bogo_7_10_10', 408, 150845): {'offer_type': 'bogo',
   'received_time': 408,
   'view_time': 408,
   'complete_time': 510,
   'time_to_view': 0,
   'time_to_complete': 102},
  ('bogo_5_5_5', 504, 167086): {'offer_type': 'bogo',
   'received_time': 504,
   'view_time': 582,
   'complete_time': None,
   'time_to_view': 78,
   'time_to_complete': None}},
 'e2127556f4f64592b11af22de27a7932': {('bogo_7_5_5',
   408,
   3): {'offer_type': 'bogo', 'received_time': 408, 'view_time': 420, 'complete_time': 522, 'time_to_view': 12, 'time_to_complete': 102},
  ('informational_4_0_0', 336, 196402): {'offer_type':

In [618]:
from collections import defaultdict

offers_received = cleaned_df[cleaned_df['event'] == 'offer received'].copy()
offers_viewed = cleaned_df[cleaned_df['event'] == 'offer viewed'][['person', 'offer_id', 'time']].copy()
offers_completed = cleaned_df[cleaned_df['event'] == 'offer completed'][['person', 'offer_id', 'time']].copy()

# 24 because 1 day is 24
offers_received['end_time'] = offers_received['time'] + offers_received['duration'] * 24

viewed_dict = defaultdict(list)
for index, row in offers_viewed.iterrows():
    viewed_dict[(row['person'], row['offer_id'])].append(row['time'])

completed_dict = defaultdict(list)
for index, row in offers_completed.iterrows():
    completed_dict[(row['person'], row['offer_id'])].append(row['time'])

person_offer_metrics = {}

for index, row in offers_received.iterrows():
    person = row['person']
    offer_id = row['offer_id']
    offer_type = row['offer_type']
    received_time = row['time']
    end_time = row['end_time']

    unique_offer_key = (offer_id, received_time, index)

    if person not in person_offer_metrics:
        person_offer_metrics[person] = {}
        
    if unique_offer_key not in person_offer_metrics[person]:
        person_offer_metrics[person][unique_offer_key] = {
            'offer_type': offer_type,
            'received_time': received_time,
            'view_time': None,
            'complete_time': None,
            'time_to_view': None,
            'time_to_complete': None
        }
        
    # Check if the offer was viewed within its validity period and record the time to view
    viewed_time = None
    received_now_time = None
    if (person, offer_id) in viewed_dict:
        for i, view_time in enumerate(viewed_dict[(person, offer_id)]):
            if received_time <= view_time <= end_time:
                viewed_time = view_time
                received_now_time = received_time
                person_offer_metrics[person][unique_offer_key]['view_time'] = view_time
                person_offer_metrics[person][unique_offer_key]['time_to_view'] = view_time - received_time
                viewed_dict[(person, offer_id)].pop(i)
                break
        
    # Check if the offer was completed within its validity period and record the time to complete
    if (person, offer_id) in completed_dict:
        for i, complete_time in enumerate(completed_dict[(person, offer_id)]):
            if complete_time <= end_time and viewed_time != None:
                if complete_time >= viewed_time:
                    person_offer_metrics[person][unique_offer_key]['complete_time'] = complete_time
                    person_offer_metrics[person][unique_offer_key]['time_to_complete'] = complete_time - viewed_time
                    completed_dict[(person, offer_id)].pop(i)
                    break
            if viewed_time == None and received_now_time != None:
                if complete_time <= end_time:
                    person_offer_metrics[person][unique_offer_key]['complete_time'] = complete_time
                    person_offer_metrics[person][unique_offer_key]['time_to_complete'] = complete_time - received_now_time
                    completed_dict[(person, offer_id)].pop(i)
                    break


In [619]:
# extracting data for getting relevant metrics from person_offer_metrics, so that we can analyse time taken to view and complete each offer
data = []
for person, offers in person_offer_metrics.items():
    for offer_id, metrics in offers.items():
        data.append({
            'person': person,
            'offer_id': offer_id,
            'offer_type': metrics['offer_type'],
            'time_to_view': metrics['time_to_view'],
            'time_to_complete': metrics['time_to_complete']
        })
metrics_df = pd.DataFrame(data)

metrics_df['original_offer_id'] = [x[0] for x in metrics_df['offer_id']]

In [620]:
metrics_df

Unnamed: 0,person,offer_id,offer_type,time_to_view,time_to_complete,original_offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,"(bogo_7_5_5, 0, 0)",bogo,6.0,126.0,bogo_7_5_5
1,78afa995795e4d85b5d9ceeca43f5fef,"(informational_3_0_0, 168, 138329)",informational,48.0,,informational_3_0_0
2,78afa995795e4d85b5d9ceeca43f5fef,"(bogo_7_10_10, 408, 150845)",bogo,0.0,102.0,bogo_7_10_10
3,78afa995795e4d85b5d9ceeca43f5fef,"(bogo_5_5_5, 504, 167086)",bogo,78.0,,bogo_5_5_5
4,e2127556f4f64592b11af22de27a7932,"(bogo_7_5_5, 408, 3)",bogo,12.0,102.0,bogo_7_5_5
...,...,...,...,...,...,...
66496,c28c139d78c94303a0a993e3731e789f,"(discount_7_7_3, 408, 272637)",discount,0.0,54.0,discount_7_7_3
66497,c28c139d78c94303a0a993e3731e789f,"(discount_7_7_3, 504, 272641)",discount,12.0,24.0,discount_7_7_3
66498,9a3f45cf29ef428b932492c7a5d6ac52,"(discount_7_7_3, 576, 272726)",discount,24.0,78.0,discount_7_7_3
66499,ef1e309b68ce4278bf7d80b764bc46d6,"(discount_7_7_3, 408, 272734)",discount,42.0,0.0,discount_7_7_3


In [621]:
fig = make_subplots(rows=1, cols=2, subplot_titles=("Time to View", "Time to Complete"))

for offer_type in metrics_df['offer_type'].unique():
    fig.add_trace(
        go.Box(y=metrics_df[metrics_df['offer_type'] == offer_type]['time_to_view'],
               name=offer_type,
               marker_color='lightseagreen'),
        row=1, col=1
    )
    fig.add_trace(
        go.Box(y=metrics_df[metrics_df['offer_type'] == offer_type]['time_to_complete'],
               name=offer_type,
               marker_color='indianred'),
        row=1, col=2
    )

fig.update_layout(title_text="Distribution of Time to View and Complete by Offer Type", template=graph_custom_template, showlegend=False)
fig.show()

**Insights:**

The box plot for the time to view suggests that the median time to view BOGO and discount offers is under 50 hours, indicating that members view these offers quickly after receiving them.

Bogo offers have a similar interquartile range as informational and discount offers. This suggests:
1. Faster view time for bogo offers
2. Even though bogo and discount offers have a wider time to view spread as compared to informational offers, bogo's lower median suggest that while some customers might take longer to act, the majority view thesse offers more quickly.
3. The box size and position of each offer's time to view is relatively similar which shows that the response times are quite similar across all offers.

As for time to complete, discount offers take a longer time to complete as compared to bogo offers. Discount offers also have a wider spread of data, as shown by the bigger box, which suggest that there are more variation for time to compelete of discount a compared to bogo offers.

In [622]:
fig = make_subplots(rows=1, cols=2, subplot_titles=("Time to View", "Time to Complete"))

sorted_offer_ids = sorted(metrics_df['original_offer_id'].unique())

for offer_id in sorted_offer_ids:
    fig.add_trace(
        go.Box(y=metrics_df[metrics_df['original_offer_id'] == offer_id]['time_to_view'],
               name=offer_id,
               marker_color='lightseagreen'),
        row=1, col=1
    )
    fig.add_trace(
        go.Box(y=metrics_df[metrics_df['original_offer_id'] == offer_id]['time_to_complete'],
               name=offer_id,
               marker_color='indianred'),
        row=1, col=2
    )

fig.update_layout(title_text="Distribution of Time to View and Complete by Offer Id", template=graph_custom_template, showlegend=False)
fig.show()

**Insights:**

These plots further emphasize the insights stated previously. However, here we can see a detailed view how the factors of each offer affect the time to view and time to complete each offer. 
We can see that the difficulty of the offer affects the time to view and time to complete the most. 
Bogo offers are usually viewed and completed in the shortest amount of time.

### Offer completion rate

In [623]:
metrics_df['completed'] = metrics_df['time_to_complete'].notna()

offer_summary = metrics_df.groupby('original_offer_id').agg(
    total_offers=('original_offer_id', 'size'),
    completed_offers=('completed', 'sum')
).reset_index()

offer_summary['completion_rate'] = (offer_summary['completed_offers'] / offer_summary['total_offers']) * 100

fig = px.bar(offer_summary, 
             x='original_offer_id', 
             y='completion_rate',
             text='completion_rate',
             color='completion_rate',
             color_continuous_scale='RdBu') 

fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside', hoverinfo='x+y')

fig.update_layout(
    title='Offer Completion Rates Across Different Offers',
    xaxis_title='Offer ID',
    yaxis_title='Completion Rate (%)',
    yaxis=dict(range=[0, 100]),
    coloraxis_colorbar=dict(title='Completion Rate (%)'),
    template=graph_custom_template
)

fig.show()


**Insights:**

The completion rate of discount_10_10_2 and discount 7_7_3 is the highest out of all the discount offers. The time taken to view them is also the lowest out of all the discount offers.

The average completion rate of discount offers is more than bogo offers, which shows that discount offers are better in driving trasnactions.

In [641]:
corr_data = pd.merge(metrics_df, cleaned_portfolio_df[['id', 'difficulty', 'reward', 'duration']], 
                     how='left', left_on='original_offer_id', right_on='id')

corr_data.info()

offer_summary = corr_data.groupby('original_offer_id').agg(
    total_offers=('original_offer_id', 'size'),
    completed_offers=('completed', 'sum')
).reset_index()

offer_summary['completion_rate'] = (offer_summary['completed_offers'] / offer_summary['total_offers']) * 100

corr_data = pd.merge(corr_data, offer_summary[['original_offer_id', 'completion_rate']], on='original_offer_id', how='left')

correlation_df = corr_data[['completion_rate', 'time_to_view', 'time_to_complete', 'difficulty', 'reward', 'duration']]

correlation_matrix = correlation_df.corr()

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    colorscale='RdBu',
    zmid=0,
    texttemplate="%{z:.2f}",
    hoverinfo="text+x+y"
))

fig.update_layout(
    title="Correlation Matrix for Offer Interaction Times and Offer Attributes",
    xaxis=dict(tickmode="array", tickvals=np.arange(len(correlation_matrix.columns)), ticktext=correlation_matrix.columns),
    yaxis=dict(tickmode="array", tickvals=np.arange(len(correlation_matrix.index)), ticktext=correlation_matrix.index),
    autosize=True,
    margin=dict(l=150, r=100, t=100, b=100),
    template=graph_custom_template,
    width = 850, height=400
)

fig.show()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 66501 entries, 0 to 66500
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   person             66501 non-null  object 
 1   offer_id           66501 non-null  object 
 2   offer_type         66501 non-null  object 
 3   time_to_view       48843 non-null  float64
 4   time_to_complete   22646 non-null  float64
 5   original_offer_id  66501 non-null  object 
 6   completed          66501 non-null  bool   
 7   id                 66501 non-null  object 
 8   difficulty         66501 non-null  int64  
 9   reward             66501 non-null  int64  
 10  duration           66501 non-null  int64  
dtypes: bool(1), float64(2), int64(3), object(5)
memory usage: 5.6+ MB


**Insights:**

This correlation plot agrees with our intuition that:
2. There is moderate positive lienar corelation between completion rate and difficulty
3. There is moderate positive lienar corelation between difficulty and duration of offer

However, some does not agree:
1. There is a strong negative linear correlation between completion rate and reward
4. There is moderate negative lienar corelation between reward and duration of offer