In [1]:
import pandas as pd
import numpy as np
import time
from datetime import datetime
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
%matplotlib inline

# Load data

In [2]:
profiles = pd.read_json('raw_data/profile.json', orient='records', lines=True)
transcript = pd.read_json('raw_data/transcript.json', orient='records', lines=True)
offers = pd.read_csv("processed_data/offers_df.csv").drop("Unnamed: 0", axis = 1)
offers.head()

Unnamed: 0,client_id,offer_id,completed,when_received,gender_x,age_x,became_member_on_x,income_x,new_id_x,reward_x,...,gender_y,age_y,became_member_on_y,income_y,new_id_y,reward_y,channels_y,difficulty_y,duration_y,offer_type_y
0,1,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,0,F,75,20170509,100000.0,1,5,...,F,75,20170509,100000.0,1,5,"['web', 'email', 'mobile']",5,7,bogo
1,1,5a8bc65990b245e5a138643cd4eb9837,1,168,F,75,20170509,100000.0,1,0,...,F,75,20170509,100000.0,1,0,"['email', 'mobile', 'social']",0,3,informational
2,1,ae264e3637204a6fb9bb56bc8210ddfd,1,408,F,75,20170509,100000.0,1,10,...,F,75,20170509,100000.0,1,10,"['email', 'mobile', 'social']",10,7,bogo
3,1,f19421c1d4aa40978ebb69ca19b0e20d,0,504,F,75,20170509,100000.0,1,5,...,F,75,20170509,100000.0,1,5,"['web', 'email', 'mobile', 'social']",5,5,bogo
4,2,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,0,,118,20170804,,2,5,...,,118,20170804,,2,5,"['web', 'email']",20,10,discount


# Pre exploration

In [3]:
timelapse = transcript.groupby("person").agg({"time": lambda x: (np.max(x) - np.min(x))/ 24})
print("Average nº od days: ", np.mean(timelapse.time.values))
print("Mean deviation nº of days; ", np.std(timelapse.time.values))

Average nº od days:  25.46329411764706
Mean deviation nº of days;  3.796254698650266


# Additional cleaning and feature creation

In [4]:
offers["became_member_on"] = pd.to_datetime(offers.became_member_on, format = "%Y%m%d")

AttributeError: 'DataFrame' object has no attribute 'became_member_on'

## Age

Abnormal number of people 118 yo

In [None]:
profiles.age.describe()

In [None]:
plt.subplots(figsize = (6, 3))
profiles.age.hist()
plt.title("Age distribution")


In [None]:
np.mean(offers.age == 118)

12 % of persons are apparently 118

## Dealing with  118 y.o.

Income and Gender are null for clients with age equal to 118

In [None]:
print("Age 118. Prop of nulls in Gender:", profiles.loc[profiles.age == 118].gender.isnull().mean())
print("Age 118. Prop of nulls in Income:", profiles.loc[profiles.age == 118].income.isnull().mean())

In [None]:
profiles.loc[profiles.age == 118].head()

Adding a fake age flag to the profiles and main dataset

In [None]:
offers["age_error"] = offers.age == 118

offers.groupby("age_error").agg({"completed": np.mean, 
                              "income": np.mean,
                              "became_member_on": np.mean})


Clients with fake age ave a very low average spending compared to the rest

Being income and gender null for clients of age 118, the only client feature that could potentially diverge from the rest is 'became_member_on'. We compare this feature for the two groups below

In [None]:
plt.subplots(figsize = (6, 3.3))
offers["month_member_on"] = offers.became_member_on.values.astype('datetime64[M]')

pr = pd.DataFrame(offers.groupby(["month_member_on", "age_error"]).count()["client_id"]).reset_index()

pr["f"] = pr.groupby("age_error").client_id.apply(lambda x: x / sum(x))
pr


# pr
sns.lineplot(data = pr, x = "month_member_on", y = "f", hue = "age_error")
plt.title("Client antiquity in time for aged 118 and the rest")

Since the only feature that is not null for persons aged 118 doesn't difere from the rest, **we proceed to remove these persons**

We also observe a strange behaviour related with when a person became member: it remains constant for the most part but there are two big increases around mid 2015 and mid 2017

In [None]:
# removing persons aged 118

# data = data.loc[data.age != 118]
offers = offers.loc[offers.age != 118]

## income

In [None]:
plt.clf()
fig, ax = plt.subplots(figsize = (6, 3))
income_df = offers.groupby("client_id").agg({"income": "mean"})
sns.histplot(data = income_df, x = "income", ax = ax)
plt.title("Income distribution")

## Additional features

Among the additional features creates, **seniority** divides the period de person became member in three categories, following the findings in the "Dealing with 118 y.o section

In [None]:
offers["income_thousands"] = offers.income / 1000

# creation of seniority variable, which divides the period the person became member, by categor
offers['seniority'] =  np.where(offers.month_member_on < "2015-08-01", "Gen1", 
                                      np.where(offers.month_member_on < "2017-08-01", "Gen2", 
                                               "Gen3"))


seniority_order = ['Gen 1', 'Gen 2', 'Gen 3']
offers = offers.reindex(offers['seniority'].isin(seniority_order).index, level=0)
offers = offers.sort_values(by='seniority')

profiles['became_member_on']  = pd.to_datetime(profiles.became_member_on, format = "%Y%m%d")
profiles["month_member_on"] = profiles.became_member_on.values.astype('datetime64[M]')

profiles['seniority'] =  np.where(profiles.month_member_on < "2015-08-01", "Gen1", 
                                      np.where(profiles.month_member_on < "2017-08-01", "Gen2", 
                                               "Gen3"))


In [None]:
profiles.became_member_on.describe()

In [None]:
def summarize_completed(group_vars):
    
    return offers.groupby(group_vars).agg({"completed": "mean"}).reset_index()

# Completion by type

In [None]:
offers.groupby("offer_type").agg({"completed": "mean"})

In [None]:
offers_income = offers.groupby(["offer_type", "income_thousands"]).agg({"completed": "mean"}).reset_index()

In [None]:
offers_income

In [None]:
#  fig, ax = plt.subplots()



sns.relplot(data = offers_income, 
            x = "income_thousands", 
            y = "completed", 
            hue = "offer_type", 
            kind = "line",
            aspect = 1.5,
            height = 4
           )

plt.title("Proportion of offers completed by income")

In [None]:
(profiles.income > 90000).mean()

In [None]:
offers.groupby(["completed", "offer_type"]).agg({"income_thousands": "mean"})

# Seniority

In [None]:
offers.seniority.value_counts()

In [None]:
offers.groupby("seniority").agg({"completed": "mean", 
                                            "income": "mean", 
                                            "gender": lambda x: np.mean(x == "M"),
                                            "age": "mean", 
                                           "duration": "mean", 
                                           "difficulty": "mean"})

Turns out that Gen 1 customers are the older ones. It seems they have less population aged between 20 and 40. It turns out that this age range is formed by people who completed less discount and bogo offers

In [None]:
sns.displot(data = offers, 
           x = "age", 
           row = "seniority", 
           height = 1.5, 
           aspect = 3, 
           facet_kws=dict(sharey=False))

In [None]:
profiles_age_period = profiles.loc[profiles.age  != 118].value_counts(["age", "seniority"]).reset_index()\
.rename(columns = {0: "n"})
profiles_age_period["f"] = profiles_age_period.groupby("age").n.apply(lambda x: x / sum(x))
sns.relplot(data = profiles_age_period, 
           x = "age", 
           y = "f", 
           hue = "seniority", 
           height = 4, 
           aspect = 1.5, 
           kind = "line")

In [None]:
offers.groupby(["seniority", "offer_type"])\
.agg({"completed": "mean"}).reset_index().pivot(columns = "seniority", 
                                                index = "offer_type", 
                                               values = "completed")

In [None]:
offers_period_completed = offers.groupby(["seniority", "offer_type"])\
.agg({"completed": "mean"}).reset_index()

offers_period_completed

sns.catplot(data = offers_period_completed, 
           x = "seniority", 
           y = "completed", 
           hue = "offer_type",
           kind = "bar", 
           height = 3, 
           aspect = 1.8)

# Gender

It looks like women and other genders respond better to offers than men

In [None]:
offers.groupby(["gender", "offer_type"]).agg({"completed": "mean", "new_id": "count"})

In [None]:
offers.groupby(["gender", "offer_type", "seniority"]).agg({"completed": "mean"}).reset_index()\
.pivot(index = ["offer_type", "gender"], 
       columns = "seniority", 
      values = "completed")

# Age

It looks like young people older than (35 - 40) respond better to offers

In [None]:
completed_age = summarize_completed(["offer_type", "age"])

sns.relplot(data = completed_age.loc[completed_age.age <= 80], 
           x = "age", 
           y = "completed", 
           hue = "offer_type", 
           kind = "line", 
           height = 4, 
           aspect = 1.5)

plt.title("Rate of completion by age")

In [None]:
offers["age_int"] = pd.cut(offers.age, bins = [20, 40, 60, 80, 100])

In [None]:
summarize_completed("age_int")

In [None]:
summarize_completed(["offer_type", "age_int"])

# Duration

Hay que mirar si esta aparente contradiccion se compensa con el reward o el difficulty

In [None]:
summarize_completed(["offer_type", "duration"])

In [None]:
offers.groupby(["offer_type", "duration"]).agg({"completed": "mean", 
                                                "difficulty": "mean", 
                                                "reward": "mean"})

# Difficulty

In [None]:
summarize_completed(["offer_type", "difficulty"])

# Reward

In [None]:
summarize_completed(["offer_type", "reward"])

It looks like duration have the opposite effect than expected. Difficulty and reward behaving as expected. It seems like for bogo, both of these magnitudes are totally correlated. DIfficulty has a big effect on discount

In [None]:
offers.value_counts(subset = ["offer_type", "difficulty", "reward"])

# Income

In [None]:
offers.groupby("completed").agg({"income_thousands": ["mean","std"]})

In [None]:
sns.displot(data = offers, 
           x = "income_thousands", 
           hue = "completed", 
           height = 3, 
            kind = "kde",
           aspect = 3)

In [None]:
completed_income = summarize_completed(["offer_type", "income_thousands"])

sns.relplot(data = completed_income, 
           x = "income_thousands", 
           y = "completed", 
           hue = "offer_type", 
           kind = "line", 
           height = 4, 
           aspect = 1.5)

plt.title("Rate of completion by income")

# Channels

In [None]:
offers.channels.value_counts()

In [None]:
summarize_completed(["channels"])

In [None]:
def is_channel(li, channel):
    
    return (channel in li) * 1

# email is in every offer, we don't have to count for it
for channel in ["web", "mobile", "social"]:
    
    offers[channel] = offers.channels.apply(lambda x: is_channel(x, channel))


In [None]:
# 
offers.loc[offers.completed == 1].groupby("social").agg({"age": "mean", "gender": lambda x: np.mean(x == "M"), 
                                                      "income_thousands": "mean"})

In [None]:
offers[["web", "mobile", "social"]].mean()

In [None]:
offers_by_channel = offers.value_counts(subset = ["offer_type", "channels"]).reset_index()
sns.catplot(data = offers_by_channel, 
           x = "channels", 
           y = 0, 
           hue = "offer_type", 
           kind = "bar", 
           height = 3, 
           aspect = 3)

In [None]:
offers.groupby("social").agg({"completed": "mean"})

In [None]:
offers.groupby(["offer_type", "social"]).agg({"completed": "mean"})

In [None]:
summarize_completed(["channels", "offer_type"])

Since a offer is completed whenever it is viewed and there are enough amount spent, we should control by income when modeling

# Conclusions

- All offers are completed a similar rate (between 40 % and 45 %), being Informational, the offer type with the hightest rate of completion and bogo the type with the lowest rate
- The rate of completion for bogo and discount seems to increase with income, atleast up to the $ 90 K, that's it, almost the 90 % of the population.
- Gen 2 is the generation with the hightest rate of completion, specially for BOGO offers
- Female complete bogo and discount offers at a higher rate than males. People who identifies with other gender have the highest completion rate for every offer type.
- Young people seem to complete offers at a lower rate that older ones. There seem to be a a stepped increase at the age of 35.
- Offers distributed through social channels have a significative higher rate of completion than the ones which are not, specially for the discount offer.

# More variables for modeling and viz.

In [None]:
offers["young"] = offers.age < 35
offers["social"] = offers.channels.apply(lambda x: "social" in x)

offers = offers.replace({"offer_type": {"informational": "Informational", "bogo": "Bogo", "discount": "Discount"}})
offers = offers.replace({"gender": {"M": "Male", "F": "Female", "O": "Other"}})


# Saving data

In [None]:
pickle.dump(offers, open("processed_data/offers_cleaned.p", "wb"))