In [96]:
import re
from datetime import datetime

import pandas as pd

# Campaign Data


In [67]:
# The campaign export is tab-delimited, hence the explicit separator.
df_campaign = pd.read_csv(
    filepath_or_buffer='Marketing Department/campaign_data.csv',
    sep='\t',
)
df_campaign

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount
0,0,CAMPAIGN24776,wouldn't you know it,"""Twee retro vinyl single-origin coffee sartori...",1%
1,1,CAMPAIGN33679,could be written on the back of a postage stamp,"""Fanny pack gentrify cardigan messenger bag."" ...",1pct
2,2,CAMPAIGN49972,me neither,"""DIY pug leggings everyday craft beer cardigan...",10%%
3,3,CAMPAIGN61872,on the huh,"""Trust fund pinterest chambray."" - Claude Aufd...",5%
4,4,CAMPAIGN03110,stick a fork in it,"""YOLO tumblr Yuccie austin."" - Jordi Kunde",1percent
5,5,CAMPAIGN46302,you must be new here,"""Craft beer XOXO hella tacos chillwave cred or...",1pct
6,6,CAMPAIGN11190,mind your own beeswax,"""Street shoreditch viral before they sold out ...",10pct
7,7,CAMPAIGN53595,how do I get to the train station,"""Craft beer venmo lomo fixie readymade marfa.""...",20pct
8,8,CAMPAIGN29983,would it hurt,"""Vegan migas ramps keytar wolf cray kickstarte...",1%
9,9,CAMPAIGN52447,pound for pound,"""Semiotics biodiesel everyday craft beer etsy ...",1percent


#### Check for nulls


In [68]:
# Per-column count of missing values (isna is the alias of isnull).
df_campaign.isna().sum()

Unnamed: 0              0
campaign_id             0
campaign_name           0
campaign_description    0
discount                0
dtype: int64

#### Check for duplicates


In [69]:
# Rows that are exact repeats of an earlier row across every column.
df_campaign.loc[df_campaign.duplicated(keep='first')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount


In [70]:
# Rows whose campaign_id already appeared in an earlier row.
df_campaign.loc[df_campaign['campaign_id'].duplicated(keep='first')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount


In [71]:
# Rows whose campaign_name already appeared in an earlier row.
df_campaign.loc[df_campaign['campaign_name'].duplicated(keep='first')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount


In [72]:
# Rows whose campaign_description already appeared in an earlier row.
df_campaign.loc[df_campaign['campaign_description'].duplicated(keep='first')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount


In [73]:
# Rows whose raw discount label repeats the label of an earlier row.
df_campaign.loc[df_campaign['discount'].duplicated(keep='first')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount
5,5,CAMPAIGN46302,you must be new here,"""Craft beer XOXO hella tacos chillwave cred or...",1pct
8,8,CAMPAIGN29983,would it hurt,"""Vegan migas ramps keytar wolf cray kickstarte...",1%
9,9,CAMPAIGN52447,pound for pound,"""Semiotics biodiesel everyday craft beer etsy ...",1percent


#### Check unique values of campaign names


In [74]:
# Distinct campaign names, in order of first appearance.
unique_campaign_names = df_campaign.loc[:, 'campaign_name'].unique()
unique_campaign_names

array(["wouldn't you know it",
       'could be written on the back of a postage stamp', 'me neither',
       'on the huh', 'stick a fork in it', 'you must be new here',
       'mind your own beeswax', 'how do I get to the train station',
       'would it hurt', 'pound for pound'], dtype=object)

#### Check unique values of campaign description


In [75]:
# Distinct campaign descriptions, in order of first appearance.
unique_campaign_description = (
    df_campaign.loc[:, 'campaign_description'].unique()
)
unique_campaign_description

array(['"Twee retro vinyl single-origin coffee sartorial fanny pack brunch offal health." - Raleigh Senger',
       '"Fanny pack gentrify cardigan messenger bag." - Bradley Stamm',
       '"DIY pug leggings everyday craft beer cardigan knausgaard +1 crucifix flannel." - Tremayne Nader',
       '"Trust fund pinterest chambray." - Claude Aufderhar',
       '"YOLO tumblr Yuccie austin." - Jordi Kunde',
       '"Craft beer XOXO hella tacos chillwave cred organic letterpress disrupt artisan." - Rodrick Lebsack',
       '"Street shoreditch viral before they sold out yr ramps skateboard skateboard bitters pabst." - Brendan Miller',
       '"Craft beer venmo lomo fixie readymade marfa." - Benny Bogan',
       '"Vegan migas ramps keytar wolf cray kickstarter five dollar toast." - Adeline Brakus',
       '"Semiotics biodiesel everyday craft beer etsy semiotics keffiyeh meditation single-origin coffee." - Bernadette Pollich'],
      dtype=object)

#### Check unique values of discount


In [76]:
# Distinct raw discount labels; this exposes the inconsistent suffixes
# that the cleaning step below has to handle.
unique_campaign_discount = df_campaign.loc[:, 'discount'].unique()
unique_campaign_discount

array(['1%', '1pct', '10%%', '5%', '1percent', '10pct', '20pct'],
      dtype=object)

### Drop "Unnamed: 0" column


In [77]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
df_campaign = df_campaign.drop(columns=['Unnamed: 0'])
df_campaign

Unnamed: 0,campaign_id,campaign_name,campaign_description,discount
0,CAMPAIGN24776,wouldn't you know it,"""Twee retro vinyl single-origin coffee sartori...",1%
1,CAMPAIGN33679,could be written on the back of a postage stamp,"""Fanny pack gentrify cardigan messenger bag."" ...",1pct
2,CAMPAIGN49972,me neither,"""DIY pug leggings everyday craft beer cardigan...",10%%
3,CAMPAIGN61872,on the huh,"""Trust fund pinterest chambray."" - Claude Aufd...",5%
4,CAMPAIGN03110,stick a fork in it,"""YOLO tumblr Yuccie austin."" - Jordi Kunde",1percent
5,CAMPAIGN46302,you must be new here,"""Craft beer XOXO hella tacos chillwave cred or...",1pct
6,CAMPAIGN11190,mind your own beeswax,"""Street shoreditch viral before they sold out ...",10pct
7,CAMPAIGN53595,how do I get to the train station,"""Craft beer venmo lomo fixie readymade marfa.""...",20pct
8,CAMPAIGN29983,would it hurt,"""Vegan migas ramps keytar wolf cray kickstarte...",1%
9,CAMPAIGN52447,pound for pound,"""Semiotics biodiesel everyday craft beer etsy ...",1percent


### Clean discount values


#### Remove non-numeric characters


In [78]:
def clean_discount(discount):
    """Extract the numeric part of a raw discount label.

    The first run of ASCII digits (optionally followed by a decimal part)
    is returned, so '1%', '1pct', '10%%' and '1percent' yield '1', '1',
    '10' and '1'. Taking the first numeric token instead of concatenating
    every digit character keeps a fractional value such as '2.5%' as '2.5'
    rather than collapsing it to '25'.

    Parameters
    ----------
    discount : object
        Raw label; it is converted with ``str()`` before matching, so
        non-string values do not raise.

    Returns
    -------
    str
        The numeric text, or an empty string when the label has no digits.
    """
    # [0-9] rather than \d / str.isdigit(): only plain ASCII digits count.
    match = re.search(r'[0-9]+(?:\.[0-9]+)?', str(discount))
    return match.group(0) if match else ''


# Run every raw discount label through clean_discount, element by element.
df_campaign['discount'] = df_campaign['discount'].map(clean_discount)
df_campaign.head(n=5)

Unnamed: 0,campaign_id,campaign_name,campaign_description,discount
0,CAMPAIGN24776,wouldn't you know it,"""Twee retro vinyl single-origin coffee sartori...",1
1,CAMPAIGN33679,could be written on the back of a postage stamp,"""Fanny pack gentrify cardigan messenger bag."" ...",1
2,CAMPAIGN49972,me neither,"""DIY pug leggings everyday craft beer cardigan...",10
3,CAMPAIGN61872,on the huh,"""Trust fund pinterest chambray."" - Claude Aufd...",5
4,CAMPAIGN03110,stick a fork in it,"""YOLO tumblr Yuccie austin."" - Jordi Kunde",1


#### Add % to the discount column


In [79]:
# Append a single '%' suffix. Any trailing '%' is stripped first so that
# re-running this cell leaves the values unchanged instead of producing '1%%'.
df_campaign['discount'] = df_campaign['discount'].str.rstrip('%') + '%'
df_campaign.head()

Unnamed: 0,campaign_id,campaign_name,campaign_description,discount
0,CAMPAIGN24776,wouldn't you know it,"""Twee retro vinyl single-origin coffee sartori...",1%
1,CAMPAIGN33679,could be written on the back of a postage stamp,"""Fanny pack gentrify cardigan messenger bag."" ...",1%
2,CAMPAIGN49972,me neither,"""DIY pug leggings everyday craft beer cardigan...",10%
3,CAMPAIGN61872,on the huh,"""Trust fund pinterest chambray."" - Claude Aufd...",5%
4,CAMPAIGN03110,stick a fork in it,"""YOLO tumblr Yuccie austin."" - Jordi Kunde",1%


### Export to a parquet file


In [80]:
# Write the cleaned campaign table to parquet, leaving the index out of the file.
df_campaign.to_parquet(
    path='Marketing Department/campaign_data.parquet',
    index=False,
)

# Transactional Campaign Data


In [81]:
# Load the raw transactional campaign table (default comma separator).
df_transactional_campaign = pd.read_csv(
    filepath_or_buffer='Marketing Department/transactional_campaign_data.csv',
)
df_transactional_campaign

Unnamed: 0.1,Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed
0,2,2023-08-14,CAMPAIGN49972,0612c246-57f1-40e8-9993-0f8d41992049,10days,1
1,4,2021-10-12,CAMPAIGN46302,b4c411de-2fd3-4806-91ae-165edc9baa12,13days,0
2,6,2023-01-30,CAMPAIGN29983,26de6b40-db2d-40b9-a64c-58736eaf0381,3days,1
3,8,2022-01-05,CAMPAIGN46302,26b60a4e-aafe-4b99-bace-034d088a4a53,8days,1
4,9,2023-09-19,CAMPAIGN46302,4aab29ae-e610-46bf-92af-199f6f420cee,14days,1
...,...,...,...,...,...,...
124882,499990,2022-04-14,CAMPAIGN61872,6a5163c2-ba62-461c-a729-5b06fe266bd9,12days,0
124883,499991,2022-11-23,CAMPAIGN11190,883e2fe2-bbdf-4b5e-8348-c2bcba07da0e,15days,1
124884,499992,2023-07-28,CAMPAIGN33679,575b0a2d-5539-4565-a10b-1f4f6516cd27,12days,1
124885,499995,2020-04-12,CAMPAIGN49972,ba506489-1505-481d-b0b1-8cdbf04a515f,6days,1


#### Check for nulls


In [82]:
# Per-column count of missing values (isna is the alias of isnull).
df_transactional_campaign.isna().sum()

Unnamed: 0           0
transaction_date     0
campaign_id          0
order_id             0
estimated arrival    0
availed              0
dtype: int64

#### Check for duplicates


In [83]:
# Rows that are exact repeats of an earlier row across every column.
df_transactional_campaign.loc[df_transactional_campaign.duplicated(keep='first')]

Unnamed: 0.1,Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed


In [84]:
# Count the rows whose campaign_id already appeared in an earlier row.
campaign_duplicates = (
    df_transactional_campaign['campaign_id']
    .duplicated(keep='first')
    .sum()
)
print('Number of duplicated rows: ', campaign_duplicates)

Number of duplicated rows:  124877


In [85]:
# Number of transactions per campaign_id, most frequent first.
campaign_id_duplicates = (
    df_transactional_campaign.loc[:, 'campaign_id'].value_counts()
)
print(campaign_id_duplicates)

campaign_id
CAMPAIGN46302    12732
CAMPAIGN49972    12719
CAMPAIGN03110    12652
CAMPAIGN53595    12626
CAMPAIGN11190    12449
CAMPAIGN52447    12447
CAMPAIGN24776    12433
CAMPAIGN29983    12352
CAMPAIGN33679    12345
CAMPAIGN61872    12132
Name: count, dtype: int64


In [86]:
# Count the rows whose order_id already appeared in an earlier row.
order_duplicates = (
    df_transactional_campaign['order_id']
    .duplicated(keep='first')
    .sum()
)
print('Number of duplicated rows: ', order_duplicates)

Number of duplicated rows:  0


In [87]:
# Count the rows whose 'estimated arrival' value already appeared earlier.
arrival_duplicates = (
    df_transactional_campaign['estimated arrival']
    .duplicated(keep='first')
    .sum()
)
print('Number of duplicated rows: ', arrival_duplicates)

Number of duplicated rows:  124874


In [88]:
# Number of transactions per 'estimated arrival' value, most frequent first.
arrival_value_duplicates = (
    df_transactional_campaign.loc[:, 'estimated arrival'].value_counts()
)
print(arrival_value_duplicates)

estimated arrival
10days    9894
12days    9788
14days    9671
15days    9650
5days     9643
7days     9636
8days     9620
4days     9587
3days     9545
9days     9500
6days     9469
13days    9459
11days    9425
Name: count, dtype: int64


#### Check unique values of estimated arrival


In [89]:
# Distinct 'estimated arrival' labels, in order of first appearance.
unique_estimated_arrival = (
    df_transactional_campaign.loc[:, 'estimated arrival'].unique()
)
unique_estimated_arrival

array(['10days', '13days', '3days', '8days', '14days', '11days', '7days',
       '9days', '4days', '5days', '12days', '6days', '15days'],
      dtype=object)

### Drop 'Unnamed: 0' column


In [90]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
df_transactional_campaign = df_transactional_campaign.drop(
    columns=['Unnamed: 0'])
df_transactional_campaign.head(n=5)

Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed
0,2023-08-14,CAMPAIGN49972,0612c246-57f1-40e8-9993-0f8d41992049,10days,1
1,2021-10-12,CAMPAIGN46302,b4c411de-2fd3-4806-91ae-165edc9baa12,13days,0
2,2023-01-30,CAMPAIGN29983,26de6b40-db2d-40b9-a64c-58736eaf0381,3days,1
3,2022-01-05,CAMPAIGN46302,26b60a4e-aafe-4b99-bace-034d088a4a53,8days,1
4,2023-09-19,CAMPAIGN46302,4aab29ae-e610-46bf-92af-199f6f420cee,14days,1


### Remove future dates


In [91]:
# Convert the date strings to datetimes so they can be compared with `now`.
df_transactional_campaign['transaction_date'] = pd.to_datetime(
    df_transactional_campaign['transaction_date']
)

# Reference point for "future": the moment this cell is executed.
now = datetime.now()
future_dates = df_transactional_campaign.loc[
    df_transactional_campaign['transaction_date'].gt(now)
]

print(future_dates)

       transaction_date    campaign_id                              order_id  \
1088         2023-12-15  CAMPAIGN03110  e2ea2e06-8c22-4f02-b693-141d98569690   
13944        2023-12-15  CAMPAIGN53595  d9b02147-3618-4042-b7ed-eba59d73b7fc   
15067        2023-12-30  CAMPAIGN53595  5c135515-dfed-4d0c-aaca-3e55b52568e3   
29062        2023-12-30  CAMPAIGN11190  0c74f772-6b56-4d93-b34e-221bbb2c5196   
30926        2023-12-30  CAMPAIGN03110  ac7a088a-54ce-4c60-b3c7-8609ffcd29bc   
40511        2023-12-15  CAMPAIGN11190  32662310-c738-4081-8754-3372191a50b0   
43293        2023-12-15  CAMPAIGN24776  e63ece5b-bde7-442f-93ef-14d3d2b43081   
44233        2023-12-15  CAMPAIGN29983  13e03f63-4ded-4730-a6fc-0d981f91e5d6   
57142        2023-12-15  CAMPAIGN29983  a1f3bf82-15f5-4e07-827d-426ea89fa1e4   
62337        2023-12-30  CAMPAIGN29983  feba40be-b208-4112-9d4e-62dabb9ceb87   
67276        2023-12-30  CAMPAIGN53595  4466db03-bb85-4bfc-ae74-e78ae738c5c6   
68445        2023-12-15  CAMPAIGN53595  

In [92]:
# Keep only transactions dated no later than `now`. The explicit .copy()
# detaches the result from the frame it was filtered from, so column
# assignments made on it afterwards write to this frame directly and do not
# trigger pandas' SettingWithCopyWarning.
df_transactional_campaign = df_transactional_campaign[
    df_transactional_campaign['transaction_date'] <= now].copy()

# Sanity check: no row dated after `now` should remain.
future_dates = df_transactional_campaign[df_transactional_campaign['transaction_date'] > now]

print(future_dates)

Empty DataFrame
Columns: [transaction_date, campaign_id, order_id, estimated arrival, availed]
Index: []


In [93]:
# Preview the first five rows after the date filter.
df_transactional_campaign.head(n=5)

Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed
0,2023-08-14,CAMPAIGN49972,0612c246-57f1-40e8-9993-0f8d41992049,10days,1
1,2021-10-12,CAMPAIGN46302,b4c411de-2fd3-4806-91ae-165edc9baa12,13days,0
2,2023-01-30,CAMPAIGN29983,26de6b40-db2d-40b9-a64c-58736eaf0381,3days,1
3,2022-01-05,CAMPAIGN46302,26b60a4e-aafe-4b99-bace-034d088a4a53,8days,1
4,2023-09-19,CAMPAIGN46302,4aab29ae-e610-46bf-92af-199f6f420cee,14days,1


### Fix the format of the estimated arrival column


In [94]:
# Rewrite '<N>days' as '<N> days'. The pattern also consumes any whitespace
# already sitting in front of 'days', so re-running this cell keeps
# '10 days' as-is instead of inserting a second space. `regex=True` is
# passed explicitly because the default of that argument differs between
# pandas versions.
df_transactional_campaign['estimated arrival'] = (
    df_transactional_campaign['estimated arrival']
    .astype(str)
    .str.replace(r'\s*days', ' days', regex=True)
)

df_transactional_campaign.head()

Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed
0,2023-08-14,CAMPAIGN49972,0612c246-57f1-40e8-9993-0f8d41992049,10 days,1
1,2021-10-12,CAMPAIGN46302,b4c411de-2fd3-4806-91ae-165edc9baa12,13 days,0
2,2023-01-30,CAMPAIGN29983,26de6b40-db2d-40b9-a64c-58736eaf0381,3 days,1
3,2022-01-05,CAMPAIGN46302,26b60a4e-aafe-4b99-bace-034d088a4a53,8 days,1
4,2023-09-19,CAMPAIGN46302,4aab29ae-e610-46bf-92af-199f6f420cee,14 days,1


### Export to a parquet file


In [95]:
# Write the cleaned transactional table to parquet, leaving the index out of the file.
df_transactional_campaign.to_parquet(
    path='Marketing Department/transactional_campaign_data.parquet',
    index=False,
)