In [1]:
import pandas as pd
import numpy as np

## Campaign Data

In [2]:
# Load the campaign master table; the source file is tab-separated.
df_campaign = pd.read_csv('Marketing Department/campaign_data.csv', sep = '\t')
print(df_campaign)
# Index.ravel() returned an ndarray on pandas 1.x but returns an Index on pandas 2.x,
# so its printed form depends on the installed version. to_numpy() always yields the
# plain array of column names.
print(df_campaign.columns.to_numpy())
# Null count per column (rendered as the cell output).
df_campaign.isnull().sum()

   Unnamed: 0    campaign_id                                    campaign_name  \
0           0  CAMPAIGN24776                             wouldn't you know it   
1           1  CAMPAIGN33679  could be written on the back of a postage stamp   
2           2  CAMPAIGN49972                                       me neither   
3           3  CAMPAIGN61872                                       on the huh   
4           4  CAMPAIGN03110                               stick a fork in it   
5           5  CAMPAIGN46302                             you must be new here   
6           6  CAMPAIGN11190                            mind your own beeswax   
7           7  CAMPAIGN53595                how do I get to the train station   
8           8  CAMPAIGN29983                                    would it hurt   
9           9  CAMPAIGN52447                                  pound for pound   

                                campaign_description  discount  
0  "Twee retro vinyl single-origin coffee s

Unnamed: 0              0
campaign_id             0
campaign_name           0
campaign_description    0
discount                0
dtype: int64

## Data Profiling for Campaign Data

In [3]:
# Rows that repeat an earlier row across every column; the recorded output is empty, i.e. no dupes.
df_campaign.loc[df_campaign.duplicated(keep='first')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount


In [4]:
# Rows whose campaign_id already appeared in an earlier row.
df_campaign.loc[df_campaign.duplicated(subset='campaign_id')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount


In [5]:
# Rows whose campaign_name already appeared in an earlier row.
df_campaign.loc[df_campaign.duplicated(subset='campaign_name')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount


In [6]:
# Rows whose campaign_description already appeared in an earlier row.
df_campaign.loc[df_campaign.duplicated(subset='campaign_description')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount


In [7]:
# Rows whose raw discount label already appeared in an earlier row. The recorded output
# shows the labels use mixed spellings ("1pct", "1%", "1percent").
df_campaign.loc[df_campaign.duplicated(subset='discount')]

Unnamed: 0.1,Unnamed: 0,campaign_id,campaign_name,campaign_description,discount
5,5,CAMPAIGN46302,you must be new here,"""Craft beer XOXO hella tacos chillwave cred or...",1pct
8,8,CAMPAIGN29983,would it hurt,"""Vegan migas ramps keytar wolf cray kickstarte...",1%
9,9,CAMPAIGN52447,pound for pound,"""Semiotics biodiesel everyday craft beer etsy ...",1percent


## Converting discount percentages to float

In [8]:
# Convert the free-text discount labels ("1%", "1pct", "1percent") into a fraction (1 -> 0.01).
# Guarded on dtype so that re-running this cell once the column is already numeric is a
# no-op instead of failing on the .str accessor.
if not pd.api.types.is_numeric_dtype(df_campaign['discount']):
    discount_labels = df_campaign['discount']
    # Capture the leading number explicitly. str.rstrip('%pctpercent') strips a *set of
    # characters*, not those suffixes, so it only happened to work for these labels.
    discount_pct = discount_labels.str.extract(
        r'^\s*([+-]?\d+(?:\.\d+)?)', expand=False
    ).astype(float)
    # Stay fail-loud like the original astype(float): a non-null label without a
    # leading number is an error, not a silent NaN.
    unparsed = discount_labels[discount_pct.isna() & discount_labels.notna()]
    if not unparsed.empty:
        raise ValueError(f"Unrecognised discount labels: {unparsed.unique().tolist()}")
    df_campaign['discount'] = discount_pct / 100

df_campaign.to_parquet("Marketing Department/parquets/cleaned_campaign.parquet")
# Kept as the last expression so the converted frame is actually rendered;
# mid-cell its value was discarded.
df_campaign.head()

## Transactional Campaign Data

In [9]:
# Load the per-order campaign transactions (comma-separated, pandas default).
df_transactional_campaign = pd.read_csv('Marketing Department/transactional_campaign_data.csv')
print(df_transactional_campaign)
# Index.ravel() returned an ndarray on pandas 1.x but returns an Index on pandas 2.x,
# so its printed form depends on the installed version. to_numpy() always yields the
# plain array of column names.
print(df_transactional_campaign.columns.to_numpy())
# Null count per column (rendered as the cell output).
df_transactional_campaign.isnull().sum()

        Unnamed: 0 transaction_date    campaign_id  \
0                2       2023-08-14  CAMPAIGN49972   
1                4       2021-10-12  CAMPAIGN46302   
2                6       2023-01-30  CAMPAIGN29983   
3                8       2022-01-05  CAMPAIGN46302   
4                9       2023-09-19  CAMPAIGN46302   
...            ...              ...            ...   
124882      499990       2022-04-14  CAMPAIGN61872   
124883      499991       2022-11-23  CAMPAIGN11190   
124884      499992       2023-07-28  CAMPAIGN33679   
124885      499995       2020-04-12  CAMPAIGN49972   
124886      499997       2023-03-24  CAMPAIGN52447   

                                    order_id estimated arrival  availed  
0       0612c246-57f1-40e8-9993-0f8d41992049            10days        1  
1       b4c411de-2fd3-4806-91ae-165edc9baa12            13days        0  
2       26de6b40-db2d-40b9-a64c-58736eaf0381             3days        1  
3       26b60a4e-aafe-4b99-bace-034d088a4a53           

Unnamed: 0           0
transaction_date     0
campaign_id          0
order_id             0
estimated arrival    0
availed              0
dtype: int64

## Data Profiling for Transactional Campaign Data

In [10]:
# Rows whose order_id already appeared in an earlier row; the recorded output is empty, i.e. no dupes.
df_transactional_campaign.loc[df_transactional_campaign.duplicated(subset='order_id')]

Unnamed: 0.1,Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed


### Checking for future dates

In [11]:
# Boolean mask of transactions dated strictly after 2024-01-01 00:00, the cutoff this
# notebook uses for "future" dates.
cond = pd.to_datetime(df_transactional_campaign['transaction_date']).gt(pd.Timestamp('2024-01-01'))
# Show the offending rows; the recorded output is empty.
df_transactional_campaign[cond]

Unnamed: 0.1,Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed


In [12]:
# Rows whose 'estimated arrival' text contains the substring 'days' (e.g. "10days").
# This is a format check, not a duplicate check.
df_transactional_campaign.loc[df_transactional_campaign['estimated arrival'].str.contains('days')]

Unnamed: 0.1,Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed
0,2,2023-08-14,CAMPAIGN49972,0612c246-57f1-40e8-9993-0f8d41992049,10days,1
1,4,2021-10-12,CAMPAIGN46302,b4c411de-2fd3-4806-91ae-165edc9baa12,13days,0
2,6,2023-01-30,CAMPAIGN29983,26de6b40-db2d-40b9-a64c-58736eaf0381,3days,1
3,8,2022-01-05,CAMPAIGN46302,26b60a4e-aafe-4b99-bace-034d088a4a53,8days,1
4,9,2023-09-19,CAMPAIGN46302,4aab29ae-e610-46bf-92af-199f6f420cee,14days,1
...,...,...,...,...,...,...
124882,499990,2022-04-14,CAMPAIGN61872,6a5163c2-ba62-461c-a729-5b06fe266bd9,12days,0
124883,499991,2022-11-23,CAMPAIGN11190,883e2fe2-bbdf-4b5e-8348-c2bcba07da0e,15days,1
124884,499992,2023-07-28,CAMPAIGN33679,575b0a2d-5539-4565-a10b-1f4f6516cd27,12days,1
124885,499995,2020-04-12,CAMPAIGN49972,ba506489-1505-481d-b0b1-8cdbf04a515f,6days,1


In [13]:
# Persist the transactional frame as parquet. The visible cells only profile this frame;
# none of them transform it before it is written.
df_transactional_campaign.to_parquet(path="Marketing Department/parquets/cleaned_transaction_campaign.parquet")