# Data Preparation

### Import necessary libraries

In [31]:
from pathlib import Path

import pandas as pd

### Load data from the Google Sheet

In [32]:
sheet_id = "1OHl5u6-31KyQSYHeLJBXAk9Xc4eiLPviXIlc6ycaTM4"

# Download the sheet through its CSV export URL and parse it into a DataFrame.
df = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv")

# Create the output directory if it is missing, so the notebook runs on a fresh
# checkout; to_csv does not create parent directories itself.
Path("../data").mkdir(parents=True, exist_ok=True)

# Keep an untouched snapshot of the download next to the derived files.
# NOTE(review): the default index=True writes the RangeIndex as an unnamed first
# column; left as-is because downstream readers may rely on it - confirm.
df.to_csv("../data/ab_raw_data.csv")

df.head()

Unnamed: 0,user_id,install_date,country,test_group,trial,paid,subscription_name,revenue_1m
0,0000dd3fa4702a63d1b76aaffe1ab39b,2023-06-05,US,treatment,0,0,,0.0
1,0001f27ab7e22228e54c8b2028b43f24,2023-06-07,AU,treatment,0,0,,0.0
2,0006c5c547801308b36ea3cf669856ae,2023-06-07,AU,treatment,0,0,,0.0
3,000d1a300263c5db91cbefa3852898a7,2023-06-07,AU,control,0,0,,0.0
4,000e5e62e8746e467ed9f49ac5de3208,2023-06-06,US,treatment,0,0,,0.0


### Check data for consistency and NaN values

In [33]:
# Show each column's dtype and non-null count to spot missing values and wrong types.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15285 entries, 0 to 15284
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   user_id            15285 non-null  object 
 1   install_date       15285 non-null  object 
 2   country            15285 non-null  object 
 3   test_group         15285 non-null  object 
 4   trial              15285 non-null  int64  
 5   paid               15285 non-null  int64  
 6   subscription_name  1723 non-null   object 
 7   revenue_1m         15285 non-null  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 955.4+ KB


In [34]:
# Parse the install_date strings into datetime values, then re-print the schema
# to confirm the new dtype.
df = df.assign(install_date=pd.to_datetime(df["install_date"]))

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15285 entries, 0 to 15284
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   user_id            15285 non-null  object        
 1   install_date       15285 non-null  datetime64[ns]
 2   country            15285 non-null  object        
 3   test_group         15285 non-null  object        
 4   trial              15285 non-null  int64         
 5   paid               15285 non-null  int64         
 6   subscription_name  1723 non-null   object        
 7   revenue_1m         15285 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 955.4+ KB


In [35]:
# Check that the trial and paid columns contain only 0 and 1 values;
# the result is one boolean per column.
flag_columns = ["trial", "paid"]

df[flag_columns].isin([0, 1]).all()

trial    True
paid     True
dtype: bool

In [36]:
# Summary statistics (count, mean, quantiles, std) to sanity-check value ranges.
df.describe()

Unnamed: 0,install_date,trial,paid,revenue_1m
count,15285,15285.0,15285.0,15285.0
mean,2023-06-04 01:01:08.537782272,0.112725,0.017337,0.097297
min,2023-06-01 00:00:00,0.0,0.0,0.0
25%,2023-06-02 00:00:00,0.0,0.0,0.0
50%,2023-06-04 00:00:00,0.0,0.0,0.0
75%,2023-06-06 00:00:00,0.0,0.0,0.0
max,2023-06-07 00:00:00,1.0,1.0,11.96
std,,0.316267,0.130529,0.782339


In [37]:
# NOTE(review): repeats the schema dump printed right after the datetime conversion;
# nothing modifies df in between, so this cell is a candidate for removal.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15285 entries, 0 to 15284
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   user_id            15285 non-null  object        
 1   install_date       15285 non-null  datetime64[ns]
 2   country            15285 non-null  object        
 3   test_group         15285 non-null  object        
 4   trial              15285 non-null  int64         
 5   paid               15285 non-null  int64         
 6   subscription_name  1723 non-null   object        
 7   revenue_1m         15285 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 955.4+ KB


In [38]:
# Make sure each user id is unique (one row per user); the per-group
# user counts computed later rely on this.

df["user_id"].is_unique

True

In [39]:
# Check that text columns are consistent: list the distinct values to catch
# typos, casing variants or unexpected categories.

df["country"].unique()

array(['US', 'AU', 'CA', 'GB'], dtype=object)

In [40]:
# Distinct experiment-arm labels present in the data.
df["test_group"].unique()

array(['treatment', 'control'], dtype=object)

In [41]:
# Distinct subscription names; NaN appears because the column is empty for most rows
# (presumably users without a subscription - confirm).
df["subscription_name"].unique()

array([nan, 'monthly.5.99.3d.trial', 'weekly.2.99.3d.trial'], dtype=object)

### Save cleaned data as a CSV file

In [42]:
# Save the checked frame for later use.
# NOTE(review): the default index=True writes the RangeIndex as an unnamed first column,
# and CSV stores install_date as text, so readers may need index_col=0 / parse_dates - confirm.
df.to_csv("../data/ab_cleaned_data.csv")

### Create a summary dataframe for analysis and save it as a CSV file

In [43]:
# Build one summary row per experiment arm.
# trial and paid are 0/1 flags, so summing them counts the users with the flag set;
# counting user_id gives the number of users in the arm.
per_group = df.groupby("test_group")

analysis_df = per_group.agg(
    total_visitors=("user_id", "count"),
    conversions=("paid", "sum"),
    trial_starters=("trial", "sum"),
).reset_index()

analysis_df.head()

Unnamed: 0,test_group,total_visitors,conversions,trial_starters
0,control,7627,116,925
1,treatment,7658,149,798


In [44]:
# Save the per-group summary for the analysis step.
# NOTE(review): the default index=True also writes the 0..n-1 index as an unnamed first
# column; confirm whether downstream readers expect it.
analysis_df.to_csv("../data/ab_analysis_data.csv")