# Importing Required Libraries

In [1]:
import pandas as pd
from scipy.stats import chi2_contingency

# Reading and Exploring the Data

In [2]:
# Load the AdSmart dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('AdSmart.csv')

In [5]:
# Preview the first three rows to see which columns are available.
df.head(3)

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,6,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,6,Chrome Mobile WebView,0,1


In [7]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

0

In [9]:
# Drop the auction identifier; none of the cells below use it.
# errors='ignore' keeps this cell idempotent: `df` is reassigned in place, so
# re-running the cell would otherwise raise KeyError once the column is gone.
df = df.drop(columns=['auction_id'], errors='ignore')

In [11]:
# Dimensions as (rows, columns).
df.shape

(8077, 8)

In [13]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8077 entries, 0 to 8076
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   experiment   8077 non-null   object
 1   date         8077 non-null   object
 2   hour         8077 non-null   int64 
 3   device_make  8077 non-null   object
 4   platform_os  8077 non-null   int64 
 5   browser      8077 non-null   object
 6   yes          8077 non-null   int64 
 7   no           8077 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 504.9+ KB


In [15]:
# Number of missing values in each column.
df.isna().sum()

experiment     0
date           0
hour           0
device_make    0
platform_os    0
browser        0
yes            0
no             0
dtype: int64

In [40]:
# Distinct values present in the 'no' response flag.
df['no'].unique()

array([0, 1], dtype=int64)

In [36]:
# Distinct values present in the 'yes' response flag.
df['yes'].unique()

array([0, 1], dtype=int64)

#### I am now confident that the data is clean: there are no nulls, duplicates, or outliers, and all the data types are suitable for my testing.

In [34]:
# Summary statistics for the numeric columns, transposed so each column is one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
hour,8077.0,11.61508,5.734879,0.0,7.0,13.0,15.0,23.0
platform_os,8077.0,5.947134,0.224333,5.0,6.0,6.0,6.0,7.0
yes,8077.0,0.070818,0.256537,0.0,0.0,0.0,0.0,1.0
no,8077.0,0.083075,0.276013,0.0,0.0,0.0,0.0,1.0


In [50]:
# Print the distinct values of the group, OS and response-flag columns.
for label, column in (
    ("Experiment :", 'experiment'),
    ("platform_os :", 'platform_os'),
    ("Yes :", 'yes'),
    ("No :", 'no'),
):
    print(label, df[column].unique())

Experiment : ['exposed' 'control']
platform_os : [6 5 7]
Yes : [0 1]
No : [0 1]


In [58]:
# Rows whose 'yes' flag is set; the cell displays the subset's (rows, columns).
yes_val = df.loc[df['yes'].eq(1)]
yes_val.shape

(572, 8)

In [60]:
# Rows whose 'no' flag is set; the cell displays the subset's (rows, columns).
no_val = df.loc[df['no'].eq(1)]
no_val.shape

(671, 8)

In [70]:
# Full dataset dimensions, for comparison with the yes/no subsets.
df.shape

(8077, 8)

# Data Preprocessing

In [74]:
# Many rows carry no response at all: neither the 'yes' nor the 'no' flag is set.
# Count those rows directly instead of subtracting the yes/no subset sizes from
# the total, so the figure stays correct even if a row were flagged both ways.
No_response = int(((df['yes'] == 0) & (df['no'] == 0)).sum())
print(No_response)

6834


In [78]:
# Keep only the rows with a response, i.e. where at least one flag equals 1.
answered = df[['yes', 'no']].eq(1).any(axis=1)
df1 = df[answered]

In [80]:
# Inspect the filtered frame: row count, dtypes and non-null counts.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1243 entries, 2 to 8071
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   experiment   1243 non-null   object
 1   date         1243 non-null   object
 2   hour         1243 non-null   int64 
 3   device_make  1243 non-null   object
 4   platform_os  1243 non-null   int64 
 5   browser      1243 non-null   object
 6   yes          1243 non-null   int64 
 7   no           1243 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 87.4+ KB


In [92]:
# Total the 'yes' and 'no' answers within each experiment group.
responses_by_group = df1.groupby('experiment')
rate = responses_by_group[['yes', 'no']].sum()

In [94]:
# Conversion rate = share of 'yes' answers among all answers (yes + no) per group.
answers_per_group = rate[['yes', 'no']].sum(axis=1)
rate['conversion_rate'] = rate['yes'].div(answers_per_group)

In [100]:
# Display the per-group answer counts and conversion rate.
rate

Unnamed: 0_level_0,yes,no,conversion_rate
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,264,322,0.450512
exposed,308,349,0.468798


# A/B Testing

In [106]:
# Observed counts as a 2x2 table: one row per group, columns are (yes, no).
contingency_table = [
    [rate.loc[group, outcome] for outcome in ('yes', 'no')]
    for group in ('control', 'exposed')
]

# Chi-square test of independence between experiment group and response.
# NOTE: on a 2x2 table chi2_contingency applies Yates' continuity correction
# by default (correction=True), so the statistic reported here is the corrected one.
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

# Report the test statistic and its p-value.
print("Chi-Square Statistic:", chi2)
print("P-Value:", p_value)

Chi-Square Statistic: 0.3465426444731172
P-Value: 0.5560768104229136


#### The p-value of 0.5561 is much higher than the common significance level of 0.05. This means we fail to reject the null hypothesis, suggesting that the difference in conversion rates between the control and exposed groups is not statistically significant.