In [34]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "marketing_AB.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "faviovaz/marketing-ab-testing",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())

  df = kagglehub.load_dataset(


First 5 records:    Unnamed: 0  user id test group  converted  total ads most ads day  \
0           0  1069124         ad      False        130       Monday   
1           1  1119715         ad      False         93      Tuesday   
2           2  1144181         ad      False         21      Tuesday   
3           3  1435133         ad      False        355      Tuesday   
4           4  1015700         ad      False        276       Friday   

   most ads hour  
0             20  
1             22  
2             18  
3             10  
4             14  


In [29]:
df.columns

Index(['Unnamed: 0', 'user id', 'test group', 'converted', 'total ads',
       'most ads day', 'most ads hour'],
      dtype='object')

In [30]:
# prompt: delete Unnamed: 0 column

df = df.drop('Unnamed: 0', axis=1)


In [13]:

duplicate_rows_df = df[df.duplicated()]
print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicate_rows_df)
print(f"Number of duplicate rows: {len(duplicate_rows_df)}")


Duplicate Rows except first occurrence based on all columns are :
Empty DataFrame
Columns: [user id, test group, converted, total ads, most ads day, most ads hour]
Index: []
Number of duplicate rows: 0


In [14]:
# prompt: check missing values in each column

import pandas as pd

# Assuming 'df' is your DataFrame (loaded from the previous code)
# If not, replace with your DataFrame loading method

missing_values = df.isnull().sum()
missing_values


Unnamed: 0,0
user id,0
test group,0
converted,0
total ads,0
most ads day,0
most ads hour,0


In [15]:
df.head(5)

Unnamed: 0,user id,test group,converted,total ads,most ads day,most ads hour
0,1069124,ad,False,130,Monday,20
1,1119715,ad,False,93,Tuesday,22
2,1144181,ad,False,21,Tuesday,18
3,1435133,ad,False,355,Tuesday,10
4,1015700,ad,False,276,Friday,14


In [8]:
df.columns

Index(['user id', 'test group', 'converted', 'total ads', 'most ads day',
       'most ads hour'],
      dtype='object')

In [16]:
# prompt: convert categorical columns to numerical using label encoder

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assuming 'df' is your DataFrame (loaded from the previous code)
# If not, replace with your DataFrame loading method

# Identify categorical columns
categorical_cols = ['test group','converted','most ads day']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Iterate through categorical columns and apply label encoding
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

print(df.head())


   user id  test group  converted  total ads  most ads day  most ads hour
0  1069124           0          0        130             1             20
1  1119715           0          0         93             5             22
2  1144181           0          0         21             5             18
3  1435133           0          0        355             5             10
4  1015700           0          0        276             0             14


In [None]:
#Hypothesis-

#Null Hypothesis (H₀): There is no difference in conversion rates between the ad and psa groups
#Alternative Hypothesis (H₁): The conversion rate of the ad group is higher than the psa group

In [17]:
conversion_rate_table = pd.crosstab(df['test group'], df['converted'], normalize='index')
print(conversion_rate_table)

converted          0         1
test group                    
0           0.974453  0.025547
1           0.982146  0.017854


In [21]:
# Perform Chi-Square test
from scipy.stats import chi2_contingency
from scipy.stats import chi2_contingency
alpha = 0.05
for variable in df.columns:
    if variable != 'converted' and variable != 'user id':
        #Create a contingenct table (cross-tabulation)
        contingenct_table = pd.crosstab(df[variable], df['converted'])
        #Perform chi-squared test
        chi2, p, _, _ = chi2_contingency(contingenct_table)
        #Display the results
        print(f"\nChi-squared test for {variable} vs. converted:")
        print(f"Chi-squared value: {chi2}")
        print(f"p-value: {p}")
        #Check for significance
        if p < alpha:
            print(f"The difference in conversion rates across {variable} is statistically significant")
        else:
            print(f"There is no significant difference in conversion rates across {variable}")


Chi-squared test for test group vs. converted:
Chi-squared value: 54.005823883685245
p-value: 1.9989623063390075e-13
The difference in conversion rates across test group is statistically significant

Chi-squared test for total ads vs. converted:
Chi-squared value: 48001.99019466958
p-value: 0.0
The difference in conversion rates across total ads is statistically significant

Chi-squared test for most ads day vs. converted:
Chi-squared value: 410.0478857936585
p-value: 1.932184379244731e-85
The difference in conversion rates across most ads day is statistically significant

Chi-squared test for most ads hour vs. converted:
Chi-squared value: 430.76869230822086
p-value: 8.027629823696771e-77
The difference in conversion rates across most ads hour is statistically significant


In [24]:
df.columns

Index(['user id', 'test group', 'converted', 'total ads', 'most ads day',
       'most ads hour'],
      dtype='object')

In [33]:
df=df[['most ads day','converted']]

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first')
X = encoder.fit_transform(df[['most ads day']])
y = df['converted'].astype(int)

# Train logistic regression model
model = LogisticRegression()
model.fit(X, y)

# Print feature importance (coefficients)
feature_names = encoder.get_feature_names_out(['most ads day'])
coef_importance = dict(zip(feature_names, model.coef_[0]))

print("Feature Importance (Day/Hour Effect on Conversion):")
for feature, coef in coef_importance.items():
    print(f"{feature}: {coef:.4f}")

Feature Importance (Day/Hour Effect on Conversion):
most ads day_Monday: 0.4132
most ads day_Saturday: -0.0461
most ads day_Sunday: 0.0928
most ads day_Thursday: -0.0261
most ads day_Tuesday: 0.3055
most ads day_Wednesday: 0.1122


In [35]:
df=df[['most ads hour','converted']]

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first')
X = encoder.fit_transform(df[['most ads hour']])
y = df['converted'].astype(int)

# Train logistic regression model
model = LogisticRegression()
model.fit(X, y)

# Print feature importance (coefficients)
feature_names = encoder.get_feature_names_out(['most ads hour'])
coef_importance = dict(zip(feature_names, model.coef_[0]))

print("Feature Importance (Day/Hour Effect on Conversion):")
for feature, coef in coef_importance.items():
    print(f"{feature}: {coef:.4f}")

Feature Importance (Day/Hour Effect on Conversion):
most ads hour_1: -0.6803
most ads hour_2: -1.0300
most ads hour_3: -0.5361
most ads hour_4: -0.1286
most ads hour_5: -0.0776
most ads hour_6: -0.1453
most ads hour_7: -0.4857
most ads hour_8: -0.3574
most ads hour_9: -0.2417
most ads hour_10: -0.1233
most ads hour_11: -0.1064
most ads hour_12: -0.0360
most ads hour_13: -0.0021
most ads hour_14: 0.1244
most ads hour_15: 0.1794
most ads hour_16: 0.2165
most ads hour_17: 0.1315
most ads hour_18: 0.1052
most ads hour_19: 0.0847
most ads hour_20: 0.1884
most ads hour_21: 0.1590
most ads hour_22: 0.0672
most ads hour_23: -0.1124
