### **DATA CLEANING**

In [1]:
from urllib.parse import quote_plus

import pandas as pd
import sqlalchemy


# Raw strings are used because the server name contains a backslash, which
# would otherwise be read as the start of an escape sequence.
connection_data = conn_str = (
    r'DRIVER={SQL Server};'
    r'SERVER=KRISHNA\SQLEXPRESS;'
    r'DATABASE=MarketingCampaignDB;'
    r'Trusted_Connection=yes;'
)

# The ODBC string travels inside a URL query parameter, so it has to be
# URL-encoded first: it contains ';', '=', braces, a backslash and a space,
# none of which are safe to leave raw inside a URL. The pyodbc dialect decodes
# the value again before handing it to the driver.
engine = sqlalchemy.create_engine(
    f'mssql+pyodbc:///?odbc_connect={quote_plus(conn_str)}'
)
# Kept open at module level because later cells write back through it.
connection = engine.connect()

# Read every row of the source table into a DataFrame.
query = 'SELECT * FROM Marketing_AB'
df = pd.read_sql(query, connection)

In [2]:
def wrangle(df, column='total_ads', whisker_factor=3):
    """
    Remove outliers from a dataframe using the interquartile range (IQR) method.

    A row is kept when its value in `column` lies inside the inclusive range
    [Q1 - whisker_factor * IQR, Q3 + whisker_factor * IQR], where Q1 and Q3 are
    the 25th and 75th percentiles of that column and IQR = Q3 - Q1. Rows whose
    value is missing (NaN) fail both comparisons and are therefore dropped.

    Arguments:
    df : Pandas DataFrame
        The input dataframe to be cleaned. It is not modified.
    column : str, default 'total_ads'
        Name of the numeric column used to detect outliers.
    whisker_factor : int or float, default 3
        Multiple of the IQR added above Q3 and subtracted below Q1 to form the
        fences. The default of 3 is wider than the common 1.5 and so removes
        only the more extreme values.

    Returns:
    cleaned_df : Pandas DataFrame
        The rows of `df` that fall inside the fences, with the original index
        labels preserved.
    """
    values = df[column]

    quartile_1st = values.quantile(0.25)
    quartile_3rd = values.quantile(0.75)
    iqr = quartile_3rd - quartile_1st

    # Fences beyond which a value is treated as an outlier
    lower_whisker = quartile_1st - whisker_factor * iqr
    upper_whisker = quartile_3rd + whisker_factor * iqr

    # Keep only the rows inside the fences (bounds included)
    cleaned_df = df[(values >= lower_whisker) & (values <= upper_whisker)]
    return cleaned_df

In [3]:
# Strip the outliers from the original dataset, then store the cleaned copy
# back on the server (overwriting any earlier version) for future use.
cleaned_df = wrangle(df)
cleaned_df.to_sql(
    name='Cleaned_Marketing_AB',
    con=connection,
    if_exists='replace',
    index=False,
)

-1

### **A/B TESTING**

In [4]:
def calculate_conversion_statistics(ad_group, psa_group, alpha=0.05):
    """
    Compare the conversion rates of two groups with a two-sided, two-proportion z-test.

    For each group the function computes the sample size, the total number of
    conversions and the conversion rate. It then computes the z-statistic and
    p-value for the difference in rates (ad minus psa) and a (1 - alpha)
    confidence interval for that difference, and prints which hypothesis the
    data supports.

    Arguments:
    ad_group : Pandas DataFrame
        The first group (e.g., 'ad'). Must have a 'converted' column; the
        calculation assumes it holds booleans (or 0/1 values).
    psa_group : Pandas DataFrame
        The second group (e.g., 'psa'), with the same 'converted' column.
    alpha : float, default 0.05
        Significance level. The null hypothesis is rejected when the p-value
        is below `alpha`, and the confidence level of the interval is
        1 - alpha (95% by default).

    Returns:
    stats : Pandas Dataframe
        One row per group with the columns 'group', 'participants',
        'total_conversions', 'conversion_rate', 'z_statistic', 'p_value',
        'Lower_CI' and 'Upper_CI'. The last four describe the comparison as a
        whole, so they are filled on the 'ad' row only and missing on the
        'psa' row.

    Raises:
    ValueError
        If `alpha` is not strictly between 0 and 1.
    ZeroDivisionError
        If either group is empty.
    """

    # Imported locally so the function is self-contained
    from scipy.stats import norm
    import pandas as pd

    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")

    # Devising Hypothesis
    null_hypothesis = 'There is no real significant difference in the conversion rates of ad and psa group suggesting both perform similar in converting the users'
    alternative_hypothesis = 'There is a significant difference in the conversion rates of ad and psa group suggesting one performs better in converting the users'

    # Sample size of each group
    ad_n = len(ad_group)
    psa_n = len(psa_group)

    # With boolean values, the sum is the number of conversions ...
    ad_total_conversions = ad_group['converted'].sum()
    psa_total_conversions = psa_group['converted'].sum()

    # ... and the mean is the conversion rate
    ad_conversion_rate = ad_group['converted'].mean()
    psa_conversion_rate = psa_group['converted'].mean()
    rate_difference = ad_conversion_rate - psa_conversion_rate

    # Hypothesis test: under the null hypothesis both groups share a single
    # conversion rate, so the standard error is built from the pooled rate.
    pooled_conversion_rate = (ad_total_conversions + psa_total_conversions) / (ad_n + psa_n)
    pooled_se = (pooled_conversion_rate * (1 - pooled_conversion_rate) * (1/ad_n + 1/psa_n)) ** 0.5

    # The z-statistic is the observed difference expressed in standard errors
    # away from zero (the value claimed by the null hypothesis).
    z_statistic = rate_difference / pooled_se

    # Two-sided p-value: probability of a difference at least this extreme if
    # the null hypothesis were true. The survival function is used instead of
    # 1 - cdf because the latter rounds to exactly 0 for large |z|.
    p_value = 2 * norm.sf(abs(z_statistic))

    # Confidence interval: this estimates the actual difference and does not
    # assume the rates are equal, so each group contributes its own variance
    # (unpooled standard error) rather than the pooled one used for the test.
    unpooled_se = (
        ad_conversion_rate * (1 - ad_conversion_rate) / ad_n
        + psa_conversion_rate * (1 - psa_conversion_rate) / psa_n
    ) ** 0.5
    margin_of_error = norm.ppf(1 - alpha / 2) * unpooled_se
    confidence_interval = (
        rate_difference - margin_of_error,  # Lower
        rate_difference + margin_of_error   # Upper
    )

    stats = pd.DataFrame({
        'group': ['ad', 'psa'],
        'participants': [ad_n, psa_n],
        'total_conversions': [ad_total_conversions, psa_total_conversions],
        'conversion_rate': [ad_conversion_rate, psa_conversion_rate],
        'z_statistic': [z_statistic, None],
        'p_value': [p_value, None],
        'Lower_CI': [confidence_interval[0], None],
        'Upper_CI': [confidence_interval[1], None]
    })

    # For a two-sided z-test, p_value < alpha is the same decision as |z|
    # exceeding the critical value, so a single comparison is enough.
    if p_value < alpha:
        print(f"We reject the null hypothesis and accept the alternative hypothesis.\n\n{alternative_hypothesis}")
    else:
        print(f"We fail to reject the null hypothesis.\n\n{null_hypothesis}")

    return stats

    

In [5]:
# Split the cleaned data by experiment arm: users shown the ad vs. the PSA
ad_group = cleaned_df.loc[cleaned_df['test_group'].eq('ad')]
psa_group = cleaned_df.loc[cleaned_df['test_group'].eq('psa')]

# Run the A/B comparison on the two arms
conversion_stats = calculate_conversion_statistics(ad_group, psa_group)

# Export the summary table as a CSV on the local desktop.
# NOTE(review): this is a machine-specific absolute path; consider making it configurable.
conversion_stats.to_csv(
    'C:/Users/91626/Desktop/conversion_statistics.csv',
    index=False,
)

We reject the null hypothesis and accept the alternative hypothesis.

There is a significant difference in the conversion rates of ad and psa group suggesting one performs better in converting the users
