# A/B Testing and ETL

In [None]:
from pymongo import MongoClient
from pymongo.collection import Collection
from teaching_tools.ab_test.reset import Reset

# Reset the database through the course helper before running the notebook
# (presumably restores the collection to its initial state; confirm in
# `teaching_tools.ab_test.reset`).
r = Reset()
r.reset_database()

import math
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy
from pymongo import MongoClient
from statsmodels.stats.contingency_tables import Table2x2
from statsmodels.stats.power import GofChisquarePower
from teaching_tools.ab_test.experiment import Experiment
from country_converter import CountryConverter

In [None]:
# Let's instantiate the MongoDB connection
# Create `client`, pointing at the MongoDB server on localhost:27017
client = MongoClient(host="localhost", port=27017)
# Create `db`, a handle to the `"wqu-abtest"` database
db = client["wqu-abtest"]
# Assign `"mscfe-applicants"` collection to `mscfe_app`
mscfe_app = db["mscfe-applicants"]

In [None]:
# Query MongoDB: one output document per `countryISO2` value, with the
# number of applicants holding that nationality in `count`
result = mscfe_app.aggregate(
    [{"$group": {"_id": "$countryISO2", "count": {"$count": {}}}}]
)

# Load the aggregation into a DataFrame, give the grouping key a readable
# column name, and order the rows from fewest to most applicants
df_nationality = (
    pd.DataFrame(result)
    .rename(columns={"_id": "country_iso2"})
    .sort_values("count")
)

print("df_nationality type:", type(df_nationality))
print("df_nationality shape", df_nationality.shape)
df_nationality.head()

In [None]:
# Instantiate `CountryConverter`
cc = CountryConverter()

# Create `"country_name"` column: short country names converted from the
# ISO2 codes in `"country_iso2"`
df_nationality["country_name"] = cc.convert(
    df_nationality["country_iso2"], to="name_short"
)

# Create `"country_iso3"` column: ISO3 codes converted from the same ISO2
# codes (this is the column the choropleth uses for `locations`)
df_nationality["country_iso3"] = cc.convert(df_nationality["country_iso2"], to="ISO3")

print("df_nationality type:", type(df_nationality))
print("df_nationality shape", df_nationality.shape)
df_nationality.head()

In [None]:
# Create `build_nat_choropleth` function
def build_nat_choropleth():
    """Build a choropleth map of MScFE applicant counts by nationality.

    Reads the module-level ``df_nationality`` DataFrame, using its
    ``"country_iso3"`` column to locate countries and its ``"count"``
    column to colour them.

    Returns
    -------
    fig
        The figure object returned by ``plotly.express.choropleth``.
    """
    choropleth_options = {
        "data_frame": df_nationality,
        "locations": "country_iso3",
        "color": "count",
        "projection": "natural earth",
        "color_continuous_scale": px.colors.sequential.Oranges,
        "title": "MScFE Applicants: Nationalities",
    }
    return px.choropleth(**choropleth_options)

In [None]:
# ETL processes creating a class
class MongoRepository:
    """Repository class for interacting with MongoDB database.

    Parameters
    ----------
    client : `pymongo.MongoClient`, optional
        By default, `MongoClient(host='localhost', port=27017)`, created
        when the repository is instantiated.
    db : str
        By default, `'wqu-abtest'`.
    collection : str
        By default, `'mscfe-applicants'`.

    Attributes
    ----------
    collection : pymongo.collection.Collection
        All data will be extracted from and loaded to this collection.
    """

    # Task 7.2.14
    def __init__(
        self,
        client=None,
        db="wqu-abtest",
        collection="mscfe-applicants",
    ):
        # Build the default client here rather than in the signature: a
        # default written in the signature is evaluated once, when the class
        # is defined, and would then be shared by every instance.
        if client is None:
            client = MongoClient(host="localhost", port=27017)
        self.collection = client[db][collection]

    # Task 7.2.17
    def find_by_date(self, date_string):
        """Find no-quiz applicants whose records were created on a given date.

        Parameters
        ----------
        date_string : str
            Date to query. Format must be '%Y-%m-%d', e.g. '2022-06-28'.

        Returns
        -------
        observations : list
            Result of query. List of documents (dictionaries) from
            ``self.collection`` whose ``createdAt`` falls on that date and
            whose ``admissionsQuiz`` is ``"incomplete"``.
        """
        # Convert `date_string` to datetime object
        start = pd.to_datetime(date_string, format="%Y-%m-%d")
        # Offset `start` by 1 day to get the exclusive upper bound
        end = start + pd.DateOffset(days=1)
        # Create PyMongo query for no-quiz applicants b/t `start` and `end`
        query = {
            "createdAt": {"$gte": start, "$lt": end},
            "admissionsQuiz": "incomplete",
        }
        # Query collection and materialize the cursor as a list
        return list(self.collection.find(query))

    def update_applicants(self, observations_assigned):
        """Update applicant documents in collection.

        Parameters
        ----------
        observations_assigned : list
            Documents that will be used to update ``self.collection``.
            Each document is matched on its ``"_id"`` and all of its
            fields are written with ``$set``.

        Returns
        -------
        transaction_result : dict
            Status of update operation: ``"n"`` is the number of documents
            matched and ``"nModified"`` the number of documents modified.
        """
        # Initialize counters
        n = 0
        n_modified = 0

        # Iterate through applicants
        for doc in observations_assigned:
            # Update the stored document that has the same `_id`
            result = self.collection.update_one(
                filter={"_id": doc["_id"]},
                update={"$set": doc},
            )
            # Update counters
            n += result.matched_count
            n_modified += result.modified_count

        # Create results
        transaction_result = {"n": n, "nModified": n_modified}
        return transaction_result

    def assign_to_groups(self, date_string):
        """Randomly assign one day's no-quiz applicants to experiment groups.

        The applicants returned by `find_by_date` are shuffled, the first
        half is labelled as the control group and the rest as the treatment
        group, and the labelled documents are written back to the
        collection.

        Parameters
        ----------
        date_string : str
            Date whose applicants are assigned. Format must be
            '%Y-%m-%d', e.g. '2022-06-28'.

        Returns
        -------
        result : dict
            Status of the update operation, as returned by
            `update_applicants`.
        """
        # Get observations
        observations = self.find_by_date(date_string)

        # Shuffle `observations`. The fixed seed makes the split
        # reproducible; note that it reseeds the module-level `random`
        # generator.
        random.seed(42)
        random.shuffle(observations)

        # Get index position of item at observations halfway point
        idx = len(observations) // 2

        # Assign first half of observations to control group
        for doc in observations[:idx]:
            doc["inExperiment"] = True
            doc["group"] = "no email (control)"

        # Assign second half of observations to treatment group
        # (this half gets the extra document when the count is odd)
        for doc in observations[idx:]:
            doc["inExperiment"] = True
            doc["group"] = "email (treatment)"

        # Update collection
        result = self.update_applicants(observations)
        return result

    # Task 7.5.14: `find_exp_observations` method
    def find_exp_observations(self):
        """Find all documents that are part of the experiment.

        Returns
        -------
        observations : list
            Documents (dictionaries) in ``self.collection`` whose
            ``inExperiment`` field is ``True``.
        """
        # Query the repository's own collection, not a notebook-level global
        observations = list(self.collection.find({"inExperiment": True}))
        return observations

In [None]:
# Instantiate the MongoRepository class with its default client, database,
# and collection
repo = MongoRepository()
print("repo type:", type(repo))
repo

In [None]:
# Assign the no-quiz applicants created on `date` to the control and
# treatment groups, and write the assignments back to the database
date = "2022-06-02"
repo.assign_to_groups(date)

In [None]:
# Chi-square power analysis: solve for the number of observations needed
# per group for effect size 0.5, significance level 0.05, and power 0.8
chi_square_power = GofChisquarePower()
# `solve_power` returns a float; round up to a whole number of applicants
group_size = math.ceil(
    chi_square_power.solve_power(effect_size=0.5, alpha=0.05, power=0.8)
)

print("Group size:", group_size)
# Two groups (control and treatment), hence twice the group size
print("Total # of applicants needed:", group_size * 2)

In [None]:
# Count no-quiz applicants per sign-up day: keep only documents whose
# admissions quiz is incomplete, then group them by `createdAt` truncated
# to the day
result = mscfe_app.aggregate(
    [
        {"$match": {"admissionsQuiz": "incomplete"}},
        {
            "$group": {
                "_id": {"$dateTrunc": {"date": "$createdAt", "unit": "day"}},
                "count": {"$sum": 1},
            }
        },
    ]
)

# Turn the aggregation into a date-indexed, chronologically sorted Series
# of daily new-user counts
no_quiz_mscfe = (
    pd.DataFrame(result)
    .rename(columns={"_id": "date", "count": "new_users"})
    .set_index("date")
    .sort_index()
    .squeeze()
)

print("no_quiz type:", type(no_quiz_mscfe))
print("no_quiz shape:", no_quiz_mscfe.shape)

In [None]:
# Mean and standard deviation of the daily no-quiz applicant counts,
# read from the summary statistics produced by `describe()`
mean = no_quiz_mscfe.describe()["mean"]
std = no_quiz_mscfe.describe()["std"]
print("no_quiz mean:", mean)
print("no_quiz std:", std)

In [None]:
# Let's calculate the mean and std of the total number of no-quiz applicants
# over the 7-day duration of the experiment
exp_days = 7
# The mean of a sum of `exp_days` daily counts is `exp_days` times the daily mean
sum_mean = mean * exp_days
# The std of the sum grows with sqrt(exp_days)
# (assumes the daily counts are independent and identically distributed)
sum_std = std * math.sqrt(exp_days)
print("Mean of sum:", sum_mean)
print("Std of sum:", sum_std)

In [None]:
# Probability that the experiment collects more than the required number of
# applicants (`group_size * 2`), modelling the total over `exp_days` days as
# a normal distribution with mean `sum_mean` and std `sum_std`.
# NOTE(review): the variable names and the printed message hard-code 65,
# while the threshold actually used is `group_size * 2` — confirm they agree.
prob_65_or_fewer = scipy.stats.norm.cdf(
    group_size * 2,
    loc=sum_mean,
    scale=sum_std,
)
# Complement of the CDF: probability of exceeding the threshold
prob_65_or_greater = 1 - prob_65_or_fewer

print(
    f"Probability of getting 65+ no_quiz in {exp_days} days:",
    round(prob_65_or_greater, 3),
)

In [None]:
# Running the experiment with the course helper, pointed at the same
# database and collection used above
exp = Experiment(repo=client, db="wqu-abtest", collection="mscfe-applicants")
# Presumably clears any previous experiment data — confirm in `teaching_tools`
exp.reset_experiment()
# Run for `exp_days` days with `assignment=True`
# (presumably the helper assigns applicants to groups itself — confirm)
result = exp.run_experiment(days=exp_days, assignment=True)
print("result type:", type(result))
result

In [None]:
# Loading the experiment observations (documents with `inExperiment` set to
# True) into a DataFrame, dropping every row that has a missing value
result = repo.find_exp_observations()
df = pd.DataFrame(result).dropna()

print("df type:", type(df))
print("df shape:", df.shape)
df.head()

In [None]:
# Calculating a crosstab that shows the results of the experiment:
# rows are the experiment groups, columns are the admissions-quiz statuses,
# and the cells hold raw counts (`normalize=False`)
data = pd.crosstab(
    index=df["group"],
    columns=df["admissionsQuiz"],
    normalize=False
)

print("data type:", type(data))
print("data shape:", data.shape)
data

In [None]:
# Let's represent the results in a bar chart
def build_contingency_bar():
    """Build a grouped bar chart of quiz completion counts per group.

    Reads the module-level ``data`` crosstab and plots it with
    ``plotly.express.bar`` in grouped (side-by-side) mode, then labels the
    axes and the legend.

    Returns
    -------
    fig
        The figure returned by ``plotly.express.bar`` after its layout has
        been updated.
    """
    layout_options = {
        "xaxis_title": "Group",
        "yaxis_title": "Frequency [count]",
        "legend": {"title": "Admissions Quiz"},
    }
    # Side-by-side bars: one bar per crosstab column within each group
    bar_chart = px.bar(
        data,
        barmode="group",
        title="MScFE: Admissions Quiz Completion by Group",
    )
    bar_chart.update_layout(**layout_options)
    return bar_chart

In [None]:
# Create the contingency table from the raw counts in the crosstab
contingency_table = Table2x2(data.values)

print("contingency_table type:", type(contingency_table))
# Show the table exactly as it was passed in
contingency_table.table_orig

In [None]:
# Results of the chi-square test of independence between group and
# admissions-quiz status
chi_square_test = contingency_table.test_nominal_association()

print("chi_square_test type:", type(chi_square_test))
print(chi_square_test)

In [None]:
# Calculate the odds ratio of the contingency table, rounded to one decimal
odds_ratio = contingency_table.oddsratio.round(1)
print("Odds ratio:", odds_ratio)