In [10]:
import pandas as pd
import sqlite3

# Exercise 04 : A/B-testing


## create a connection to the database using the library sqlite3

In [11]:
con = sqlite3.connect('data/checking-logs.sqlite')
con

<sqlite3.Connection at 0x79845809b2e0>

## using only one query for each of the groups, create two dataframes: test_results and control_results with the columns time and avg_diff and only two rows

In [12]:
test_query = """
WITH deltas AS (
    SELECT 
        uid,
        (julianday(first_view_ts) - julianday(first_commit_ts)) AS delta,
        first_view_ts
    FROM 
        test
    WHERE 
        labname != 'project1'
),
first_views AS (
    SELECT 
        uid,
        MIN(first_view_ts) AS first_view_ts
    FROM 
        deltas
    GROUP BY 
        uid
)
SELECT 
    'before' AS time,
    AVG(d.delta) AS avg_diff
FROM 
    deltas d
JOIN 
    first_views fv ON d.uid = fv.uid
WHERE 
    d.first_view_ts > fv.first_view_ts
GROUP BY 
    d.uid

UNION ALL

SELECT 
    'after' AS time,
    AVG(d.delta) AS avg_diff
FROM 
    deltas d
JOIN 
    first_views fv ON d.uid = fv.uid
WHERE 
    d.first_view_ts <= fv.first_view_ts
GROUP BY 
    d.uid
"""

test_results = pd.read_sql_query(test_query, con)
test_results

Unnamed: 0,time,avg_diff
0,before,6.134334
1,before,9.093637
2,before,10.729418
3,before,4.707477
4,before,-2.1005
5,before,6.540868
6,before,1.043807
7,before,7.277336
8,before,8.518705
9,before,7.028683


In [24]:
control_query = """
WITH deltas AS (
    SELECT 
        uid,
        first_commit_ts,
        first_view_ts,
        (julianday(first_view_ts) - julianday(first_commit_ts)) AS delta
    FROM 
        control
    WHERE 
        labname != 'project1'
),
first_views AS (
    SELECT 
        MIN(first_view_ts) AS min_first_view_ts
    FROM 
        deltas
)
SELECT 
    'before' AS time,
    AVG(d.delta) AS avg_diff
FROM 
    deltas d
JOIN 
    first_views fv ON 1=1
WHERE 
    d.first_commit_ts < fv.min_first_view_ts

UNION ALL

SELECT 
    'after' AS time,
    AVG(d.delta) AS avg_diff
FROM 
    deltas d
JOIN 
    first_views fv ON 1=1
WHERE 
    d.first_commit_ts >= fv.min_first_view_ts;
"""


control_results = pd.read_sql_query(control_query, con)
control_results

Unnamed: 0,time,avg_diff
0,before,15.438828
1,after,-6.784841


## close the connection

In [None]:
con.close()

## have the answer: did the hypothesis turn out to be true and the page does affect the students’ behavior?

In [35]:
test_before_avg = test_results.loc[test_results['time'] == 'before', 'avg_diff'].values[0]
test_after_avg = test_results.loc[test_results['time'] == 'after', 'avg_diff'].values[0]
control_before_avg = control_results.loc[control_results['time'] == 'before', 'avg_diff'].values[0]
control_after_avg = control_results.loc[control_results['time'] == 'after', 'avg_diff'].values[0]

hypothesis_true = (test_after_avg < test_before_avg) and (control_after_avg >= control_before_avg)

if hypothesis_true:
    print("\nГипотеза подтверждается: Новостная лента повлияла на поведение студентов.")
else:
    print("\nГипотеза не подтверждается: Новостная лента не повлияла на поведение студентов.")


Гипотеза не подтверждается: Новостная лента не повлияла на поведение студентов.
