In [2]:
# Publish the first search keyword as the KEYWORD environment variable;
# a later cell reads it back with os.environ['KEYWORD'].
%env KEYWORD=Obesity

env: KEYWORD=Obesity


In [3]:
# Publish the second search keyword as the KEYWORD2 environment variable;
# a later cell reads it back with os.environ['KEYWORD2'].
%env KEYWORD2=Cancer

env: KEYWORD2=Cancer


In [4]:
from peewee import *
from playhouse.db_url import connect

import os

In [5]:
# Pull both search keywords out of the process environment.
# A missing variable raises KeyError, exactly as a direct lookup would.
KEYWORD, KEYWORD2 = (os.environ[name] for name in ('KEYWORD', 'KEYWORD2'))

In [6]:
# Counts rows of `dataset` per calendar year for a single keyword.
# - `date_trunc('year', date)` buckets each row into its year (aliased `y`).
# - `%s` is a bound parameter supplied by the caller (never string-formatted in).
# - Only dates in the half-open range [2018-01-01, 2022-01-01) are considered.
# Years with no matching rows produce no output row at all.
QUERY_STR = """SELECT
    date_trunc('year', date) y,
    COUNT (id)
FROM
    dataset
WHERE
    keyword = %s AND date >= '2018-01-01' AND date < '2022-01-01'
GROUP BY
    y
ORDER BY
    y;"""


# Database URL comes from the DATABASE environment variable. Fail fast with
# a readable message when it is missing or empty, instead of handing None to
# connect() and getting an obscure error from deep inside URL parsing.
DB_URL = os.environ.get('DATABASE')
if not DB_URL:
    raise RuntimeError(
        "The DATABASE environment variable must be set to a database URL "
        "before running this notebook."
    )

db = connect(DB_URL)

In [7]:
class BaseModel(Model):
    """Base model whose ``Meta`` points at the notebook's ``db`` connection."""
    class Meta:
        # `db` is the connection object created from the DATABASE URL.
        database = db

class Dataset(BaseModel):
    """Model declaring ``date``, ``title``, ``abstract`` and ``keyword`` fields.

    It is used in this notebook only as the entry point for ``Dataset.raw``.

    NOTE(review): the raw SQL reads from a table named ``dataset`` and counts
    an ``id`` column; presumably both come from peewee's defaults for this
    class -- confirm against the actual schema.
    """
    date = DateField()      # bucketed by year and range-filtered in the raw query
    title = TextField()     # not referenced by the queries in this notebook
    abstract = TextField()  # not referenced by the queries in this notebook
    keyword = TextField()   # matched against the bound parameter in the raw query

In [8]:
# Run the per-year count query for the first keyword, then index each
# returned row's count by its year bucket (`y`).
query = Dataset.raw(QUERY_STR, KEYWORD)

COUNT_DICT = {row.y: row.count for row in query}

In [9]:
# Same per-year count query, this time bound to the second keyword;
# the result maps each year bucket (`y`) to its row count.
query2 = Dataset.raw(QUERY_STR, KEYWORD2)

COUNT_DICT_2 = {row.y: row.count for row in query2}

In [10]:
# Reduce each year -> count mapping to a plain list of counts,
# giving one sample per keyword for the statistical tests below.
data1, data2 = (list(counts.values()) for counts in (COUNT_DICT, COUNT_DICT_2))

In [16]:
# Display the year-bucket -> count mapping built for the second keyword.
COUNT_DICT_2

{datetime.datetime(2018, 1, 1, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)): 17488,
 datetime.datetime(2019, 1, 1, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)): 110114,
 datetime.datetime(2020, 1, 1, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)): 130725,
 datetime.datetime(2021, 1, 1, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)): 139984}

In [11]:
# Parametric comparison of the two keyword samples.
# Student's t-test for two independent samples, judged at the 0.05 level.
from scipy.stats import ttest_ind

stat, p = ttest_ind(data1, data2)
print('stat={}, p={}'.format(stat, p))
print('Probably the same distribution' if p > 0.05 else 'Probably different distributions')

stat=-2.9219796632174075, p=0.026562774980460422
Probably different distributions


In [12]:
# Apply Non-Parametric Test
# Mann-Whitney U Test
from scipy.stats import mannwhitneyu

# Ask for the two-sided alternative explicitly. The default for `alternative`
# has not been the same across SciPy releases, so spelling it out keeps the
# reported p-value comparable with the other (two-sided) tests in this
# notebook regardless of the installed version.
stat, p = mannwhitneyu(data1, data2, alternative='two-sided')
print('stat={}, p={}'.format(stat, p))
# Decision at the 0.05 significance level.
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')


stat=3.0, p=0.2
Probably the same distribution


In [13]:
# Non-parametric comparison of the two keyword samples.
# Kruskal-Wallis H test, judged at the 0.05 level.
from scipy.stats import kruskal

stat, p = kruskal(data1, data2)
print('stat={}, p={}'.format(stat, p))
print('Probably the same distribution' if p > 0.05 else 'Probably different distributions')

stat=2.083333333333332, p=0.14891467317876178
Probably the same distribution


In [14]:
# Effect Size Test
# Cohen's d
from statistics import mean, stdev, variance
from math import sqrt

# Pooled standard deviation, weighting each sample variance by its degrees of
# freedom. For equal-sized samples this reduces to sqrt((s1^2 + s2^2) / 2);
# it stays correct when the keywords have a different number of yearly
# buckets (the query omits years with no matching rows).
n1, n2 = len(data1), len(data2)
pooled_sd = sqrt(
    ((n1 - 1) * variance(data1) + (n2 - 1) * variance(data2)) / (n1 + n2 - 2)
)

cohens_d = (mean(data1) - mean(data2)) / pooled_sd

abs_cohens_d = abs(cohens_d)

# Rule-of-thumb magnitudes (Cohen 1988, extended by Sawilowsky 2009).
# Each label applies once |d| REACHES its cutoff, so 0.2 <= |d| < 0.5 is
# "small", 0.5 <= |d| < 0.8 is "medium", and only |d| >= 2.0 is "huge".
# Ordered from the largest cutoff down so the first match wins.
EFFECT_SIZE_THRESHOLDS = (
    (2.0, "huge"),
    (1.2, "very large"),
    (0.8, "large"),
    (0.5, "medium"),
    (0.2, "small"),
    (0.01, "very small"),
)
effect = next(
    (label for cutoff, label in EFFECT_SIZE_THRESHOLDS if abs_cohens_d >= cutoff),
    "negligible",  # below the smallest published cutoff
)

print("Cohens'd value: {}, which shows a {} difference.".format(cohens_d, effect))

Cohens'd value: -2.0661516343502133, which shows a huge difference.
