In [1]:
import pandas as pd
import wmfdata as wmf
from wmfdata.utils import sig_figs
from numbers import Number

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
%matplotlib inline

In [127]:
def fmt_num(x):
    if isinstance(x, Number):
        x = sig_figs(x, 3)
        
        if x < 5:
            return pct(x)
        else:
            return "{:,.0f}".format(x)
    else:
        return x

def pct(n):
    return "{:.1%}".format(n)

def find_latest(ser):
    nn_ser = ser[~ser.isnull()]
    return nn_ser.index[-1].date()

def calc_rpt(ser):
    nn_ser = ser[~ser.isnull()]
    cur_mo = nn_ser.index[-1].date()
    cur = nn_ser.iloc[-1]
    mo_prev = nn_ser.iloc[-2]
    
    try:
        yr_prev = nn_ser.iloc[-13]
        yoy_change = (cur / yr_prev) - 1
    except IndexError:
        yoy_change = None
    
    mom_change = (cur / mo_prev) - 1
    
    res = [cur_mo, cur, mom_change, yoy_change]
    return pd.Series(
        [fmt_num(n) for n in res],
        index=["latest_month", "value", "mom_change", "yoy_change"]
    )

In [61]:
mob_wikis = (
    "hiwiki",
    "bnwiki",
    "idwiki",
    "arwiki",
    "mrwiki",
    "fawiki",
    "swwiki",
    "tlwiki",
    "zhwikiquote",
    "thwiki",
    "arzwiki",
    "mlwiki",
    "tawiki",
    "knwiki",
    "ptwiktionary",
    "azwiki",
    "guwiki",
    "kywiki",
    "sqwiki",
    "mswiki"
)

gn_countries = (
    "AD", "AL", "AT", "AX", "BA", "BE", "BG", "CH", "CY", "CZ",
    "DE", "DK", "EE", "ES", "FI", "FO", "FR", "FX", "GB", "GG",
    "GI", "GL", "GR", "HR", "HU", "IE", "IL", "IM", "IS", "IT",
    "JE", "LI", "LU", "LV", "MC", "MD", "ME", "MK", "MT", "NL",
    "NO", "PL", "PT", "RO", "RS", "RU", "SE", "SI", "SJ", "SK",
    "SM", "TR", "VA", "AU", "CA", "HK", "MO", "NZ", "JP", "SG",
    "KR", "TW", "US", 
    # Have to include unlocated edits—otherwise they'll be counted as Global South
    "--"
)

# Global South countries

In [120]:
# Earliest data still available in geoeditors_daily
wmf.hive.run("""
select
    min(date)
from wmf.geoeditors_daily
where
    month >= "2001-01"
""")

Unnamed: 0,_c0
0,2018-07-01


## Edits and editors

In [122]:
gs_edits_r = wmf.hive.run("""
with gs_editors as (
    select 
        month,
        user_fingerprint_or_id as user_id,
        sum(edit_count) as edit_count,
        sum(namespace_zero_edit_count) as namespace_zero_edit_count,
        -- Treat the user as a bot if it matches on any wiki
        max(is_bot_by_name or array_contains(user_groups, "bot")) as bot
    from wmf.geoeditors_daily gd
    left join wmf.mediawiki_user_history muh
    on
        gd.wiki_db = muh.wiki_db and
        gd.user_fingerprint_or_id = muh.user_id and
        muh.snapshot = "{snapshot}" and
        muh.end_timestamp is null
    where
        month >= "{start}" and
        country_code not in {gn_countries} and
        not user_is_anonymous
    group by month, user_fingerprint_or_id
)
select
    month,
    sum(edit_count) as total_edits,
    sum(if(not bot, edit_count, 0)) as nonbot_edits,
    sum(cast(namespace_zero_edit_count >= 5 as int)) as active_editors
from gs_editors
group by month
""".format(
    gn_countries=repr(gn_countries),
    snapshot="2018-09",
    start="2018-04"
))

gs_edits_r.tail()

Unnamed: 0,month,total_edits,nonbot_edits,active_editors
0,2018-07,2115008,2115005,20703
1,2018-08,2261110,2261109,20618
2,2018-09,2096700,2096697,20258


In [130]:
gs_edits = gs_edits_r.copy()
gs_edits["month"] = pd.to_datetime(gs_edits["month"])
gs_edits = gs_edits.set_index("month")

gs_edits.apply(calc_rpt).transpose()

Unnamed: 0,latest_month,value,mom_change,yoy_change
total_edits,2018-09-01,2100000,-7.3%,
nonbot_edits,2018-09-01,2100000,-7.3%,
active_editors,2018-09-01,20300,-1.8%,


## Edits and editors, old method

In [131]:
old_gn_countries = (
    "AD", "AL", "AT", "AX", "BA", "BE", "BG", "CH", "CY", "CZ",
    "DE", "DK", "EE", "ES", "FI", "FO", "FR", "FX", "GB", "GG",
    "GI", "GL", "GR", "HR", "HU", "IE", "IL", "IM", "IS", "IT",
    "JE", "LI", "LU", "LV", "MC", "MD", "ME", "MK", "MT", "NL",
    "NO", "PL", "PT", "RO", "RS", "RU", "SE", "SI", "SJ", "SK",
    "SM", "TR", "VA", "AU", "CA", "HK", "MO", "NZ", "JP", "SG",
    "KR", "TW", "US", 
)

In [132]:
old_gs_edits = wmf.hive.run("""
with gs_editors as (
    select 
        wiki_db,
        user_fingerprint_or_id as user_id
    from wmf.geoeditors_daily
    where
        month >= "{start}" and
        country_code not in {gn_countries} and
        user_is_anonymous = 0
    group by wiki_db, user_fingerprint_or_id
)
select
    date_format(event_timestamp, "yyyy-MM-01") as month,
    count(*) as total_edits,
    sum(cast(
        !(event_user_is_bot_by_name or array_contains(event_user_groups, "bot")
    ) as int)) as nonbot_edits
from gs_editors ge
left join wmf.mediawiki_history mh
on
    ge.wiki_db = mh.wiki_db and
    ge.user_id = event_user_id and
    snapshot = "{snapshot}"
where
    event_entity = "revision" and
    event_type = "create" and
    event_timestamp >= "{start}"
group by date_format(event_timestamp, "yyyy-MM-01")
""".format(
    gn_countries=repr(old_gn_countries),
    snapshot="2018-09",
    start="2018-04"
))

In [133]:
old_gs_edits

Unnamed: 0,month,total_edits,nonbot_edits
0,2018-04-01,7576888,7576753
1,2018-05-01,8592750,8592464
2,2018-06-01,8093110,8092757
3,2018-07-01,8366023,8365815
4,2018-08-01,9596925,9596533
5,2018-09-01,9047193,9047065


In [None]:
old_gs_editors = wmf.hive.run("""
with gs_editors as (
    select 
        wiki_db,
        user_fingerprint_or_id as user_id
    from wmf.geoeditors_daily
    where
        month >= "{start}" and
        country_code not in {gn_countries} and
        user_is_anonymous = 0
    group by wiki_db, user_fingerprint_or_id
)
select
    month,
    sum(cast(content_edits >= 5 as int)) as active_editors
from (
    select
        date_format(event_timestamp, "yyyy-MM-01") as month,
        count(*) as content_edits
    from gs_editors ge
    left join wmf.mediawiki_history mh
    on
        ge.wiki_db = mh.wiki_db and
        ge.user_id = event_user_id and
        snapshot = "{snapshot}"
    where
        event_entity = "revision" and
        event_type = "create" and
        event_timestamp >= "{start}" and
        page_namespace_is_content = 1 and
        !(event_user_is_bot_by_name or array_contains(event_user_groups, "bot"))
    group by event_user_text, date_format(event_timestamp, "yyyy-MM-01")
) combined_eds
group by month
""".format(
    gn_countries=repr(old_gn_countries),
    snapshot="2018-09",
    start="2018-04"
))

In [137]:
old_gs_editors

Unnamed: 0,month,active_editors
0,2018-04-01,13980
1,2018-05-01,14713
2,2018-06-01,15724
3,2018-07-01,22466
4,2018-08-01,22254
5,2018-09-01,22964


## New editor retention

In [142]:
gs_ner = wmf.hive.run("""
with gs_new_edits as (
    select 
        gd.wiki_db,
        event_user_text as user_name,
        event_timestamp as edit_dt,
        event_user_creation_timestamp as registration_dt
    from wmf.geoeditors_daily gd
    left join wmf.mediawiki_history mh
    on
        gd.wiki_db = mh.wiki_db and
        gd.user_fingerprint_or_id = mh.user_id and
        mh.snapshot = "{snapshot}"
    where
        month >= "{start}" and
        country_code not in {gn_countries} and
        not event_user_is_anonymous and
        not event_user_is_created_by_system and
        not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot")) and
        event_user_creation_timestamp >= "{start}" and
        event_user_creation_timestamp < "{end}"
)
select 
    1st_month.cohort as cohort,
    sum(cast(1st_month.edits >= 1 as int)) as new_editors,
    sum(cast(2nd_month.edits >= 1 as int)) / sum(cast(1st_month.edits >= 1 as int)) as new_editor_retention
from (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from gs_new_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60))
    group by user_name, registration_dt, wiki_db
    ) 1st_month
left join (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from gs_new_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") >=
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) and
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (60*24*60*60))
        group by user_name, registration_dt, wiki_db
    ) 2nd_month
on
    (1st_month.user_name = 2nd_month.user_name and
    1st_month.wiki = 2nd_month.wiki and
    1st_month.cohort = 2nd_month.cohort)
group by 1st_month.cohort
""".format(
    gn_countries=repr(gn_countries),
    snapshot="2018-09",
    start="2018-04",
    end="2018-08"
))

gs_ner

Unnamed: 0,cohort,new_editors,new_editor_retention
0,2018-04,2125,0.009412
1,2018-05,2702,0.010733
2,2018-06,4639,0.010994
3,2018-07,35936,0.001419


## New editor retention, old method

In [8]:
old_gs_ner = wmf.hive.run("""
with gs_edits as (
    select 
        gd.wiki_db,
        event_user_text as user_name,
        event_timestamp as edit_dt,
        event_user_creation_timestamp as registration_dt
    from wmf.geoeditors_daily gd
    left join wmf.mediawiki_history mh
    on
        gd.wiki_db = mh.wiki_db and
        gd.user_fingerprint_or_id = event_user_id and
        snapshot = "{snapshot}"
    where
        month >= "{start}" and
        country_code not in {gn_countries} and
        gd.user_is_anonymous = 0 and
        event_entity = "revision" and
        event_type = "create" and
        event_timestamp >= "{start}" and
        event_user_is_created_by_system = 0 and
        event_user_creation_timestamp >= "{start}" and
        event_user_creation_timestamp < "{end}" and
        not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot"))
)
select 
    1st_month.cohort,
    sum(cast(1st_month.edits >= 1 as int)) as new_editors,
    sum(cast(2nd_month.edits >= 1 as int)) / sum(cast(1st_month.edits >= 1 as int)) as new_editor_retention
from (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from gs_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60))
    group by user_name, registration_dt, wiki_db
    ) 1st_month
left join (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from gs_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") >=
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) and
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (60*24*60*60))
        group by user_name, registration_dt, wiki_db
    ) 2nd_month
on
    (1st_month.user_name = 2nd_month.user_name and
    1st_month.wiki = 2nd_month.wiki and
    1st_month.cohort = 2nd_month.cohort)
group by 1st_month.cohort
""".format(
    gn_countries=repr(old_gn_countries),
    snapshot="2018-09",
    start="2018-04",
    end="2018-10"
))

In [9]:
old_gs_ner

Unnamed: 0,1st_month.cohort,new_editors,new_editor_retention
0,2018-04,1580,0.414557
1,2018-05,1977,0.516945
2,2018-06,4001,0.409648
3,2018-07,35658,0.04832
4,2018-08,37474,0.032156
5,2018-09,38596,


# Mobile-heavy wikis

In [11]:
mh_edits = wmf.hive.run("""
select
    date_format(event_timestamp, "yyyy-MM-01") as month,
    count(*) as total_edits,
    sum(cast(not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot")) as int)) as nonbot_edits
from wmf.mediawiki_history
where
    snapshot = "{snapshot}" and
    event_entity = "revision" and
    event_type = "create" and
    wiki_db in {wikis} and
    event_timestamp >= "{start}" and
    event_timestamp < "{end}" and
    not event_user_is_anonymous
group by date_format(event_timestamp, "yyyy-MM-01")
""".format(
    snapshot="2018-09",
    start="2017-06",
    end="2018-10",
    wikis=repr(mob_wikis)
))

In [12]:
mh_edits["month"] = pd.to_datetime(mh_edits["month"])
mh_edits = mh_edits.set_index("month")
mh_edits = mh_edits.sort_index()

mh_edits.apply

In [13]:
mh_edits = mh_edits.sort_index()

In [14]:
calc_rpt(mh_edits["total_edits"])

latest_month    2018-09-01
value              847,000
mom_change          -30.1%
yoy_change          -31.9%
dtype: object

In [15]:
calc_rpt(mh_edits["nonbot_edits"])

latest_month    2018-09-01
value              443,000
mom_change           -9.2%
yoy_change            7.2%
dtype: object

In [16]:
mh_editors = wmf.hive.run("""
select
    month,
    sum(cast(content_edits >= 5 as int)) as active_editors
from (
    select
        date_format(event_timestamp, "yyyy-MM-01") as month,
        count(*) as content_edits
    from wmf.mediawiki_history
    where
        snapshot = "{snapshot}" and
        event_entity = "revision" and
        event_type = "create" and
        wiki_db in {wikis} and
        event_timestamp >= "{start}" and
        event_timestamp < "{end}" and
        not event_user_is_anonymous and
        not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot")) and
        page_namespace_is_content
    group by date_format(event_timestamp, "yyyy-MM-01"), event_user_text
) editors
group by month
""".format(
    snapshot="2018-09",
    start="2017-06",
    end="2018-09",
    wikis=repr(mob_wikis)
))

In [17]:
mh_editors = mh_editors.sort_values("month")
mh_editors["month"] = pd.to_datetime(mh_editors["month"])
mh_editors = mh_editors.set_index("month")

mh_editors.tail()

Unnamed: 0_level_0,active_editors
month,Unnamed: 1_level_1
2018-04-01,3613
2018-05-01,3685
2018-06-01,3425
2018-07-01,3684
2018-08-01,3729


In [18]:
calc_rpt(mh_editors["active_editors"])

latest_month    2018-08-01
value                3,730
mom_change            1.2%
yoy_change            9.0%
dtype: object

In [19]:
mh_ner = wmf.hive.run("""
with mh_edits as (
    select 
        wiki_db,
        event_user_text as user_name,
        event_timestamp as edit_dt,
        event_user_creation_timestamp as registration_dt
    from wmf.mediawiki_history        
    where
        snapshot = "{snapshot}" and
        event_entity = "revision" and
        event_type = "create" and
        event_timestamp >= "{start}" and
        wiki_db in {wikis} and
        not event_user_is_anonymous and
        not event_user_is_created_by_system and
        event_user_creation_timestamp >= "{start}" and
        event_user_creation_timestamp < "{end}" and
        not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot"))
)
select 
    1st_month.cohort,
    sum(cast(1st_month.edits >= 1 as int)) as new_editors,
    sum(cast(2nd_month.edits >= 1 as int)) / sum(cast(1st_month.edits >= 1 as int)) as new_editor_retention
from (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from mh_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60))
    group by user_name, registration_dt, wiki_db
    ) 1st_month
left join (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from mh_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") >=
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) and
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (60*24*60*60))
        group by user_name, registration_dt, wiki_db
    ) 2nd_month
on
    (1st_month.user_name = 2nd_month.user_name and
    1st_month.wiki = 2nd_month.wiki and
    1st_month.cohort = 2nd_month.cohort)
group by 1st_month.cohort
""".format(
    snapshot="2018-09",
    start="2017-04",
    end="2018-08",
    wikis=repr(mob_wikis)
))

In [20]:
mh_ner

Unnamed: 0,1st_month.cohort,new_editors,new_editor_retention
0,2017-04,8227,0.042907
1,2017-05,7531,0.061346
2,2017-06,7390,0.052233
3,2017-07,7837,0.049381
4,2017-08,7617,0.042405
5,2017-09,7363,0.049301
6,2017-10,8550,0.044444
7,2017-11,8032,0.042953
8,2017-12,8298,0.043022
9,2018-01,8693,0.044173


In [59]:
mh_ner_2 = mh_ner.copy()
mh_ner_2["1st_month.cohort"] = pd.to_datetime(mh_ner_2["1st_month.cohort"])
mh_ner_2 = mh_ner_2.set_index("1st_month.cohort")

In [60]:
calc_rpt(mh_ner_2["new_editor_retention"])

latest_month    2018-07-01
value                 5.1%
mom_change            7.9%
yoy_change            2.7%
dtype: object