# Three Year Comparison

In [2]:
# import wmfdata as wmf
import wmfdata as wmf
from wmfdata import charting, mariadb, hive, spark
from wmfdata.utils import pct_str, pd_display_all, print_err
import pandas as pd
import requests
import re

import time
import datetime as dt 
from datetime import datetime, timedelta, date
import dateutil
#from dateutil.relativedelta import relativedelta

#%load_ext sql_magic
%matplotlib inline

You are using wmfdata v1.0.3, but v1.0.4 is available.

To update, run `pip install --upgrade git+https://github.com/neilpquinn/wmfdata.git@release`.

To see the changes, refer to https://github.com/neilpquinn/wmfdata/blob/release/CHANGELOG.md


In [9]:
# Run the companion data-handling notebook, then reload the persisted
# `query_vars` dict whose entries are substituted into the SQL and API calls below.
# NOTE(review): `headers` and `add_country_column`, used further down, are not
# defined in this notebook -- presumably they come from this %run; confirm.
%run 2b_data_handling.ipynb
%store -r query_vars

Stored 'query_vars' (dict)
Stored 'quality_vars' (dict)


#### Unique Devices - country <a class="anchor" id="devices"></a>
[Back to Table of Contents](#toc)

In [None]:
# Per-country, per-year sum of `uniques_estimate` from the monthly
# unique-devices table, for 2016 onward, limited to the countries substituted
# from query_vars["glow_countries"].
# The aggregate is not aliased, so the result column keeps its raw
# expression name and is referenced that way when the frame is pivoted.
co_uds_r = spark.run("""
select
    country,
    year,
    sum(uniques_estimate) 
from wmf.unique_devices_per_domain_monthly
where
    year >= 2016
    AND country IN ({glow_countries})
group by country, year
""".format(**query_vars)) 

# Unused alternative filter on a year+month range, kept for reference:
#  CONCAT(year,LPAD(month,2,'0')) >= 201809
#  AND CONCAT(year,LPAD(month,2,'0')) < 201910

In [None]:
# Show floats with thousands separators and two decimals (e.g. 1,234.50).
pd.set_option('display.float_format', '{:,.2f}'.format)

In [None]:
# Reshape to one row per country with one column per year.
co_uds = co_uds_r.pivot(
    index='country',
    columns='year',
    values='sum(uniques_estimate)',
)

In [None]:
# Replace the pivoted year labels with string names (the last one is labelled
# as a partial year).
# NOTE(review): assumes the pivot produced exactly these five year columns in
# ascending order -- confirm before re-running with a different date range.
co_uds.columns = [
    '2016',
    '2017',
    '2018',
    '2019',
    '2020_to_April',
]

In [None]:
# Compound annual growth rate between the first and last full calendar years:
#   CAGR = (end / start) ** (1 / periods) - 1
# `periods` is the number of year-over-year steps, i.e. end year minus start
# year (2016 -> 2019 is 3 steps), not the number of yearly data points (4).
cagr_start_year, cagr_end_year = '2016', '2019'
cagr_periods = int(cagr_end_year) - int(cagr_start_year)
co_uds['CAGR'] = (co_uds[cagr_end_year] / co_uds[cagr_start_year]) ** (1 / cagr_periods) - 1

#https://stackoverflow.com/questions/23981601/format-certain-floating-dataframe-columns-into-percentage-in-pandas
# Store the rate as a display string such as "12.34%".
co_uds['CAGR'] = pd.Series(["{0:.2f}%".format(val * 100) for val in co_uds['CAGR']], index = co_uds.index)

# 2019 yearly total spread evenly over its 12 months.
co_uds['2019_monthly_avg'] = (co_uds['2019']/12)

In [None]:
# Presentation order: yearly totals, growth rate, 2019 monthly average, then
# the 2020 column.
co_uds_cols = [
    '2016',
    '2017',
    '2018',
    '2019',
    'CAGR',
    '2019_monthly_avg',
    '2020_to_April',
]
co_uds = co_uds[co_uds_cols]

In [None]:
# Persist the per-country table (the country index is written as the first column).
co_uds.to_csv(
    "../../data/processed/query_results/regional_counts/co_uds_2020_april.csv",
    sep=',',
    encoding='utf-8',
)

#### Unique Devices - domain <a class="anchor" id="devices_domain"></a>
[Back to Table of Contents](#toc)

In [None]:
# Per-domain, per-country, per-year sum of `uniques_estimate` for 2016 onward,
# limited to the countries and domains substituted from
# query_vars["glow_countries"] and query_vars["glow_domains"].
do_uds_r = hive.run("""
select
    domain, 
    country,
    year,
    sum(uniques_estimate) 
from wmf.unique_devices_per_domain_monthly
where
    year >= 2016
    AND country IN ({glow_countries})
    AND domain IN ({glow_domains})
group by country, domain, year
""".format(**query_vars)) 

In [None]:
# Wide view for inspection: rows keyed by (country, domain), one column per year.
# Multi-index pivot approach from
# https://stackoverflow.com/questions/35414625/pandas-how-to-run-a-pivot-with-a-multi-index
do_uds_r.set_index(['country', 'domain', 'year']).unstack('year')

#### Unique Devices - domain India <a class="anchor" id="devices_domain_india"></a>
[Back to Table of Contents](#toc)

In [None]:
# Per-domain, per-country, per-year sum of `uniques_estimate` for 2016 onward,
# limited to the domains in query_vars["india_domains"].
# NOTE(review): unlike the queries above, the country filter here is applied to
# `country_code` while the selected/grouped column is `country` -- confirm that
# query_vars["india_countries"] holds codes rather than names.
india_uds_domain_r = hive.run("""
select
    domain, 
    country,
    year,
    sum(uniques_estimate) 
from wmf.unique_devices_per_domain_monthly
where
    year >= 2016
    AND country_code IN ({india_countries})
    AND domain IN ({india_domains})
group by country, domain, year
""".format(**query_vars)) 

In [None]:
# Display the raw query result for inspection before it is reshaped.
india_uds_domain_r

In [None]:
# One row per domain, one column per year. '_c3' is the result column that
# holds the unaliased sum(uniques_estimate) (the fourth selected expression).
# NOTE(review): pivot raises if a (domain, year) pair occurs more than once,
# e.g. if the query returns several countries -- confirm it returns only one.
india_uds_domain = india_uds_domain_r.pivot(
    index='domain', columns='year', values='_c3'
)

In [None]:
# Replace the pivoted year labels with string names (the last one is labelled
# as a partial year).
india_uds_domain.columns = ['2016','2017','2018', '2019_to_nov']

# Compound annual growth rate over the full calendar years 2016 -> 2018:
#   CAGR = (end / start) ** (1 / periods) - 1
# `periods` is the number of year-over-year steps (2018 - 2016 = 2), not the
# number of yearly data points (3).
india_cagr_periods = 2018 - 2016
india_uds_domain['CAGR'] = (india_uds_domain['2018'] / india_uds_domain['2016']) ** (1 / india_cagr_periods) - 1

#https://stackoverflow.com/questions/23981601/format-certain-floating-dataframe-columns-into-percentage-in-pandas
# Store the rate as a display string such as "12.34%".
india_uds_domain['CAGR'] = pd.Series(["{0:.2f}%".format(val * 100) for val in india_uds_domain['CAGR']], index = india_uds_domain.index)

# 2018 yearly total spread evenly over its 12 months.
india_uds_domain['2018_monthly_avg'] = (india_uds_domain['2018']/12)

In [None]:
# Rank domains by 2018 volume, breaking ties by growth rate.
# 'CAGR' holds display strings such as "12.34%"; sorting those directly orders
# them lexicographically ("9.00%" would rank above "12.00%"), so the tie-break
# is done on the parsed numeric value via a temporary helper column.
(
    india_uds_domain
    .assign(_cagr_value=india_uds_domain['CAGR'].str.rstrip('%').astype(float))
    .sort_values(['2018', '_cagr_value'], ascending=[False, False])
    .drop(columns='_cagr_value')
)

## Articles<a class="anchor" id="editors"></a>
[Back to Table of Contents](#toc)

## Historical Article Counts

### Get data using local

In [None]:
# see 5b_collect_wiki_hist_article_counts.ipynb

### Clean historical counts data

In [6]:
# Load the historical article counts; thousands=',' lets values written as
# "1,234" parse as numbers.
wiki_counts = pd.read_csv(
    '../../data/processed/query_results/regional_counts/indonesia_wiki_counts.csv',
    thousands=',',
    encoding='utf-8',
)

In [7]:
#combine files
#wiki_counts = pd.concat([wiki_art_count_results_1920, wiki_art_count_results_1619], sort=True).drop_duplicates(subset=['count', 'date', 'lang'], keep='first').reset_index(drop =True)
# Drop incomplete rows; .copy() gives an independent frame so the column
# assignments below never write into a view of the raw data.
wiki_counts = wiki_counts.dropna().copy()
# Parse dates and derive a monthly period for grouping/merging.
wiki_counts['date'] = pd.to_datetime(wiki_counts['date'])
wiki_counts['month_year'] = wiki_counts['date'].dt.to_period('M')
# The count column is float64 whenever the raw file contained missing values;
# a str round-trip then produces "123.0", which int() rejects with ValueError.
# Convert numerically instead.
wiki_counts['count'] = pd.to_numeric(wiki_counts['count']).astype(int)
#wiki_counts["year"] = wiki_counts['date'].dt.year
#wiki_counts['month'] = wiki_counts['date'].dt.month

In [19]:
#wiki_counts.to_csv("../../data/processed/query_results/regional_counts/wiki_counts_India_2016_2020.csv", sep=',', encoding = 'utf-8', index=False)
#wiki_counts.to_csv("../../data/processed/query_results/regional_counts/arabic/wiki_counts_mena_2016_2020.csv", sep=',', encoding = 'utf-8', index=False)

In [19]:
# NOTE(review): this reloads the same raw CSV that was already read and cleaned
# earlier in this section, so the dropna / datetime / month_year / int
# conversions applied to `wiki_counts` are discarded from here on -- confirm
# whether this cell is still needed.
wiki_counts = pd.read_csv('../../data/processed/query_results/regional_counts/indonesia_wiki_counts.csv', thousands=',', encoding = 'utf-8')

### Current article Count <a class="anchor" id="new_articles"></a>
[Back to Table of Contents](#toc)

In [None]:
# NOTE(review): `ac` is not defined in the visible part of this notebook, and
# the "{}" placeholder in the file name is never filled with .format(), so the
# file is written literally as "ac2020_{}.csv" -- confirm the intended suffix.
ac.to_csv("../../data/processed/query_results/regional_counts/ac2020_{}.csv", sep=',', encoding = 'utf-8', index=False)

## API counts 
##### Note: `edited-pages` is the root path for the page-related metrics on the API

In [None]:
#https://wikimedia.org/api/rest_v1/#/Edited%20pages%20data
#https://wikitech.wikimedia.org/wiki/Analytics/AQS/Wikistats_2#Total_article_count
#https://phabricator.wikimedia.org/T240253
#https://phabricator.wikimedia.org/T220524

In [11]:
# Look up the domain name (e.g. id.wikipedia.org) of every Wikipedia whose
# database code is listed in query_vars["indonesia_wiki_dbs"].
wp_domains = hive.run("""
select domain_name
from canonical_data.wikis
where database_group = "wikipedia"
      AND database_code IN ({indonesia_wiki_dbs})
""".format(**query_vars))

# One 1-tuple per project, plus the project count used in progress messages.
wp_domains_tuple_list = list(map(tuple, wp_domains.to_numpy()))
num_domains = len(wp_domains.index)

## Edited-pages/new

In [12]:
#pages created stats 
#The following parameters are {wiki}/{editor type}/{page type}/{granularity}/{start}/{end}

#adapted from:
#https://github.com/wikimedia-research/Editing-movement-metrics/blob/74dd6575703125a4386bfd8fea6546053458e2a6/02-calculation.ipynb

### Content metrics via API

In [13]:
NEW_PAGES_API = (
    # Replaces "https://wikimedia.org/api/rest_v1/metrics/" due to https://phabricator.wikimedia.org/P8605
    "http://aqs1004.eqiad.wmnet:7232/analytics.wikimedia.org/v1/"
    "edited-pages/new/{project}/user/content/monthly/{start}/{end}"
)

def get_new_pages(
    project="all-projects",
    start=query_vars["api_metrics_month_start"],
    end=query_vars["api_metrics_month_end"]
):
    """Fetch the monthly edited-pages/new series for one project.

    The request is fixed to editor type ``user``, page type ``content`` and
    ``monthly`` granularity (see ``NEW_PAGES_API``).

    Parameters
    ----------
    project
        Value substituted for ``{project}`` in the URL, e.g. a wiki domain.
    start, end
        Values substituted for ``{start}`` / ``{end}`` in the URL. The defaults
        are read from ``query_vars`` once, when this function is defined.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``items[0]["results"]`` in the JSON response, with
        the ``timestamp`` field parsed to datetime and renamed to ``month``.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    url = NEW_PAGES_API.format(
        project = project,
        start = start,
        end = end
    )

    # `headers` is expected to already exist in the notebook namespace.
    r = requests.get(url, headers=headers)
    # Surface HTTP errors directly instead of failing later with an opaque
    # KeyError on "items" when the body is an error payload.
    r.raise_for_status()
    data = r.json()["items"][0]["results"]
    frame = pd.DataFrame(data)
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    frame = frame.rename(columns={"timestamp": "month"})

    return frame

### Wikipedias

In [14]:
# Collect one record per (project, month) by querying the API project by project.
results = []

for position, domain_row in enumerate(wp_domains_tuple_list):
    domain = domain_row[0]

    # Progress line for every tenth project.
    if position % 10 == 0:
        print_err("Now on the {}th project of {} ({})".format(position, num_domains, domain))

    frame = get_new_pages(project=domain).reset_index()
    frame["project"] = domain
    results.extend(frame.to_dict("records"))

    # Pause 20 milliseconds between requests.
    time.sleep(0.02)

# Assemble all collected records into a single frame.
new_per_wp = pd.DataFrame(results)

# Monthly total of new pages across all queried projects.
new_wp = (
    new_per_wp
    .groupby("month")["new_pages"]
    .sum()
    .rename("net_new_Wikipedia_articles")
    .reset_index()
);


Now on the 0th project of 4 (id.wikipedia.org)


# Clean edited-pages/new (epn) results

In [15]:
# Strip timezones returned by API so our month columns merge nicely.
# utc=True normalises whatever the column holds to UTC first; tz_localize(None)
# then drops the timezone, leaving naive timestamps.
new_per_wp["date"] = pd.to_datetime(new_per_wp["month"], utc=True).dt.tz_localize(None)
new_per_wp['month_year'] = new_per_wp['date'].dt.to_period('M')
# DataFrame.rename returns a new frame; the result must be kept for the rename
# to take effect (previously it was discarded, leaving the column as "new_pages").
new_per_wp = new_per_wp.rename(columns={"new_pages": "net_new_content_pages"})

#make sure month column is in datetime format
#new_per_wp['month'] = pd.to_datetime(new_per_wp['month'])

#create new column, 'year'
#new_per_wp['year'] = new_per_wp['month'].dt.year



In [16]:
# Create the countries column from each row's project domain.
# NOTE(review): add_country_column is not defined in this notebook (presumably
# it comes from the %run'd data-handling notebook). `.str[0]` keeps only the
# first element -- or first character -- of whatever it returns, which appears
# to be why 'M' has to be mapped back to 'MENA' below; confirm.
new_per_wp['countries'] = new_per_wp['project'].apply(add_country_column).str[0]
# Rename the MENA entry in the countries column.
new_per_wp['countries'] = new_per_wp['countries'].replace({'M':'MENA'})

#format datetime column
#new_per_wp['month'] = new_per_wp['month'].map(lambda x: x.strftime('%Y-%m'))

# Drop the leftover 'index' column. errors='ignore' keeps this cell re-runnable:
# `del new_per_wp['index']` raised KeyError on a second execution.
new_per_wp = new_per_wp.drop(columns='index', errors='ignore')

In [None]:
# NOTE(review): the "{}" placeholder in the file name is never filled with
# .format(), so the file is written literally as "edited_pages_new_{}.csv" --
# confirm the intended suffix.
new_per_wp.to_csv("../../data/processed/query_results/regional_counts/edited_pages_new_{}.csv",sep=',', encoding = 'utf-8', index=False)

### Edited-pages/aggregate <a class="anchor" id="edited_pages_aggregate"></a>
[Back to Table of Contents](#toc)

In [None]:
EDITED_PAGES_AGG_API = (
    "http://aqs1004.eqiad.wmnet:7232/analytics.wikimedia.org/v1/"
    "edited-pages/aggregate/{project}/user/content/all-activity-levels/monthly/{start}/{end}"
)

# The URL template used to hard-code this date range, which silently overrode
# the function's `start` / `end` arguments. The range is kept as the default so
# existing calls fetch the same data, while explicit arguments are now honoured.
EDITED_PAGES_AGG_DEFAULT_START = "20000101"
EDITED_PAGES_AGG_DEFAULT_END = "20191201"

# Create container for results
api_epa_results = []

def get_edited_pages_agg_count(
    project="all-projects",
    start=EDITED_PAGES_AGG_DEFAULT_START,
    end=EDITED_PAGES_AGG_DEFAULT_END
):
    """Fetch the monthly edited-pages/aggregate series for one project.

    The request is fixed to editor type ``user``, page type ``content``,
    ``all-activity-levels`` and ``monthly`` granularity (see
    ``EDITED_PAGES_AGG_API``).

    Parameters
    ----------
    project
        Value substituted for ``{project}`` in the URL, e.g. a wiki domain.
    start, end
        Values substituted for ``{start}`` / ``{end}`` in the URL. They default
        to the range that was previously hard-coded in the template.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``items[0]["results"]`` in the JSON response, with
        the ``timestamp`` field parsed to datetime and renamed to ``month``.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    epa_url = EDITED_PAGES_AGG_API.format(
        project = project,
        start = start,
        end = end
    )

    # `headers` is expected to already exist in the notebook namespace.
    r = requests.get(epa_url, headers=headers)
    # Surface HTTP errors directly instead of failing later with an opaque
    # KeyError on "items" when the body is an error payload.
    r.raise_for_status()
    data = r.json()["items"][0]["results"]
    frame = pd.DataFrame(data)
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    frame = frame.rename(columns={"timestamp": "month"})

    return frame

# Query the API for each project and append records to a list
epm_results = []

# Iterate the list of 1-tuples, not the `wp_domains` DataFrame itself:
# iterating a DataFrame yields its column labels, so `val[0]` was the first
# character of the column name ("d") rather than a project domain.
for idx, val in enumerate(wp_domains_tuple_list):
    domain = val[0]

    # Progress line for every tenth project.
    if idx % 10 == 0:
        msg = "Now on the {}th project of {} ({})"
        print(msg.format(idx, num_domains, domain))

    epm_frame = get_edited_pages_agg_count(project=domain).reset_index()
    epm_frame["project"] = domain
    epm_records = epm_frame.to_dict("records")
    epm_results.extend(epm_records)

    # Sleep 20 milliseconds
    time.sleep(0.02)

# Turn the big list of records into a data frame
edited_pages_m_r = pd.DataFrame(epm_results)