# GLOW - Data Handling

## Table of Contents  <a class="anchor" id="toc"></a>

* [Data Handling](#top)
    1. [functions](#nea)
    2. [filter definitions](#pagelen)
    3. [methods](#pageviews_detailed)

In [2]:
import datetime as dt
import gc
import weakref
from datetime import date
from datetime import datetime, timedelta

import dateutil
import pandas as pd
from dateutil.relativedelta import relativedelta
from dateutil.relativedelta import *

## Functions
[Back to Table of Contents](#toc)

## Data Handling
[Back to Table of Contents](#toc)

In [3]:
def top_10(df, col):
    """Return the ten rows of ``df`` that have the largest values in ``col``.

    :param df: frame to rank
    :param col: column label (or list of labels) to sort on, descending
    :return: a new frame holding at most ten rows, largest first
    """
    ranked = df.sort_values(by=col, ascending=False)
    return ranked.head(10)

In [4]:
def merge_in(df, on="database_code"):
    """Left-join ``df`` onto the module-level ``wikis`` frame, in place.

    The global ``wikis`` is rebound to the joined result; every missing
    value in that result is replaced by 0. Nothing is returned.

    :param df: frame with the extra columns to attach
    :param on: name of the join column(s) shared by both frames
    """
    global wikis
    joined = pd.merge(wikis, df, on=on, how="left")
    wikis = joined.fillna(0)

In [5]:
def merge_in_content(df, on="page_id"):
    """Left-join ``df`` onto the module-level ``articles`` frame, in place.

    The global ``articles`` is rebound to the joined result; every missing
    value in that result is replaced by 0. Nothing is returned.

    :param df: frame with the extra columns to attach
    :param on: name of the join column(s) shared by both frames
    """
    global articles
    joined = pd.merge(articles, df, on=on, how="left")
    articles = joined.fillna(0)

In [6]:
def _wiki_in_project(wiki, project):
    '''Return True when ``wiki`` is exactly one of ``project``'s wiki domains.'''
    # get_quoted_wiki_domains() returns ONE string such as
    # '"bn.wikipedia.org", "hi.wikipedia.org"'. Testing `wiki in <that string>`
    # is a substring test, so "cy.wikipedia.org" would match "tcy.wikipedia.org".
    # Split it back into individual domains and compare whole values instead.
    quoted = project.get_quoted_wiki_domains()
    domains = {part.strip().strip('"') for part in quoted.split(',')}
    return wiki.strip().strip('"') in domains


def add_country_column(wiki):
    '''
    Map a wiki domain to the country label of the Glow project it belongs to.

    :param wiki: full wiki domain, e.g. ``bn.wikipedia.org`` (surrounding
        double quotes and whitespace are tolerated)
    :type wiki: str

    :return: the project's country list for India, Indonesia, Argentina and
        Nigeria; the string ``'MENA'`` for the MENA project; ``'not_known'``
        when the domain belongs to none of them or ``wiki`` is not a string
    '''
    if not isinstance(wiki, str):
        # e.g. NaN in a DataFrame column: there is no domain to look up.
        return 'not_known'

    # Projects are checked lazily and in this order, first match wins.
    if _wiki_in_project(wiki, glow_india):
        return glow_india.get_countries()
    elif _wiki_in_project(wiki, glow_indonesia):
        return glow_indonesia.get_countries()
    elif _wiki_in_project(wiki, glow_mena):
        # MENA spans several countries, so it is reported under one label.
        return 'MENA'
    elif _wiki_in_project(wiki, glow_argentina):
        return glow_argentina.get_countries()
    elif _wiki_in_project(wiki, glow_nigeria):
        return glow_nigeria.get_countries()
    else:
        return 'not_known'

In [7]:
def create_fill_column(x):
    """Label a timestamp relative to the contest window in ``quality_vars``.

    :param x: value comparable with ``quality_vars['contest_start_dt']`` and
        ``quality_vars['contest_end_dt']``
    :return: ``'expanded'`` when ``x`` is before the contest start, ``'post'``
        when it is after the contest end, ``'new'`` when it lies inside the
        window (both ends included), and ``None`` when no comparison holds
    """
    window_open = quality_vars.get('contest_start_dt')
    if x < window_open:
        return 'expanded'

    window_close = quality_vars.get('contest_end_dt')
    if x > window_close:
        return 'post'

    if x >= window_open and x <= window_close:
        return 'new'

    # Reached only when every comparison is False (e.g. a not-a-time value).
    return None

In [8]:
# Bin edges used to separate the number of editors/links into categories.
epa_bins = [
    0, 1, 2, 4, 6, 8, 10,
    20, 100, 200,
]
oel_bins = [
    0, 1, 2, 4, 6, 8, 10,
    12, 14, 16, 18, 20,
    100, 200,
]
oext_bins = [
    0, 1, 2, 4, 6, 8, 10,
    12, 14, 16, 18, 20,
    30, 40, 50,
    100, 200,
]

## Filter definitions
[Back to Table of Contents](#toc)

In [9]:
#Parent class
class GlowProject:
    '''
    A Glow project: one or more countries plus the Wikipedia language
    editions associated with them.

    Every instance is recorded, as a weak reference, in a class-level
    registry when it is constructed. The ``get_quoted_glow_*`` helpers use
    that registry to build lists that cover every project still alive,
    in the order the projects were created.
    '''

    #Expect boolean
    #glow_project = glow_project

    # Weak references to the instances created so far (creation order).
    # Weak, so that rebinding or deleting a project lets it disappear from
    # the "Glow at large" lists instead of being kept alive by the registry.
    _instances = []

    def __init__(self, countries, iso_codes, wiki_codes, wiki_names):
        '''
        :param countries: names of the countries that are part of this project
        :type countries: list

        :param iso_codes: iso_codes of the countries that are part of this project
        :type iso_codes: list

        :param wiki_codes: abbreviated codes of the wikis that are part of this project
        :type wiki_codes: list

        :param wiki_names: full names of the wikis that are part of this project (expected to be corresponding to the codes in `wiki_codes`)
        :type wiki_names: list

        Project start and end dates are not constructor parameters (yet).
        '''
        self.countries = countries
        self.iso_codes = iso_codes
        self.wiki_codes = wiki_codes #wiki abbreviation
        self.wiki_names = wiki_names #language name for the wiki

        # Register last, so the registry never exposes a half-built project.
        GlowProject._instances.append(weakref.ref(self))

    @classmethod
    def getinstances(cls):
        '''
        Yield every live instance of ``cls`` (subclass instances included),
        in creation order. References to instances that no longer exist are
        dropped from the registry as a side effect.
        '''
        resolved = [(ref, ref()) for ref in GlowProject._instances]
        GlowProject._instances[:] = [ref for ref, obj in resolved if obj is not None]
        for _, obj in resolved:
            if isinstance(obj, cls):
                yield obj

    @staticmethod
    def get_glow_meta_info(): #TO DO
        '''Print the country list of every live Glow project.'''
        for obj in GlowProject.getinstances():
            print (obj.countries)

    @staticmethod
    def _collect(attr):
        '''Concatenate the list attribute ``attr`` of every live project.'''
        return [item for obj in GlowProject.getinstances() for item in getattr(obj, attr)]

    def get_countries(self):
        '''Return the list of country names of this project.'''
        return (self.countries)

    def get_wiki_names(self):
        '''Return the list of full wiki names of this project.'''
        return(self.wiki_names)

    def get_wiki_dbs_mariadb(self):
        '''
        Return a list of all the wiki_db+"wiki", databases,
        associated with this project for use with MariaDB queries.
        '''
        return ["{}wiki".format(wiki_code.lower()) for wiki_code in self.wiki_codes]

    def get_quoted_wiki_dbs(self):
        '''
        Return a quoted, comma-separated list of the names of the wiki_dbs, databases,
        associated with this project. Example: "bnwiki".
        '''
        return ', '.join('"{}wiki"'.format(wiki_code.lower()) for wiki_code in self.wiki_codes)

    def get_wiki_codes(self):
        '''
        Return a comma-separated list (one string, NOT quoted) of the
        wiki_codes associated with this project, in their original case.
        '''
        return ', '.join('{}'.format(wiki_code) for wiki_code in self.wiki_codes)

    def get_quoted_wiki_projects(self):
        '''
        Return a quoted, comma-separated list of the wiki projects
        associated with this project. Example: "bn.wikipedia".
        '''
        return ', '.join('"{}.wikipedia"'.format(wiki_code.lower()) for wiki_code in self.wiki_codes)

    def get_quoted_wiki_domains(self):
        '''
        Return a quoted, comma-separated list of the wiki domains
        associated with this project. Example: "bn.wikipedia.org".
        '''
        return ', '.join('"{}.wikipedia.org"'.format(wiki_code.lower()) for wiki_code in self.wiki_codes)

    # The helpers below describe the Glow project at large, i.e. the union of
    # all live GlowProject instances. They are static so that they can be
    # called on the class (as before) and on an instance alike.

    @staticmethod
    def get_quoted_glow_countries():
        '''
        Return a quoted, comma-separated list of the country or countries
        associated with the Glow project at large.
        '''
        return ', '.join('"{}"'.format(country) for country in GlowProject._collect('countries'))

    @staticmethod
    def get_quoted_glow_wiki_codes():
        '''
        Return a quoted, comma-separated list of the wiki_codes
        associated with the Glow project at large.
        '''
        return ', '.join('"{}"'.format(wiki_code) for wiki_code in GlowProject._collect('wiki_codes'))

    @staticmethod
    def get_quoted_glow_wiki_dbs():
        '''
        Return a quoted, comma-separated list of the wiki_dbs
        associated with the Glow project at large. Example: "bnwiki".
        '''
        return ', '.join('"{}wiki"'.format(wiki_code.lower()) for wiki_code in GlowProject._collect('wiki_codes'))

    @staticmethod
    def get_quoted_glow_wiki_projects():
        '''
        Return a quoted, comma-separated list of the wiki projects
        associated with the Glow project at large. Example: "bn.wikipedia".
        '''
        return ', '.join('"{}.wikipedia"'.format(wiki_code.lower()) for wiki_code in GlowProject._collect('wiki_codes'))

    @staticmethod
    def get_quoted_glow_wiki_domains():
        '''
        Return a quoted, comma-separated list of the wiki domains
        associated with the Glow project at large. Example: "bn.wikipedia.org".
        '''
        return ', '.join('"{}.wikipedia.org"'.format(wiki_code.lower()) for wiki_code in GlowProject._collect('wiki_codes'))
    

In [10]:
#countries, iso_codes, wiki_codes, wiki_names
# wiki_names are listed in the same order as wiki_codes, so that the n-th
# name describes the n-th code.

glow_india = GlowProject(
    ['India'],
    ['IN'],
    ['AS','BN','GU','HI','KN','ML','MR','OR','PA', #add in sorted order
     'PNB','SA','SAT','TA','TCY','TE','UR'],#add in sorted order
    ['Assamese Wikipedia', 'Bengali Wikipedia', 'Gujarati Wikipedia', 'Hindi Wikipedia',
     'Kannada Wikipedia', 'Malayalam Wikipedia', 'Marathi Wikipedia', 'Odia Wikipedia',
     'Punjabi Wikipedia', 'Punjabi Western Wikipedia', 'Sanskrit Wikipedia', 'Santali Wikipedia',
     'Tamil Wikipedia', 'Tulu Wikipedia', 'Telugu Wikipedia', 'Urdu Wikipedia'],

    ##Adding start and end time of the project. In this
    ##case they're defined by today, TBD
    #(dt.datetime.today() - dt.timedelta(days=30)).strftime('%Y-%m'),
    #dt.datetime.today().strftime('%Y-%m'),
)


glow_indonesia = GlowProject(
    ['Indonesia'],
    ['ID'],
    ['ID', 'JV', 'MIN', 'SU'],
    ['Indonesian Wikipedia', 'Javanese Wikipedia', 'Minangkabau Wikipedia', 'Sundanese Wikipedia'],
)

glow_mena = GlowProject(
    ['Egypt','Jordan','Tunisia','Algeria','Morocco','Lebanon','Iraq'],
    ['EG','JO','TN','DZ','MA','LB','IQ'],
    ['AR'],
    ['Arabic Wikipedia'],

)


In [11]:
#fyi, big wikis list is hard coded... "wikidatawiki", "commonswiki","enwiki"

# India's wiki database names as a plain list, for querying databases
# that can be accessed via wmf.mariadb.
india_glow_wiki_dbs_mariadb = glow_india.get_wiki_dbs_mariadb()

# Contest window as YYYY-MM-DD strings.
contest_start, contest_end = '2019-10-10', '2020-02-11'

# Metric month. The mediawiki_history snapshot must be from the metrics month or later.
# Stepping one day back from the 1st of the current month gives the last
# day of the previous month.
last_month = dt.date.today().replace(day=1) + dt.timedelta(days=-1)

# Fixed 365-day span starting on 2019-01-01.
start = dt.datetime(2019, 1, 1)
end = start + dt.timedelta(days=365)

### Content metrics via API

In [12]:
# Request headers for the API calls; the User-Agent names this project.
headers = {"User-Agent": "Partnerships' GLOW project Data Analyst"}

# First month of page metrics, as YYYY-MM (an earlier run used "2016-01").
PAGES_START = "2019-01"

# The same month as a pandas Period, so we can easily use it to format strings.
metrics_month = pd.Period(PAGES_START)

### query_vars

In [13]:
#str/tuple/dt/datetime  format query variables for querying databases that can be accessed via wmf.hive

# "Now" is read once, so that every TODAY_* / M_* / Y_* entry below describes
# the same instant even if the cell runs across midnight.
_now = datetime.now()

# Calendar-aware offsets. Fixed 30/60/365/730-day offsets land in the wrong
# month on some dates (30 days before 1 March of a non-leap year is in
# January; 30 days before the 31st is still the same month), so month and
# year steps are taken with relativedelta instead.
_one_month_ago = _now + relativedelta(months=-1)
_two_months_ago = _now + relativedelta(months=-2)
_one_year_ago = _now + relativedelta(years=-1)
_two_years_ago = _now + relativedelta(years=-2)

# The contest boundaries, parsed once from their YYYY-MM-DD strings.
_contest_start_dt = datetime.strptime(contest_start, '%Y-%m-%d')
_contest_end_dt = datetime.strptime(contest_end, '%Y-%m-%d')
_contest_start_12m_pre = _contest_start_dt + relativedelta(months=-12)
_contest_start_next_m = _contest_start_dt + relativedelta(months=1)
_contest_end_next_m = _contest_end_dt + relativedelta(months=1)

# Key suffixes: no suffix -> 'YYYY-MM', _FULL -> 'YYYY-MM-DD', _pv -> 'YYYYMM'.
# TODO: *_country_codes / glow_iso_codes entries need a
# GlowProject.get_quoted_iso_codes() accessor, which does not exist yet.
query_vars = dict(
    india_countries = glow_india.get_countries(),
    india_wiki_codes = glow_india.get_wiki_codes(),
    india_wiki_dbs = glow_india.get_quoted_wiki_dbs(),
    india_wiki_projects = glow_india.get_quoted_wiki_projects(),
    india_domains = glow_india.get_quoted_wiki_domains(),

    indonesia_countries = glow_indonesia.get_countries(),
    indonesia_wiki_dbs = glow_indonesia.get_quoted_wiki_dbs(),
    indonesia_wiki_projects = glow_indonesia.get_quoted_wiki_projects(),
    indonesia_domains = glow_indonesia.get_quoted_wiki_domains(),

    mena_countries = glow_mena.get_countries(),
    mena_wiki_dbs = glow_mena.get_quoted_wiki_dbs(),
    mena_wiki_projects = glow_mena.get_quoted_wiki_projects(),
    mena_domains = glow_mena.get_quoted_wiki_domains(),

    glow_countries = GlowProject.get_quoted_glow_countries(),
    glow_wiki_codes = GlowProject.get_quoted_glow_wiki_codes(),
    glow_wiki_dbs = GlowProject.get_quoted_glow_wiki_dbs(),
    glow_projects = GlowProject.get_quoted_glow_wiki_projects(),
    glow_domains = GlowProject.get_quoted_glow_wiki_domains(),

    wikis_big = ("enwiki", "wikidatawiki", "commonswiki", "wikisource"),

    MWH_SNAPSHOT = last_month.strftime("%Y-%m"),

    TODAY_DATE = _now.strftime('%Y-%m'),
    TODAY_DATE_FULL= _now.strftime('%Y-%m-%d'),
    TODAY_DATE_pv = _now.strftime('%Y%m'),

    # one / two calendar months before today
    M_START_DATE = _one_month_ago.strftime('%Y-%m'),

    M2_START_DATE = _two_months_ago.strftime('%Y-%m'),
    M2_START_DATE_pv = _two_months_ago.strftime('%Y%m'),

    # one / two calendar years before today
    Y_START_DATE = _one_year_ago.strftime('%Y-%m'),
    Y_START_DATE_pv = _one_year_ago.strftime('%Y%m'),
    Y_START_DATE_FULL = _one_year_ago.strftime('%Y-%m-%d'),

    Y2_START_DATE = _two_years_ago.strftime('%Y-%m'),

#API data_params
    api_metrics_month_start = metrics_month.asfreq("D", how="start").strftime("%Y%m%d"),
    api_metrics_month_end = (metrics_month + 14).asfreq("D", how="start").strftime("%Y%m%d"), #50

    # relativedelta(..., day=1) shifts by whole months and then pins the
    # result to the first day of that month.
    contest_start_dt_first_day_of_m = _contest_start_dt.replace(day=1).strftime('%Y-%m-%d'),
    contest_start_dt_first_day_5m_pre = (_contest_start_dt + relativedelta(months=-5, day=1)).strftime('%Y-%m-%d'),
    contest_start_dt_first_day_12m_pre = (_contest_start_dt + relativedelta(months=-12, day=1)).strftime('%Y-%m-%d'),

    contest_end_dt_first_day_next_m = (_contest_end_dt + relativedelta(months=1, day=1)).strftime('%Y-%m-%d'),

#5c collect baselines date_params
    contest_start_dt = _contest_start_dt.strftime('%Y-%m'),
    contest_start_dt_FULL = _contest_start_dt.strftime('%Y-%m-%d'),
    contest_start_dt_pv = _contest_start_dt.strftime('%Y%m'),

    contest_start_dt_next_m_pv = _contest_start_next_m.strftime('%Y%m'),
    contest_start_dt_12m_pre_pv = _contest_start_12m_pre.strftime('%Y%m'),

    #prior round
    contest_start_dt_12m_pre = _contest_start_12m_pre.strftime('%Y-%m'),
    contest_start_dt_13m_pre = (_contest_start_dt + relativedelta(months=-13)).strftime('%Y-%m'),

    contest_start_dt_12m_pre_FULL  = _contest_start_12m_pre.strftime('%Y-%m-%d'),

    #current round
    contest_end_dt = _contest_end_dt.strftime('%Y-%m'),
    contest_end_dt_FULL = _contest_end_dt.strftime('%Y-%m-%d'),

    contest_end_dt_next_m = _contest_end_next_m.strftime('%Y-%m'),
    contest_end_dt_next_m_pv = _contest_end_next_m.strftime('%Y%m'),
    contest_end_dt_pv = _contest_end_dt.strftime('%Y%m'),

)

In [19]:
#need country codes variable..now just have countries
# TODO: expose ISO country codes as well; for now query_vars only carries
# country names. This cell just displays the India entry as a check.
query_vars['india_countries']

['India']

In [21]:
# Persist query_vars with the IPython %store magic (not plain Python; this
# line only works inside IPython/Jupyter).
%store query_vars

Stored 'query_vars' (dict)


### content quality

In [75]:
#variables and dates for content quality notebook
#change pv month, country

# Base entries: the country plus the contest window, both as the raw
# YYYY-MM-DD strings and as parsed datetime objects.
quality_vars = {
    "country_code": "IN",
    "contest_start": contest_start,
    "contest_end": contest_end,
    "contest_start_dt": datetime.strptime(contest_start, '%Y-%m-%d'),
    "contest_end_dt": datetime.strptime(contest_end, '%Y-%m-%d'),
}

# Derived entries: zero-padded month/day strings for the contest end and for
# the date 30 days after it, plus the mediawiki_history snapshot month.
quality_vars.update(
    contest_end_dt_month=quality_vars["contest_end_dt"].strftime('%m'),
    contest_end_dt_day=quality_vars["contest_end_dt"].strftime('%d'),
    contest_end_dt_1M_month=(quality_vars["contest_end_dt"] + timedelta(days=30)).strftime('%m'),
    contest_end_dt_1M_day=(quality_vars["contest_end_dt"] + timedelta(days=30)).strftime('%d'),
    MWH_SNAPSHOT=last_month.strftime("%Y-%m"),
)

In [None]:
# Persist quality_vars with the IPython %store magic (not plain Python; this
# line only works inside IPython/Jupyter).
%store quality_vars