In [2]:
import pandas as pd
import wmfdata as wmf
from wmfdata import charting, mariadb, hive, spark
from wmfdata.utils import pct_str, pd_display_all

You are using wmfdata v1.0.3, but v1.0.4 is available.

To update, run `pip install --upgrade git+https://github.com/neilpquinn/wmfdata.git@release`.

To see the changes, refer to https://github.com/neilpquinn/wmfdata/blob/release/CHANGELOG.md


In [8]:
# Load the article list whose pageviews are looked up below; the lookup reads
# its `language_code` and `page_id` columns.
articles_csv_path = "../../data/processed/query_results/content_quality/indonesia/CQ_all_articles.csv"
articles = pd.read_csv(articles_csv_path)

### Get pageviews for all articles in the CSV (articles from various wikis)

In [19]:
#mediawiki_page_history - https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Edits/Mediawiki_page_history
#fyi you need to truncate. Recommend 30 day period: https://meta.wikimedia.org/wiki/Research:Autoconfirmed_article_creation_trial#H14:_The_survival_rate_of_newly_created_articles_by_autoconfirmed_users_will_remain_stable
# Article deletion rates and article timespan - http://files.grouplens.org/papers/lam_group2009_wikipedia-longer-tail.pdf

#note - archive table in Mariadb holds revisions for pages that have been deleted

#snapshot = "{MWH_SNAPSHOT}"
#AND event_timestamp >="{contest_start}"
#AND event_timestamp <"{contest_end}"
        
# Module-level accumulators that get_pageviews_hive() appends to, one query
# result per wiki: Indonesia-only views and views from all countries.
pageview_articles, pageview_all_articles = [], []

def get_pageviews_hive(df):
    """Fetch July 2020 user pageviews for the articles in ``df``, one wiki at a time.

    For every distinct ``language_code`` in ``df`` two queries are run through
    ``spark.run`` against ``wmf.pageview_hourly``: one restricted to
    ``country_code = 'ID'`` and one without a country restriction. Both are
    limited to ``agent_type = 'user'``, to Wikipedia projects, and to that
    wiki's ``page_id`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``language_code`` and ``page_id`` columns. Rows with a
        missing value in either column are ignored.

    Returns
    -------
    list
        The module-level ``pageview_articles`` list (Indonesia-only results,
        one entry per wiki). The unrestricted results are appended to the
        module-level ``pageview_all_articles`` list.

    Notes
    -----
    Results are appended to the module-level lists, so calling this function
    again without re-creating those lists accumulates duplicate entries.
    """

    # The WHERE clause filters on cdw columns, so only rows with a matching
    # canonical_data.wikis entry survive the LEFT JOIN.
    pv_indonesia_users_july = """
    SELECT 
      project,
      page_id,
      page_title,
      SUM(view_count) as july_view_count_internal
    FROM  wmf.pageview_hourly pvh
    LEFT JOIN canonical_data.wikis cdw
        ON CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project 
    WHERE
        pvh.agent_type='user'
        AND pvh.country_code = 'ID'
        AND pvh.year = 2020
        AND pvh.month = 7
        AND pvh.page_id IN {ids}
        AND cdw.database_group = 'wikipedia'
        AND cdw.language_code = '{language_code}'
    GROUP BY 
      project, page_id, page_title
    """

    # Same query without the country filter.
    # NOTE(review): the sum is still aliased `july_view_count_internal` here,
    # the same name as in the Indonesia-only query -- confirm whether a
    # distinct name (e.g. a "global" suffix) was intended.
    pv_all_users_july = """
    SELECT 
      project,
      page_id,
      page_title,
      SUM(view_count) as july_view_count_internal
    FROM  wmf.pageview_hourly pvh
    LEFT JOIN canonical_data.wikis cdw
        ON CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project 
    WHERE
        pvh.agent_type='user'
        AND pvh.year = 2020
        AND pvh.month = 7
        AND pvh.page_id IN {ids}
        AND cdw.database_group = 'wikipedia'
        AND cdw.language_code = '{language_code}'
    GROUP BY 
      project, page_id, page_title
    """

    for wiki in df['language_code'].dropna().unique():
        wiki_page_ids = df.loc[df['language_code'] == wiki, 'page_id'].dropna()
        if wiki_page_ids.empty:
            # Nothing to look up for this wiki; "IN ()" would be invalid SQL.
            continue

        # Build the IN list by hand. Formatting a Python tuple breaks when a
        # wiki has exactly one article, because "(123,)" is not valid SQL.
        ids = "(" + ", ".join(str(int(page_id)) for page_id in wiki_page_ids.unique()) + ")"

        pageview_articles.append(
            spark.run(pv_indonesia_users_july.format(ids=ids, language_code=wiki))
        )
        pageview_all_articles.append(
            spark.run(pv_all_users_july.format(ids=ids, language_code=wiki))
        )

    return pageview_articles

In [None]:
# Run the per-wiki queries. get_pageviews_hive() appends each wiki's results to
# the module-level pageview_articles / pageview_all_articles lists.
get_pageviews_hive(articles)

# Stack the per-wiki results: Indonesia-only views, then views from all countries.
pv_id_query_data = pd.concat(pageview_articles)
# `pageviews_all_results` is a local variable inside get_pageviews_hive and is
# not defined at this level; the accumulated list is `pageview_all_articles`.
pv_all_query_data = pd.concat(pageview_all_articles)

In [10]:
# Derive the language code from the project name: the text before the first
# '.' in `project`.
project_name_parts = pv_id_query_data['project'].str.split('.')
pv_id_query_data['language_code'] = project_name_parts.str.get(0)

In [5]:
# Save the Indonesia-only July pageview results.
# NOTE(review): the output path contains "2019" while the query filters on
# year 2020 -- confirm this is the intended folder.
pv_id_query_data.to_csv(
    "../../data/raw/articles/2019/query_results/content_quality/per_wiki_full/Indonesia/july_incountry_pageviews.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [10]:
# Right join keeps every row of `articles`; articles without a matching
# pageview row (and any other missing values) are filled with 0.
pv_df = pv_id_query_data.merge(
    articles,
    how='right',
    on=['language_code', 'page_id', 'page_title'],
).fillna(0)

In [11]:
# Save the article list joined with its Indonesia-only July pageviews.
pv_df.to_csv(
    "../../data/raw/articles/2019/query_results/content_quality/per_wiki_full/Indonesia/articles_w_july_incountry_pageviews.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

### Get pageviews for GLOW articles listed in the GLOW Hive table

In [16]:
# February 2020 user pageviews from India (country_code = 'IN') for every
# article in florez.glow_tiger_articles, summed per wiki and page.
# NOTE(review): the WHERE clause filters on pvh columns, so articles with no
# matching pageview rows are dropped even though pvh is LEFT JOINed -- confirm
# that is intended.
pv_india_users_feb_query = """
SELECT 
  gta.database_code AS wikicode,
  gta.page_id,
  gta.page_title,
  SUM(view_count) as feb_view_count_internal
FROM florez.glow_tiger_articles gta
LEFT JOIN canonical_data.wikis cdw
    ON cdw.database_code = gta.database_code
LEFT JOIN wmf.pageview_hourly pvh
    ON gta.page_id = pvh.page_id
    AND CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project 
    AND pvh.year = 2020
    AND pvh.month = 2
WHERE
    pvh.agent_type='user'
    AND pvh.country_code = 'IN'
GROUP BY 
  gta.database_code, gta.page_id, gta.page_title
"""
pv_india_users_feb = spark.run(pv_india_users_feb_query)

In [13]:
# February 2020 user pageviews from all countries for every article in
# florez.glow_tiger_articles, summed per wiki and page.
# NOTE(review): the WHERE clause filters on pvh.agent_type, so articles with no
# matching pageview rows are dropped even though pvh is LEFT JOINed -- confirm
# that is intended.
pv_all_users_feb_query = """
SELECT 
  gta.database_code AS wikicode,
  gta.page_id,
  gta.page_title,
  SUM(view_count) as feb_view_count_global
FROM florez.glow_tiger_articles gta
LEFT JOIN canonical_data.wikis cdw
    ON cdw.database_code = gta.database_code
LEFT JOIN wmf.pageview_hourly pvh
    ON gta.page_id = pvh.page_id
    AND CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project 
    AND pvh.year = 2020
    AND pvh.month = 2
WHERE
    pvh.agent_type='user'
GROUP BY 
    gta.database_code, gta.page_id, gta.page_title
"""
pv_all_users_feb = spark.run(pv_all_users_feb_query)

In [None]:
# Preview the first rows only instead of rendering the whole result frame.
pv_all_users_feb.head()

In [23]:
# Combine global and India-only February views per article. Both query results
# expose the wiki as `wikicode` (aliased from gta.database_code), so that is the
# join key; merging on 'database_code' raises KeyError because neither frame
# has such a column.
# The outer join keeps articles present in either result; missing counts become 0.
pv_df = pd.merge(
    pv_all_users_feb,
    pv_india_users_feb,
    on=['wikicode', 'page_id', 'page_title'],
    how='outer',
).fillna(0)

In [33]:
# Save the combined February pageview counts.
pv_df.to_csv(
    "../../data/processed/query_results/content_quality/pv.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [None]:
# Merge the pageview counts into the full article data frame.

In [None]:
# Load the previously saved article frame that the pageview counts are merged into.
final_frame_updated = pd.read_csv(
    "../../data/processed/query_results/content_quality/b1_final_frame_updated.csv",
    sep=',',
    encoding='utf-8',
    parse_dates=True,
)

In [None]:
# Right join keeps every row of final_frame_updated; rows without pageview
# data (and any other missing values) are filled with 0.
final_frame_updated_pv = pv_df.merge(
    final_frame_updated,
    how='right',
    on=['wikicode', 'page_id', 'page_title'],
).fillna(0)

In [None]:
# Save the article frame enriched with pageview counts.
final_frame_updated_pv.to_csv(
    "../../data/processed/query_results/content_quality/b3_final_frame_updated_pv.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [None]:
# February 2020 user pageviews from all countries for every article in
# florez.glow_tiger_articles, summed per wiki and page.
# NOTE(review): this query is identical to the one in the earlier cell that
# also assigns pv_all_users_feb; running this cell re-issues the same query.
pv_all_users_feb_query = """
SELECT 
  gta.database_code AS wikicode,
  gta.page_id,
  gta.page_title,
  SUM(view_count) as feb_view_count_global
FROM florez.glow_tiger_articles gta
LEFT JOIN canonical_data.wikis cdw
    ON cdw.database_code = gta.database_code
LEFT JOIN wmf.pageview_hourly pvh
    ON gta.page_id = pvh.page_id
    AND CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project 
    AND pvh.year = 2020
    AND pvh.month = 2
WHERE
    pvh.agent_type='user'
GROUP BY 
    gta.database_code, gta.page_id, gta.page_title
"""
pv_all_users_feb = spark.run(pv_all_users_feb_query)

In [14]:
# February 2020 user pageviews from all countries for every article in
# florez.glow_tiger_articles, summed per wiki, page and referer_class.
feb_global_views_referrer_query = """
SELECT 
  gta.database_code AS wikicode,
  gta.page_id,
  gta.page_title,
  SUM(view_count) as feb_view_count_global,
  referer_class
FROM florez.glow_tiger_articles gta
LEFT JOIN canonical_data.wikis cdw
    ON cdw.database_code = gta.database_code
LEFT JOIN wmf.pageview_hourly pvh
    ON gta.page_id = pvh.page_id
    AND CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project 
    AND pvh.year = 2020
    AND pvh.month = 2
WHERE
    pvh.agent_type='user'
GROUP BY 
    gta.database_code, gta.page_id, gta.page_title, referer_class
"""
feb_global_views_referrer = spark.run(feb_global_views_referrer_query)