In [1]:
import pandas as pd
import wmfdata as wmf
from wmfdata import charting, mariadb, hive, spark
from wmfdata.utils import pct_str, pd_display_all

In [16]:
# February 2020 view counts, restricted to agent_type = 'user' and
# country_code = 'IN', for every article in florez.glow_tiger_articles.
#
# The pageview filters live in the ON clause rather than in WHERE: a WHERE
# predicate on pvh columns discards the NULL-extended rows of the LEFT JOIN,
# which silently turns it into an inner join and drops articles that have no
# matching pageviews. COALESCE reports those articles with a count of 0.
pv_india_users_feb = spark.run("""
SELECT
  gta.database_code AS wikicode,
  gta.page_id,
  gta.page_title,
  COALESCE(SUM(view_count), 0) AS feb_view_count_internal
FROM florez.glow_tiger_articles gta
LEFT JOIN canonical_data.wikis cdw
    ON cdw.database_code = gta.database_code
LEFT JOIN wmf.pageview_hourly pvh
    ON gta.page_id = pvh.page_id
    AND CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project
    AND pvh.year = 2020
    AND pvh.month = 2
    AND pvh.agent_type = 'user'
    AND pvh.country_code = 'IN'
GROUP BY
  gta.database_code, gta.page_id, gta.page_title
""")

In [None]:
# Display the India pageview query result for a visual check.
pv_india_users_feb

In [13]:
# February 2020 view counts with agent_type = 'user' (no country filter) for
# every article in florez.glow_tiger_articles.
#
# The pageview filter lives in the ON clause rather than in WHERE: a WHERE
# predicate on pvh columns discards the NULL-extended rows of the LEFT JOIN,
# which silently turns it into an inner join and drops articles that have no
# matching pageviews. COALESCE reports those articles with a count of 0.
pv_all_users_feb = spark.run("""
SELECT
  gta.database_code AS wikicode,
  gta.page_id,
  gta.page_title,
  COALESCE(SUM(view_count), 0) AS feb_view_count_global
FROM florez.glow_tiger_articles gta
LEFT JOIN canonical_data.wikis cdw
    ON cdw.database_code = gta.database_code
LEFT JOIN wmf.pageview_hourly pvh
    ON gta.page_id = pvh.page_id
    AND CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project
    AND pvh.year = 2020
    AND pvh.month = 2
    AND pvh.agent_type = 'user'
GROUP BY
    gta.database_code, gta.page_id, gta.page_title
""")

In [None]:
# Display the all-countries pageview query result for a visual check.
pv_all_users_feb

In [23]:
# Combine the global and India-only February view counts into a single frame.
# Both query results expose the wiki key as `wikicode` (aliased from
# gta.database_code in the SQL), so that is the join key; merging on
# 'database_code' raises KeyError because neither frame has such a column.
# The outer join keeps articles present in only one result, and fillna(0)
# replaces the resulting missing values with 0.
pv_df = pd.merge(
    pv_all_users_feb,
    pv_india_users_feb,
    on=['wikicode', 'page_id', 'page_title'],
    how='outer',
).fillna(0)

In [None]:
# Display the merged global + India pageview frame for a visual check.
pv_df

In [None]:
# Show rows that repeat an earlier row across every column.
pv_df.loc[pv_df.duplicated(keep='first')]

In [31]:
# Rows whose (page_id, wikicode) pair already appeared earlier in pv_df.
dupe_check = pv_df.loc[pv_df.duplicated(subset=['page_id', 'wikicode'], keep='first')]

In [32]:
# Number of rows flagged as duplicate (page_id, wikicode) pairs.
dupe_check.shape[0]

0

In [33]:
# Write the merged pageview counts to disk, without the DataFrame index.
pv_df.to_csv(
    "../../data/processed/query_results/content_quality/pv.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [None]:
# Merge the pageview counts into the interim frame

In [None]:
# Load the previously saved interim frame from CSV.
interim_frame_updated = pd.read_csv(
    "../../data/processed/query_results/content_quality/1_interim_frame_updated.csv",
    sep=',',
    encoding='utf-8',
    parse_dates=True,
)

In [None]:
# Attach pageview counts to the interim frame. The right join keeps every
# interim row; fillna(0) then replaces all remaining missing values with 0.
interim_frame_pv = (
    pv_df
    .merge(interim_frame_updated, on=['wikicode', 'page_id', 'page_title'], how='right')
    .fillna(0)
)

In [None]:
# Write the interim frame with pageview counts to disk, without the index.
interim_frame_pv.to_csv(
    "../../data/processed/query_results/content_quality/3_interim_frame_pv.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [None]:
# Merge the pageview counts into the full (final) frame

In [None]:
# Load the previously saved final frame from CSV.
final_frame_updated = pd.read_csv(
    "../../data/processed/query_results/content_quality/b1_final_frame_updated.csv",
    sep=',',
    encoding='utf-8',
    parse_dates=True,
)

In [None]:
# Attach pageview counts to the final frame. The right join keeps every
# final-frame row; fillna(0) then replaces all remaining missing values with 0.
final_frame_updated_pv = (
    pv_df
    .merge(final_frame_updated, on=['wikicode', 'page_id', 'page_title'], how='right')
    .fillna(0)
)

In [None]:
# Write the final frame with pageview counts to disk, without the index.
final_frame_updated_pv.to_csv(
    "../../data/processed/query_results/content_quality/b3_final_frame_updated_pv.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [None]:
# Re-runs the global February 2020 pageview query and rebinds
# `pv_all_users_feb`, a name an earlier cell in this notebook also assigns.
#
# The pageview filter lives in the ON clause rather than in WHERE: a WHERE
# predicate on pvh columns discards the NULL-extended rows of the LEFT JOIN,
# which silently turns it into an inner join and drops articles that have no
# matching pageviews. COALESCE reports those articles with a count of 0.
pv_all_users_feb = spark.run("""
SELECT
  gta.database_code AS wikicode,
  gta.page_id,
  gta.page_title,
  COALESCE(SUM(view_count), 0) AS feb_view_count_global
FROM florez.glow_tiger_articles gta
LEFT JOIN canonical_data.wikis cdw
    ON cdw.database_code = gta.database_code
LEFT JOIN wmf.pageview_hourly pvh
    ON gta.page_id = pvh.page_id
    AND CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project
    AND pvh.year = 2020
    AND pvh.month = 2
    AND pvh.agent_type = 'user'
GROUP BY
    gta.database_code, gta.page_id, gta.page_title
""")

In [14]:
# February 2020 view counts with agent_type = 'user', broken down by
# referer_class, for articles in florez.glow_tiger_articles.
# NOTE(review): the WHERE filter on pvh.agent_type removes the NULL-extended
# rows of the LEFT JOIN, so articles without matching pageviews do not appear
# in this result.
_referrer_sql = """
SELECT 
  gta.database_code AS wikicode,
  gta.page_id,
  gta.page_title,
  SUM(view_count) as feb_view_count_global,
  referer_class
FROM florez.glow_tiger_articles gta
LEFT JOIN canonical_data.wikis cdw
    ON cdw.database_code = gta.database_code
LEFT JOIN wmf.pageview_hourly pvh
    ON gta.page_id = pvh.page_id
    AND CONCAT(cdw.language_code, '.', cdw.database_group) = pvh.project 
    AND pvh.year = 2020
    AND pvh.month = 2
WHERE
    pvh.agent_type='user'
GROUP BY 
    gta.database_code, gta.page_id, gta.page_title, referer_class
"""
feb_global_views_referrer = spark.run(_referrer_sql)

In [None]:
# Display the referrer breakdown ordered by page_id (ascending).
feb_global_views_referrer.sort_values('page_id', ascending=True)

In [None]:
# Copy of the referrer breakdown with the wiki key renamed to `database_code`.
# The source frame is `feb_global_views_referrer` (double "r"); reading from the
# misspelled, never-defined `feb_global_views_referer` raised NameError.
# The renamed copy keeps the name this cell has always assigned, so
# `feb_global_views_referrer` itself (and anything exported from it) is unchanged.
feb_global_views_referer = feb_global_views_referrer.rename(columns={'wikicode': 'database_code'})

In [17]:
# Write the referrer breakdown to disk, without the DataFrame index.
feb_global_views_referrer.to_csv(
    "../../data/processed/query_results/content_quality/feb_global_views_referrer.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)