In [1]:
#https://pypi.org/project/chromedriver-binary/
#https://sites.google.com/a/chromium.org/chromedriver/getting-started/chromeos
#tools: https://chrome.google.com/webstore/detail/xpath-helper/hgimnogjllphhhkhlmebbmlgjoejdpjl?hl=en
#https://chromedriver.chromium.org/
import requests
import string
import pandas as pd
from lxml import html
import lxml
from bs4 import BeautifulSoup

from urllib.request import Request, urlopen
import time
import re
import os
 
from selenium import webdriver
import chromedriver_binary  # Adds chromedriver binary to path
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

# Chrome launch flags: tolerate certificate errors, use an incognito profile,
# and run without a visible browser window.
options = webdriver.ChromeOptions()
for chrome_flag in ('--ignore-certificate-errors', '--incognito', '--headless'):
    options.add_argument(chrome_flag)

## Get URLs to query

In [None]:
#run on local machine
#/Documents/projects/wikimedia/_data_collection/collect_wiki_article_counts.ipynb

In [6]:
# Treat https://meta.wikimedia.org/wiki/List_of_Wikipedias/Table as the source of truth for wiki counts.
#
# Use the MediaWiki Revisions API (https://www.mediawiki.org/wiki/API:Revisions) to collect the
# revision ids listed in the history of that page:
# https://meta.wikimedia.org/w/index.php?title=List_of_Wikipedias/Table&action=history

S = requests.Session()

URL = "https://meta.wikimedia.org/w/api.php"

PARAMS = {
    "action": "query",
    "prop": "revisions",
    "titles": "List of Wikipedias/Table",
    "rvprop": "timestamp|ids",
    "rvdir": "newer",
    "rvstart": "2019-10-01T00:00:00.000Z",
    "formatversion": "2",
    "format": "json",
    "rvlimit": "500",
}

# Collect the revisions batch by batch. A single response holds at most `rvlimit`
# revisions; when more are available the API adds a top-level "continue" object
# whose values have to be sent back with the next request.
res = []
request_params = dict(PARAMS)
while True:
    R = S.get(url=URL, params=request_params, timeout=30)
    R.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error body
    DATA = R.json()

    # With formatversion=2, "pages" is a list of page objects. Read "revisions" by
    # name instead of assuming it is the last key of the last page object.
    for page in DATA.get('query', {}).get('pages', []):
        res.extend(page.get('revisions', []))

    if 'continue' not in DATA:
        break
    request_params = dict(PARAMS, **DATA['continue'])

if not res:
    raise ValueError("No revisions were returned for 'List of Wikipedias/Table'")

rev_id_list = [revision['revid'] for revision in res]
rev_df = pd.DataFrame(res)

#alternate solution if you need to bypass the above:
#results = pd.DataFrame(DATA['query']['pages'])
#tmp = results.explode('revisions')
#rev_df = pd.DataFrame(list(tmp.pop('revisions')))

In [149]:
# Convert the 'timestamp' column from strings to pandas datetimes.
rev_df = rev_df.assign(timestamp=pd.to_datetime(rev_df['timestamp']))

In [154]:
# Work on an independent copy so rev_df itself stays untouched by the steps below.
df = rev_df.copy(deep=True)

In [156]:
# Keep only one revision id per calendar month: the earliest one.
# Calendar parts come from the vectorised .dt accessor (the column is already datetime).
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day

# Sort by the full timestamp so that "first" really is the earliest revision of
# each month, then turn the (year, month) group keys back into ordinary columns.
# The result of reset_index() must be assigned; calling it alone changes nothing.
select_revs = (
    df.sort_values('timestamp')
      .groupby(['year', 'month'])
      .first()
      .reset_index()
)

In [218]:
# Write both revision tables to disk as comma-separated UTF-8 text, without the index.
csv_options = dict(sep=',', encoding='utf-8', index=False)
select_revs.to_csv('rev_ids_2019_2020_select.csv', **csv_options)  # only the first rev id in each month
# NOTE(review): this file name has no '.csv' extension, unlike the one above - confirm that is intended.
df.to_csv('rev_ids_2019_2020_all', **csv_options)  # every revision, with the year/month/day columns

In [209]:
# Plain Python list of the selected revision ids, kept in case it is useful later.
rev_ids_to_query = select_revs.loc[:, 'revid'].to_list()

In [219]:
# Prepend the index.php URL prefix to every selected revision id, so the 'revid'
# column of df2 holds full URLs (as strings) instead of bare ids.
prefix = 'https://meta.wikimedia.org/w/index.php?title=List_of_Wikipedias/Table&direction=prev&oldid='
df2 = select_revs.assign(revid=prefix + select_revs['revid'].astype(str))

In [223]:
# Create a list out of the URLs built above.
urls_to_query = df2['revid'].to_list()

## Article Count per wiki

In [None]:
# Scrape article counts by language.
# For every revision URL, pull the date of the revision, the language name and the
# article count shown for that language on that revision of the table.
language_list = ['Urdu', 'Hindi','Kannada', 'Malayalam', 'Marathi', 'Odia', 'Punjabi', 'Western Punjabi', 'Santali', 'Sanskrit', 'Tamil', 'Tulu', 'Telugu', 'Assamese', 'Bangla', 'Gujarati']

DATE_XPATH = "//*[@id='mw-revision-date']"
PAGE_LOAD_TIMEOUT_SECONDS = 10  # upper bound on the wait for a revision page to render


def getRows():
    """Append one record per entry of ``language_list`` to the global ``t_ac``.

    Reads the page currently loaded in the global ``driver``. A record is a dict
    with the keys ``date`` (text of the ``mw-revision-date`` element), ``lang``
    (text of the table cell that contains the language link) and ``count`` (text
    of the link in the third following sibling cell). When nothing matches an
    XPath, the corresponding value is an empty string.

    Scraping is best effort: if the revision date cannot be read, one empty dict
    per language is appended; if a single language fails, its record is left
    partially filled. A warning is printed in both cases, so ``t_ac`` always
    gains exactly ``len(language_list)`` records.
    """
    # The revision date is the same for every language on the page: read it once.
    try:
        revision_date = driver.find_element(By.XPATH, DATE_XPATH).text
    except Exception as exc:
        print("WARNING: could not read the revision date: {!r}".format(exc))
        t_ac.extend({} for _ in language_list)
        return

    for name in language_list:
        wiki_xpath = "//a[text()='" + str(name) + "']/parent::td"
        ac_xpath = wiki_xpath + "//following-sibling::td[3]/a"

        ac_dict = {}
        try:
            wiki_cells = driver.find_elements(By.XPATH, wiki_xpath)
            count_links = driver.find_elements(By.XPATH, ac_xpath)

            ac_dict['date'] = revision_date
            ac_dict['lang'] = "".join([element.text for element in wiki_cells])
            ac_dict['count'] = "".join([element.text for element in count_links])
        except Exception as exc:
            # Keep going with the remaining languages, but do not hide the failure.
            print("WARNING: could not scrape '{}': {!r}".format(name, exc))

        t_ac.append(ac_dict)


# Hand the ChromeOptions configured at the top of the notebook to the driver;
# they were previously built but never applied.
driver = webdriver.Chrome(options=options)

ac = []

try:
    for page_url in urls_to_query:
        print(page_url)
        driver.get(page_url)

        # Wait until the revision banner is present rather than sleeping for a fixed time.
        try:
            WebDriverWait(driver, PAGE_LOAD_TIMEOUT_SECONDS).until(
                EC.presence_of_element_located((By.XPATH, DATE_XPATH))
            )
        except TimeoutException:
            print("WARNING: timed out waiting for the revision date on " + page_url)

        t_ac = []
        getRows()
        ac.append(t_ac.copy())
finally:
    # Always close the browser, even when a page fails, so no Chrome process is left behind.
    driver.quit()

In [268]:
# Flatten the scrape output into one table with a row per (revision page, language).
# `ac` holds one list per scraped URL, and each of those lists holds one dict per language.
raw_results = pd.DataFrame(ac).T  # one column per scraped page, one row per language position

# Walk the pages in scrape order and, within each page, the languages in list order.
# This gives the same row order as stacking the columns of raw_results one after
# another, without relying on the deprecated DataFrame.groupby(axis=1).
records = [row for page_rows in ac for row in page_rows]

lang_count_results = pd.DataFrame({0: records})  # single column holding the raw dicts
wiki_art_count_results = pd.DataFrame(records)   # dict keys expanded into columns

In [274]:
#wiki_art_count_results.to_csv('../../data/processed/query_results/regional_counts/wiki_counts_India_GLOW_2016_2019.csv') 
# Save the article-count table as comma-separated UTF-8 text, without the index.
output_csv = '../../data/processed/query_results/regional_counts/wiki_counts_India_GLOW_2019_2020.csv'
wiki_art_count_results.to_csv(output_csv, sep=',', encoding='utf-8', index=False)