In [1]:
#!pip install html5lib bs4 requests lingua-language-detector google-api-python-client google-auth-httplib2 google-auth-oauthlib google-analytics-data selenium --upgrade

#Import libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from lingua import Language, LanguageDetectorBuilder
from datetime import date
import calendar
import os.path
import os
import io
import requests
import concurrent.futures
from googleapiclient.http import MediaIoBaseDownload
from google.auth.transport.requests import Request                                             
from google.oauth2.credentials import Credentials                                              
from google_auth_oauthlib.flow import InstalledAppFlow                                       
from googleapiclient.discovery import build                                                  
from googleapiclient.errors import HttpError                                                   
from google.oauth2 import service_account
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from difflib import SequenceMatcher

#Language detector restricted to English and Indonesian; the webscraping cell uses it to check which language version of a page was returned
detector = (
    LanguageDetectorBuilder
    .from_languages(Language.ENGLISH, Language.INDONESIAN)
    .build()
)

#Pandas dataframe display settings: no cap on displayed rows, and up to 1000 characters of text per column
pd.set_option('max_colwidth', 1000)
pd.set_option('display.max_rows', None)

In [2]:
#FUNCTIONS

#Chrome configuration for the Selenium driver; '--headless' runs the browser without opening a window
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')

#NOTE(review): this list is not referenced by any cell visible in this notebook - confirm whether it is still needed
index_missing_titles_neliti = [21, 63, 95, 116, 118, 119, 137, 164]

def selscraper(url, waittype, waittag, timeout=10, driver=None):
    """Load a page with Selenium and return its rendered HTML as a BeautifulSoup object.

    Args:
        url: Address of the page to load.
        waittype: Selenium locator strategy (e.g. ``By.CSS_SELECTOR``) for the element to wait on.
        waittag: Locator value (selector / class name) of the element to wait on.
        timeout: Maximum number of seconds to wait for the element to be present.
        driver: Selenium webdriver to use. Defaults to the notebook-level
            ``chrome_driver``, which must already have been created.

    Returns:
        BeautifulSoup parse (``html.parser``) of the page source once the awaited
        element is present in the DOM.

    Raises:
        selenium.common.exceptions.TimeoutException: if the element does not
            appear within ``timeout`` seconds.
    """
    #Fall back to the shared notebook driver so existing three-argument calls keep working
    if driver is None:
        driver = chrome_driver

    driver.get(url)

    #Block until the dynamically loaded element exists, otherwise page_source may not contain the content yet
    WebDriverWait(driver, timeout).until(EC.presence_of_element_located((waittype, waittag)))

    return BeautifulSoup(driver.page_source, 'html.parser')

def get_most_similar_string(input_str, string_list):
    """Return the entry of ``string_list`` that is most similar to ``input_str``.

    Similarity is ``difflib.SequenceMatcher(None, input_str, candidate).ratio()``.
    When several candidates share the highest score, the earliest one wins.

    Args:
        input_str: String to match.
        string_list: Iterable of candidate strings.

    Returns:
        The best-matching candidate, or ``None`` when ``string_list`` is empty
        (previously an empty list raised ``ValueError`` from ``np.argmax``).
    """
    return max(
        string_list,
        key=lambda candidate: SequenceMatcher(None, input_str, candidate).ratio(),
        default=None,
    )

def fetch_title(url, max_attempts=3, timeout=30):
    """Download a publication page and return the text of its ``publication-title`` element.

    Args:
        url: Address of the page to fetch.
        max_attempts: Maximum number of requests to make before giving up.
            The previous implementation retried forever, so a permanently
            failing URL (e.g. a 404) or a dead host never returned.
        timeout: Seconds to wait for the server on each attempt; without it a
            stalled connection can hang a worker thread indefinitely.

    Returns:
        The title with newlines removed and surrounding whitespace stripped, or
        ``None`` when the page has no element with id ``publication-title`` or
        the page could not be fetched.
    """
    for _ in range(max_attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.HTTPError as error:
            status = error.response.status_code if error.response is not None else None
            #Client errors other than timeout/rate-limit will not fix themselves, so retrying is pointless
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                return None
            continue
        except requests.exceptions.RequestException:
            #Connection problems and timeouts may be transient: try again until attempts run out
            continue

        #Explicit parser keeps results independent of which optional parsers are installed
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find(id='publication-title')
        if title is None:
            return None
        return title.get_text("").replace('\n','').strip()

    return None

In [3]:
#WEBSCRAPING PUBLICATION TITLES FROM CIPS AND NELITI
#Selenium is needed because Wix uses javascript to dynamically load the page
#The 'wait' function on Selenium allows the entire page to load first before pulling the html

#Upper bound on re-scrapes when a page comes back in the wrong language, so a persistent mismatch raises instead of looping forever
MAX_SCRAPE_ATTEMPTS = 5

def _detected_language(text):
    """Return the detector's verdict for `text` as a capitalised name, e.g. 'English' or 'Indonesian'."""
    return str(detector.detect_language_of(str(text))).replace('Language.','').title()

def _scrape_cips_titles(url, banner, unwanted_language, probe_index):
    """Scrape the 'h4' texts of a CIPS publications page, re-scraping while the probe title is in the wrong language.

    Args:
        url: Page to scrape.
        banner: Text of the page's own heading, which is excluded from the titles.
        unwanted_language: Language name ('English' / 'Indonesian') that signals the wrong page version was served.
        probe_index: Index of the title whose language is checked.

    Returns:
        Tuple of (soup, titles) from the first acceptable scrape.

    Raises:
        RuntimeError: if no acceptable, non-empty title list is obtained within MAX_SCRAPE_ATTEMPTS.
    """
    for _ in range(MAX_SCRAPE_ATTEMPTS):
        soup = selscraper(url,By.CSS_SELECTOR,'h4')
        titles = [x.get_text(" ").strip() for x in soup.find_all('h4') if x.get_text(" ") != banner]
        #An empty list is treated like a bad scrape rather than crashing on the index lookup
        if titles and _detected_language(titles[probe_index]) != unwanted_language:
            return soup, titles
    raise RuntimeError(f"Could not scrape titles in the expected language from {url} after {MAX_SCRAPE_ATTEMPTS} attempts")

#Initialise the chrome driver using the options above
chrome_driver = webdriver.Chrome(options=chrome_options)

#try/finally guarantees the headless browser is shut down even when a scrape fails part-way
try:
    #CIPS WEBSITE
    #Extracts the text from all located 'h4' in the html excluding the web page's title; the first title must not be Indonesian
    eng_titles_soup, english_titles = _scrape_cips_titles("https://www.cips-indonesia.org/publications",'CIPS has more than 90 publications','Indonesian',0)

    #Repeats the same with the Indonesian titles; here the last title must not be English
    ind_titles_soup, indonesian_titles = _scrape_cips_titles("https://www.cips-indonesia.org/publications?lang=id",'CIPS memiliki lebih dari 90 publikasi','English',-1)

    #Combines both language title lists into one
    all_titles = english_titles + indonesian_titles

    #Rewrites two website titles whose wording/whitespace differs from the form used downstream
    for index,x in enumerate(all_titles):
        if x == 'Perjanjian Regional Comprehensive Economic Partnership (RCEP):  Peluangnya bagi Indonesia dan Langkah Pemanfaatannya  Sebuah Perspektif Internal':
            all_titles[index] = 'Perjanjian Regional Comprehensive Economic Partnership (RCEP): Peluangnya bagi Indonesia dan Langkah Pemanfaatannya\u2028 Sebuah Perspektif Internal'
        elif x == 'Menuju Sistem Agropangan yang Lebih Berkelanjutan di Indonesia':
            all_titles[index] = 'Menuju Sistem Pertanian Pangan yang Lebih Berkelanjutan di Indonesia'

    #NELITI
    #The first request is only used to read the number of result pages from the pagination element
    neliti_soup = selscraper("https://repository.cips-indonesia.org/browse/all",By.CLASS_NAME,'sr-title')
    no_pages = neliti_soup.find_all(class_='pages')[-1].find('a').get_text()

    #Collects the titles of every results page into one flat list
    nel_titles = []
    for i in range(1,int(no_pages)+1):
        soup = selscraper("https://repository.cips-indonesia.org/browse/all?page="+str(i),By.CLASS_NAME,'sr-title')
        page_titles = [x.get_text().replace('\n','').strip() for x in soup.find_all(class_='sr-title')]
        nel_titles.extend(page_titles)
finally:
    chrome_driver.quit()

In [4]:
#PULLING CURRENT GOOGLE SHEETS DASHBOARD AND NELITI FILES

#Define function for getting credentials
def credentials():
    """Return valid Google user credentials for the Sheets and Drive APIs.

    Reuses ``API_Files/token.json`` when it holds valid credentials, refreshes
    them when they are expired and a refresh token is available, and otherwise
    runs the interactive login flow from ``API_Files/credentials.json``.
    Whenever credentials are refreshed or newly obtained they are written back
    to ``API_Files/token.json``.

    If the refresh request is rejected (``RefreshError``, e.g. a revoked or
    expired refresh token) the function falls back to the interactive login
    instead of raising, so the token file no longer has to be deleted by hand.

    Returns:
        A ``google.oauth2.credentials.Credentials`` object.
    """
    from google.auth.exceptions import RefreshError

    #Define scope for relevant permissions and features
    scopes = ["https://www.googleapis.com/auth/spreadsheets","https://www.googleapis.com/auth/drive"]
    token_path = "API_Files/token.json"

    #Credentials necessary for using the API and accessing the appropriate sheets; loaded from the saved token when one exists
    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path,scopes)

    #Saved credentials are still good: nothing to refresh or rewrite
    if creds and creds.valid:
        return creds

    #Expired credentials with a refresh token: request a refresh, falling back to a new login if Google rejects it
    if creds and creds.expired and creds.refresh_token:
        try:
            creds.refresh(Request())
        except RefreshError:
            creds = None

    #No usable credentials: runs new flow using the local json file and opens new window with request to login to Google account
    if not creds or not creds.valid:
        flow = InstalledAppFlow.from_client_secrets_file("API_Files/credentials.json",scopes)
        creds = flow.run_local_server(port=0)

    #Writes a new json file with the refreshed / new credentials
    with open(token_path,"w") as token:
        token.write(creds.to_json())
    return creds

#Creates credentials
credentials = credentials()

#Setup the Google Sheets service using the generated credentials and specifies the ranges to get
service = build("sheets","v4",credentials=credentials)
ranges = ["Website Downloads","Neliti Downloads","Neliti Views"]

#Make a batch request for the relevant sheets on the dashboard
batch_response = service.spreadsheets().values().batchGet(spreadsheetId="1ACDkuDGNctxpdTk47watDaq2OystvRUOLheSotMwRts",ranges=ranges).execute()

def _sheet_to_dataframe(rows):
    """Build a dataframe from sheet rows: first row is the header, first three columns stay as pulled, the rest become int64."""
    frame = pd.DataFrame(rows[1:],columns=rows[0])
    return pd.concat([frame.iloc[:,:3],frame.iloc[:,3:].astype('int64')],axis=1)

#Converts each sheet response (same order as `ranges`) into a dataframe with the appropriate data types
value_ranges = batch_response.get("valueRanges")
web_dl = _sheet_to_dataframe(value_ranges[0].get("values",[]))
nel_dl = _sheet_to_dataframe(value_ranges[1].get("values",[]))
nel_view = _sheet_to_dataframe(value_ranges[2].get("values",[]))

#Setup the Google Drive service
service = build("drive","v3",credentials=credentials)

#Returns a dataframe of all the files in the Neliti folder
#The listing is paginated: keep requesting pages until no nextPageToken is returned, otherwise only the first page of files is seen
drive_files = []
page_token = None
while True:
    listing = service.files().list(q="'1lxX689eHFjmb6AJ-eRX831c2ypzYXkyO' in parents",fields="nextPageToken, files(id, name)",pageToken=page_token).execute()
    drive_files.extend(listing.get("files",[]))
    page_token = listing.get("nextPageToken")
    if not page_token:
        break

#Explicit columns keep the 'id'/'name' lookups below valid even when the folder is empty
neliti_files = pd.DataFrame(drive_files,columns=["id","name"])

if len(nel_dl.columns.tolist()) != len(nel_view.columns.tolist()):
    print("Unify columns for Neliti views and Neliti downloads")

else:
    #A Drive file is "missing" when its name (minus the extension) is not yet a month column on the dashboard
    dashboard_months = set(nel_dl.columns.tolist()[3:])
    neliti_missing_months = [x for x in neliti_files['name'].tolist() if x.replace('.xlsx','') not in dashboard_months]
    neliti_missing_ids = [neliti_files.loc[neliti_files['name'] == x,'id'].values[0] for x in neliti_missing_months]
    neliti_missing_filenames = list(neliti_missing_months)

    if len(neliti_missing_ids) != 0:
        #Make sure the download folder exists before writing into it
        os.makedirs('Data/Neliti_Files',exist_ok=True)

        for file_id,name in zip(neliti_missing_ids,neliti_missing_filenames):
            request = service.files().get_media(fileId=file_id)

            #Download into memory first so a failed transfer never leaves a partial workbook on disk
            with io.BytesIO() as byter:
                downloader = MediaIoBaseDownload(fd=byter,request=request)
                done = False
                while not done:
                    status, done = downloader.next_chunk()
                    print('Download progress {0}'.format(status.progress() * 100))
                with open('Data/Neliti_Files/'+name,'wb') as f:
                    f.write(byter.getvalue())

In [8]:
#CHECKING WHETHER TITLES PULLED FROM WEBSITE ARE FOUND IN GOOGLE ANALYTICS FILE DOWNLOAD HISTORY
#This is to check the formatting of the webscraped titles to exactly match the titles extracted from Google Analytics
#'Modernizing Indonesia's Agriculture', 'Memodernisasi Pertanian Indonesia', and 'Kerahasiaan Data dalam Peraturan Perundang-Undangan Perlindungan Data Pribadi di Indonesia'
#These three titles appear because they truly do not have any downloads since 2018
#IF OTHER TITLES APPEAR, CHECK TITLE FORMAT

#Activates credentials from the Google Cloud service account
credentials = service_account.Credentials.from_service_account_file('API_Files/cips-publication-dashboard-da13a6d6f0fd.json')

#Initialise the Google Analytics client using the credentials
client = BetaAnalyticsDataClient(credentials=credentials)

#Runs the report to pull the number of file downloads for each title in the time range 2018-01-01 to today
#Only 'unifiedScreenClass' is requested as a dimension: the previous dict literal repeated the "name" key, so its
#"eventName" entry was silently discarded by Python and never sent. 'eventName' is still used by the filter below.
report = client.run_report({"property":"properties/353077506","date_ranges":[{"start_date":"2018-01-01","end_date":"today"}],
                            "dimensions":[{"name":"unifiedScreenClass"}],"metrics":[{"name":"eventCount"}],
                            "dimension_filter":{"filter":{"field_name":"eventName","string_filter":{"value":"file_download"}}}})

#One list per report row: the dimension value (title) followed by the metric value (downloads)
data = [[value.value for value in row.dimension_values] + [value.value for value in row.metric_values] for row in report.rows]

#Converts the list of list to a dataframe containing the Google Analytics data
ganalytics_data = pd.DataFrame(data,columns=['Page Title','Downloads'])

#Collects all the titles in the webscraped all_titles which do not appear in the Google Analytics data
#(kept as real titles rather than the list of None that a print() comprehension produces)
titles_with_downloads = set(ganalytics_data['Page Title'])
missing_titles = [x for x in all_titles if x not in titles_with_downloads]

print("Titles on Website but not found on Google Analytics:")
for title in missing_titles:
    print(title)

Titles on Website but not found on Google Analytics:
Modernizing Indonesia’s Agriculture
Memodernisasi Pertanian Indonesia
Kerahasiaan Data dalam Peraturan Perundang-Undangan Perlindungan Data Pribadi di Indonesia


In [73]:
#ADDING NEW ENTRIES AND MONTHS TO THE DASHBOARD'S WEBSITE DOWNLOADS AND COMPLETING THE NEW MONTHS WITH GOOGLE ANALYTICS DATA

def _prompt_choice(options):
    """Print `options` as a numbered menu and keep asking until one of the listed numbers is entered; return the chosen option."""
    for index,option in enumerate(options):
        print(f'{index+1}. {option}')
    valid_answers = [str(number) for number in range(1,len(options)+1)]
    answer = str(input('Please select a number from the list'))
    while answer not in valid_answers:
        answer = str(input('Please select a number from the list'))
    return options[int(answer)-1]

#Creates a list of all months from January 2019 to the month before the current month in the same format as found in web_dl
#Built from monthly periods ending at the previous month, so the current (incomplete) month is never included,
#even when the notebook is run on the last day of a month
last_complete_month = pd.Timestamp(date.today()).to_period('M') - 1
start_to_today = pd.period_range('2019-01',last_complete_month,freq='M').strftime('%Y-%m').tolist()

#Returns a list of all month columns in web_dl
dates_in_web_dl = [x for x in web_dl.columns.tolist() if x not in ('Title','Type','Topic')]

#Creates a list of missing months, if any
missing_months = [x for x in start_to_today if x not in dates_in_web_dl]

#Adds new columns at the end of the click data for any missing months with a value of 0
for x in missing_months:
    web_dl[x] = 0

#Creates a list of titles found on the webscraped titles list but not in web_dl
titles_on_dashboard = set(web_dl['Title'])
titles_not_in_web_dl = [x for x in all_titles if x not in titles_on_dashboard]

possible_types = ['Discussion Paper','Policy Brief','Book','Policy Paper']
possible_topics = ['Digital & Financial Literacy','Economic Opportunities','Community Livelihoods','Education','Food Security & Agriculture','Trade & Investment']

#Loops through the missing titles, asking for each one's type and topic, and appends a row of zero downloads
for title in titles_not_in_web_dl:
    print(f"Title: '{title}'")
    input_type = _prompt_choice(possible_types)
    input_topic = _prompt_choice(possible_topics)

    month_column_count = len([x for x in web_dl.columns.tolist() if x not in ('Title','Type','Topic')])
    web_dl.loc[len(web_dl)] = [title,input_type,input_topic] + [0] * month_column_count


#Converts the data columns to integers (appending rows above can change their dtype)
web_dl = pd.concat([web_dl.iloc[:,:3],web_dl.iloc[:,3:].astype('int64')],axis=1)

#Loops through the missing months, each formatted 'YYYY-MM'
for month in missing_months:

    #Returns the integer values of the year and month from the string
    current_year, current_month = map(int,month.split('-'))

    #Uses the calendar library to return the last day of the month
    last_day = str(calendar.monthrange(current_year,current_month)[1])

    #Runs Google Analytics file download report for the relevant month
    #Only 'unifiedScreenClass' is requested as a dimension: the previous dict literal repeated the "name" key, so its
    #"eventName" entry was silently discarded by Python and never sent
    report = client.run_report({"property":"properties/353077506","date_ranges":[{"start_date":month+"-01","end_date":month+"-"+last_day}],
                                "dimensions":[{"name":"unifiedScreenClass"}],"metrics":[{"name":"eventCount"}],
                                "dimension_filter":{"filter":{"field_name":"eventName","string_filter":{"value":"file_download"}}}})
    data = [[value.value for value in row.dimension_values] + [value.value for value in row.metric_values] for row in report.rows]
    ganalytics_data = pd.DataFrame(data,columns=['Page Title','Downloads'])
    ganalytics_data['Downloads'] = ganalytics_data['Downloads'].astype('int64')

    #Looks up every dashboard title in the report in one pass: titles found get their downloads (first row wins
    #if a title is repeated), titles not found get 0
    downloads_by_title = ganalytics_data.drop_duplicates('Page Title').set_index('Page Title')['Downloads']
    web_dl[month] = web_dl['Title'].map(downloads_by_title).fillna(0).astype('int64')

In [74]:
def _prompt_choice(options):
    """Print `options` as a numbered menu and keep asking until one of the listed numbers is entered; return the chosen option."""
    for index,option in enumerate(options):
        print(f'{index+1}. {option}')
    valid_answers = [str(number) for number in range(1,len(options)+1)]
    answer = str(input('Please select a number from the list'))
    while answer not in valid_answers:
        answer = str(input('Please select a number from the list'))
    return options[int(answer)-1]

def _first_hits_by_title(frame):
    """Map each 'Dashboard Titles' value in `frame` to the 'Hits' of the first row carrying that title."""
    hits = {}
    for dash_title,hit_count in zip(frame['Dashboard Titles'],frame['Hits']):
        hits.setdefault(dash_title,hit_count)
    return hits

#Titles scraped from Neliti that do not yet have a row on the Neliti dashboard sheets
titles_on_neliti_dash = set(nel_dl['Title'])
titles_not_in_neliti_dash = [x for x in nel_titles if x not in titles_on_neliti_dash]

possible_types = ['Discussion Paper','Policy Brief','Book','Policy Paper']
possible_topics = ['Digital & Financial Literacy','Economic Opportunities','Community Livelihoods','Education','Food Security & Agriculture','Trade & Investment']

for title in titles_not_in_neliti_dash:
    print(title)

    #Normalise the answer once so 'Y'/'N' behave the same as 'y'/'n'; previously an upper-case answer passed
    #validation but matched neither branch, so the title was silently skipped
    cont = input('Add to dashboard? Y/N').strip().lower()
    while cont not in ('y','n'):
        cont = input('Press Y for yes or N for no').strip().lower()

    if cont == 'n':
        continue

    input_type = _prompt_choice(possible_types)
    input_topic = _prompt_choice(possible_topics)

    #Appends the new title to both Neliti sheets with 0 for every existing month column
    nel_dl.loc[len(nel_dl)] = [title,input_type,input_topic] + [0] * len([x for x in nel_dl.columns.tolist() if x not in ('Title','Type','Topic')])
    nel_view.loc[len(nel_view)] = [title,input_type,input_topic] + [0] * len([x for x in nel_view.columns.tolist() if x not in ('Title','Type','Topic')])

#Only real .xlsx workbooks are processed (skips Excel '~$' lock files and other stray files), in sorted order so the
#new month columns are added deterministically; a missing folder simply means there is nothing to process
neliti_dir = "Data/Neliti_Files"
neliti_month_files = sorted(f for f in os.listdir(neliti_dir) if f.endswith('.xlsx') and not f.startswith('~$')) if os.path.isdir(neliti_dir) else []

for file in neliti_month_files:
    month = pd.read_excel("Data/Neliti_Files/"+file,sheet_name=0,names=['Pages','Hits'])

    #Turns the page label into a URL and classifies each row: pdf links are downloads, everything else is a page view
    month['Pages'] = month['Pages'].str.replace('center-for-indonesian-policy-studies - ','https://')
    month['Type'] = month['Pages'].apply(lambda x: 'Download' if '.pdf' in x else 'View')
    month = month.reindex(columns=['Pages','Type','Hits'])

    #Views: the title is fetched from each page (in parallel); None when the page has no title
    month_view = month.loc[month['Type'] == 'View'].copy()
    urls = month_view['Pages'].tolist()
    with concurrent.futures.ThreadPoolExecutor(max_workers=7) as executor:
        titles = list(executor.map(fetch_title,urls))
    month_view['Month Titles'] = titles

    #Downloads: the title is rebuilt from the pdf file name in the URL
    month_dl = month.loc[month['Type'] == 'Download'].copy()
    month_dl['Month Titles'] = month_dl['Pages'].apply(lambda x: x.split('publications/')[1].split('-',1)[1].replace('.pdf','').replace('-',' '))

    month = pd.concat([month_view,month_dl],axis=0,ignore_index=True)

    #Maps every month title to the closest dashboard title
    #NOTE(review): the closest title is used however weak the match is, so pages of publications that are not on the
    #dashboard are still attributed to some dashboard title - confirm this is acceptable
    dashboard_titles = nel_dl['Title'].tolist()
    dash_titles = [None if x is None else get_most_similar_string(x,dashboard_titles) for x in month['Month Titles'].tolist()]
    month['Dashboard Titles'] = dash_titles

    #New month column: hits of the first matching row for each dashboard title, 0 when the title has no row this month
    month_column = file.replace('.xlsx','')

    month_dl = month.loc[month['Type'] == 'Download']
    download_hits = _first_hits_by_title(month_dl)
    nel_dl[month_column] = [download_hits.get(x,0) for x in nel_dl['Title'].tolist()]

    month_view = month.loc[month['Type'] == 'View']
    view_hits = _first_hits_by_title(month_view)
    nel_view[month_column] = [view_hits.get(x,0) for x in nel_view['Title'].tolist()]

Indonesian Food Trade Policy during COVID-19


Add to dashboard? Y/N n


Effects of High Food Prices on Non-Cash Food Subsidies (BPNT) in Indonesia - Case Study in East Nusa Tenggara


Add to dashboard? Y/N n


Political Economy of Rice Policy in Indonesia: A Perspective on the ASEAN Economic Opportunity


Add to dashboard? Y/N n


In [75]:
#Deletes every file in the local Neliti download folder
for file in os.listdir("Data/Neliti_Files"):
    os.remove(f"Data/Neliti_Files/{file}")

#Define function for getting credentials
#It has to be defined again here because earlier cells rebind the name `credentials` to a credentials object
def credentials():
    """Return valid Google user credentials for the Sheets and Drive APIs.

    Reuses ``API_Files/token.json`` when it holds valid credentials, refreshes
    them when they are expired and a refresh token is available, and otherwise
    runs the interactive login flow from ``API_Files/credentials.json``.
    Whenever credentials are refreshed or newly obtained they are written back
    to ``API_Files/token.json``.

    If the refresh request is rejected (``RefreshError``, e.g. a revoked or
    expired refresh token) the function falls back to the interactive login
    instead of raising, so the token file no longer has to be deleted by hand.

    Returns:
        A ``google.oauth2.credentials.Credentials`` object.
    """
    from google.auth.exceptions import RefreshError

    #Define scope for relevant permissions and features
    scopes = ["https://www.googleapis.com/auth/spreadsheets","https://www.googleapis.com/auth/drive"]
    token_path = "API_Files/token.json"

    #Credentials necessary for using the API and accessing the appropriate sheets; loaded from the saved token when one exists
    creds = None
    if os.path.exists(token_path):
        creds = Credentials.from_authorized_user_file(token_path,scopes)

    #Saved credentials are still good: nothing to refresh or rewrite
    if creds and creds.valid:
        return creds

    #Expired credentials with a refresh token: request a refresh, falling back to a new login if Google rejects it
    if creds and creds.expired and creds.refresh_token:
        try:
            creds.refresh(Request())
        except RefreshError:
            creds = None

    #No usable credentials: runs new flow using the local json file and opens new window with request to login to Google account
    if not creds or not creds.valid:
        flow = InstalledAppFlow.from_client_secrets_file("API_Files/credentials.json",scopes)
        creds = flow.run_local_server(port=0)

    #Writes a new json file with the refreshed / new credentials
    with open(token_path,"w") as token:
        token.write(creds.to_json())
    return creds

#Creates credentials
credentials = credentials()

#Try block for pushing the updated dataframes back to the dashboard
try:
    #Setup the Google Sheets service using the generated credentials
    service = build("sheets","v4",credentials=credentials)

    #Each dataframe overwrites its own sheet, in this order; the first failure stops the remaining uploads
    for sheet_range,frame in (('Website Downloads',web_dl),('Neliti Downloads',nel_dl),('Neliti Views',nel_view)):
        #Header row followed by all data rows as a list of lists
        body = {'values':[frame.columns.tolist()] + frame.values.tolist()}
        request = service.spreadsheets().values().update(spreadsheetId="1ACDkuDGNctxpdTk47watDaq2OystvRUOLheSotMwRts",range=sheet_range,valueInputOption="USER_ENTERED",body=body).execute()

#Prints the http error if an upload fails
except HttpError as error:
    print(error)