# Requirements:

- https://selenium-python.readthedocs.io/locating-elements.html
- https://sites.google.com/a/chromium.org/chromedriver/downloads


## TODOS

- Title search AND abstract search

- Fix ScienceDirect


In [None]:
from urllib.request import urlopen, Request

import pandas as pd
import numpy as np

from html.parser import HTMLParser
import tqdm
import math


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException

from pybtex.database.input import bibtex
import pybtex.errors
pybtex.errors.set_strict_mode(False)



import itertools
from itertools import permutations 

import os
import time
import config

import bibtexparser

In [None]:
# University account credentials, read from the local config module.
# Originally labelled "needed for IEEE"; in this file they are read by
# loginScienceDirect.
UNI_MAIL, UNI_PWD, UNI_USER = config.UNI_MAIL, config.UNI_PWD, config.UNI_USER

# ScienceDirect account credentials, read from the local config module.
ScienceDirect_MAIL, ScienceDirect_PWD = config.ScienceDirect_MAIL, config.ScienceDirect_PWD

IEEE_bib_files = list()

# Screenshot verbosity:
#   0: no screenshots
#   1: one screenshot for each query (recommended)
#   2: screenshots of the individual steps, to find out why the crawler might not work
DEBUG = 2

# Upper bound on the number of result pages crawled per query, per library.
ieee_maxpage = math.inf
acm_maxpage, sd_maxpage = 39, 19

# Module-level collectors; `urls` gathers (file, url) pairs of failed saves.
GLOBAL_ERROR_LIST, urls = [], []

# Settings for crawling


In [None]:
# Where the search terms have to match:
#   "tit":    title only
#   "titAbs": title and abstract
#   "text":   full text / quick search (not fully tested)
titlesearch = "tit"

# Earliest and latest publication year that should be crawled.
year_min, year_max = 1900, 2022

LIBS = ["ScienceDirect", "ACM", "IEEE"]

# Search keyword combinations: every entry of `pres` is paired with every
# entry of `sufs`.
pres = ['selection', 'pointing']
sufs = ['virtual']

# `keywords` is a list of lists of strings; the strings of one inner list
# are connected with AND in the search query.
keywords = [[pre, suf] for pre in pres for suf in sufs]



# Setup for crawler

## function to crawl: crawl(keywords, LIBRARY, titlesearch)


In [None]:
# The download folders for each library are taken from the config module
# (downloadfolder_default / downloadfolder_acm / downloadfolder_ieee /
# downloadfolder_sd); change the paths there.
def setupCrawler(dl_folder):
    """Create and return a Chrome WebDriver configured for one library.

    Args:
        dl_folder: "acm", "ieee" or "sd" selects the matching download
            folder from ``config``; any other value keeps
            ``config.downloadfolder_default``. For "acm" and "ieee" the
            browser is additionally started headless with the GPU disabled.

    Returns:
        The started ``webdriver.Chrome`` instance (1920x1080 window), using
        the ``./chromedriver.exe`` binary.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('window-size=1920,1080')

    download_dir = config.downloadfolder_default
    if dl_folder in ("acm", "ieee"):
        download_dir = getattr(config, f"downloadfolder_{dl_folder}")
        for flag in ('headless', "disable-gpu"):
            chrome_options.add_argument(flag)
    elif dl_folder == "sd":
        download_dir = config.downloadfolder_sd

    chrome_options.add_experimental_option(
        "prefs", {"download.default_directory": download_dir}
    )
    driver = webdriver.Chrome(executable_path="./chromedriver.exe", options=chrome_options)
    print("Driver setup complete.")
    return driver
# Crawl one single library: crawl(keywords_list, library, searchWhere)
def crawl(keywords_list, library, searchWhere):
    """Crawl one library for every keyword group in ``keywords_list``.

    Args:
        keywords_list: List of keyword groups, each a list of strings.
        library: "ACM", "IEEE" or "ScienceDirect"; anything else only
            prints a "not yet supported" message.
        searchWhere: Search mode handed on unchanged to the library's
            save function ("tit", "titAbs" or "text").

    Spaces inside keywords are replaced before the URL is built: "+" for
    ACM, "%20" for IEEE and ScienceDirect.
    """
    def encode_spaces(replacement):
        # Return a copy of keywords_list with spaces swapped for `replacement`.
        return [[term.replace(" ", replacement) for term in group]
                for group in keywords_list]

    print(f"Start crawling {library}")
    # The save functions are looked up only inside the chosen branch, so
    # crawl() can be called before the other libraries' cells were run.
    if library == "ACM":
        saveACMBib(encode_spaces("+"), "acm", searchWhere)
    elif library == "IEEE":
        saveIEEEBib(encode_spaces("%20"), "ieee", searchWhere)
    elif library == "ScienceDirect":
        saveScienceDirectBib(encode_spaces("%20"), "sd", searchWhere)
    else:
        print(f"Library {library} not yet supported")
# Build the search URL for one keyword group: getURL(keywords, library, searchWhere)
def getURL(keywords, library, searchWhere, concatentation="AND"):
    """Return the search URL prefix for ``keywords`` in ``library``.

    The returned URL ends with the library's paging parameter
    (``startPage=`` for ACM, ``pageNumber=`` for IEEE, ``offset=`` for
    ScienceDirect) so callers can append the page number / offset.
    The year range is taken from the module-level ``year_min`` and
    ``year_max``.

    Args:
        keywords: Search terms of one query; inserted into the URL as given
            (no additional escaping is done here).
        library: "ACM", "IEEE" or "ScienceDirect".
        searchWhere: "tit" (title only), "titAbs" (title and abstract);
            any other value selects the library's default search.
        concatentation: Boolean operator placed between the terms.

    Returns:
        The URL prefix, or "" if ``library`` is not supported.
    """
    if library == "ACM":
        search = f"+{concatentation}+".join(f"%22{keyword}%22" for keyword in keywords)
        if searchWhere == "tit":
            print("Searching ACM for title only")
            query = f"doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1={search}"
        elif searchWhere == "titAbs":
            print("Searching ACM for title and abstract")
            query = f"doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1={search}&field2=Abstract&text2={search}"
        else:
            print("Quicksearching ACM")
            # Fix: the search expression was previously never appended, so
            # the quick search was sent with an empty AllField parameter.
            query = f"doSearch?AllField={search}"
        return f"https://dl.acm.org/action/{query}&pageSize=50&AfterYear={year_min}&BeforeYear={year_max}&startPage="

    if library == "IEEE":
        if searchWhere == "tit":
            message = "Searching IEEE for title only"
        elif searchWhere == "titAbs":
            message = "Searching IEEE for title and abstract"
        else:
            message = "Quicksearching IEEE"

        def ieee_term(keyword):
            # Query fragment for a single keyword in the selected mode.
            if searchWhere == "tit":
                return f'("Document%20Title":"{keyword}")'
            if searchWhere == "titAbs":
                return f'("Document%20Title":"{keyword}")+{concatentation}+("Abstract":"{keyword}")'
            return f"%22{keyword}%22"

        if keywords:
            print(message)
        search = f"+{concatentation}+".join(ieee_term(keyword) for keyword in keywords)
        return f"https://ieeexplore.ieee.org/search/searchresult.jsp?&queryText={search}&highlight=true&returnFacets=ALL&returnType=SEARCH&matchPubs=true&ranges={year_min}_{year_max}_Year&rowsPerPage=50&pageNumber="

    if library == "ScienceDirect":
        # NOTE(review): every mode other than "tit" uses the "tak=" field,
        # including "text" -- confirm that this is the intended field for
        # a full-text / quick search.
        field = "title=" if searchWhere == "tit" else "tak="
        search = f"%20{concatentation}%20".join(f"%22{keyword}%22" for keyword in keywords)
        URL = f"https://www.sciencedirect.com/search?date={year_min}-{year_max}&{field}{search}&show=50&offset="
        print(URL)
        return URL

    print(f"Library {library} not yet supported")
    return ""

# IEEE


In [None]:
def loadIEEEBib(toOpen, driver):
    """Open one IEEE result page, trigger its export and return the text.

    The page elements are addressed by their position among the descendants
    of a few container elements, so the indices below depend on the current
    page layout of IEEE Xplore.

    Args:
        toOpen: URL of the result page to export.
        driver: Selenium WebDriver used for the crawl.

    Returns:
        The text of the ``<body>`` of the tab opened by the export control
        (the caller parses it as BibTeX).

    Raises:
        RuntimeError: If activating the export control opened no new tab.
    """
    descendants = ".//*"

    driver.get(toOpen)
    time.sleep(25)
    # Click SELECT ALL
    driver.find_element(By.CLASS_NAME, "main-section").find_elements(By.XPATH, descendants)[5].click()
    time.sleep(5)

    # Find EXPORT: it sits at one of two known positions in the dashboard.
    dashboard_nodes = driver.find_element(By.CLASS_NAME, "ng-Dashboard").find_elements(By.XPATH, descendants)
    export = dashboard_nodes[10]
    if "Export" not in export.text:
        export = dashboard_nodes[20]
    export.click()
    time.sleep(5)
    # Click Citations
    driver.find_element(By.CLASS_NAME, "tooltip-inner").find_elements(By.XPATH, descendants)[4].click()

    time.sleep(5)

    if DEBUG > 1:
        driver.save_screenshot("preBibTex.png")
    # Second "download-format" option -- presumably BibTeX (TODO confirm).
    driver.find_elements(By.NAME, "download-format")[1].click()

    time.sleep(5)

    # Second "citations-format" option -- presumably citation and abstract
    # (TODO confirm).
    driver.find_elements(By.NAME, "citations-format")[1].click()
    if DEBUG > 1:
        driver.save_screenshot("postCitAbs.png")
    tooltip_inner = driver.find_element(By.CLASS_NAME, "tooltip-inner")
    export_locator = tooltip_inner.find_elements(By.XPATH, descendants)[-1]

    time.sleep(5)

    main_window = driver.current_window_handle
    handles_before = set(driver.window_handles)
    # Keyboard shortcut to open the link in a new tab.
    export_locator.send_keys(Keys.CONTROL + Keys.ENTER)
    time.sleep(10)
    if DEBUG > 1:
        driver.save_screenshot("ieee_dl.png")

    # Identify the new tab by comparing handles instead of assuming it is
    # at index 1 of window_handles.
    new_handles = [handle for handle in driver.window_handles
                   if handle not in handles_before]
    if not new_handles:
        raise RuntimeError(f"IEEE export did not open a new tab for {toOpen}")
    driver.switch_to.window(new_handles[0])
    try:
        time.sleep(10)  # wait until the new tab finishes loading
        bib = driver.find_element(By.XPATH, "/html/body").text
        time.sleep(2)
    finally:
        # Always close the export tab and return to the result page, even
        # when reading the export failed.
        driver.close()
        driver.switch_to.window(main_window)
    time.sleep(20)
    return bib

def _ieee_result_count(driver):
    """Read the number of hits from the IEEE result dashboard; 0 if unavailable."""
    try:
        nodes = driver.find_element_by_class_name("ng-Dashboard").find_elements_by_xpath(".//*")
        counter = nodes[31]
        if counter.text == "Set Search Alerts":
            # Alternative position of the counter in the dashboard.
            counter = nodes[41]
        # First whitespace-separated token, part after the last "-",
        # thousands separators removed.
        total = counter.text.split(" ")[0].split("-")[-1].replace(",", "")
        return int(total) if total else 0
    except (NoSuchElementException, IndexError, ValueError):
        # ValueError: the token was not a number (e.g. a "no results" text).
        return 0


def saveIEEEBib(keywords_list, dl_folder, titleOnly):
    """Crawl IEEE for every keyword group and save one .bib file per page.

    Files are written to ``./ieee/ieee_<name>_page<i>.bib``. Pages that could
    not be saved are appended to the module-level ``urls`` list and written
    to ``./ieee/ERRORS.txt`` at the end.

    Args:
        keywords_list: List of keyword groups (lists of strings).
        dl_folder: Download-folder key handed to ``setupCrawler``.
        titleOnly: Search mode ("tit", "titAbs" or anything else for the
            default search); despite its name it is not a boolean.
    """
    # Make sure the target folder exists before anything is written to it.
    os.makedirs("./ieee", exist_ok=True)
    driver = setupCrawler(dl_folder)
    try:
        for keywords in keywords_list:
            print(f"Search for: {keywords}")
            IEEE_URL = getURL(keywords, "IEEE", titleOnly)
            driver.get(IEEE_URL)

            print(IEEE_URL)
            time.sleep(7)

            name = "".join(str(word) for word in keywords)
            if titleOnly == "tit":
                name += "_Title"
            elif titleOnly == "titAbs":
                name += "_TitleAbstract"
            if DEBUG > 0:
                driver.save_screenshot(f"IEEE_{name}.png")

            results = _ieee_result_count(driver)
            # 50 results per page; ieee_maxpage may be math.inf, hence int().
            pages = int(min(math.ceil(results / 50), ieee_maxpage))

            # Loop through all pages and save the resulting bib files.
            for i in tqdm.tqdm(range(pages)):
                # IEEE page numbers are 1-based, the file names 0-based.
                toOpen = IEEE_URL + str(i + 1)

                bib = loadIEEEBib(toOpen, driver)
                bib_data = bibtexparser.loads(bib)
                target = f"./ieee/ieee_{name}_page{i}.bib"
                try:
                    # Explicit UTF-8: entries may contain characters the
                    # platform default encoding cannot represent.
                    with open(target, 'w', encoding="utf-8") as bibtex_file:
                        bibtexparser.dump(bib_data, bibtex_file)
                except Exception as err:
                    # Best effort: remember the page and continue crawling.
                    print(f"Something went wrong while saving: {name}_{i}")
                    print(f"  Reason: {err!r}")
                    urls.append((target, toOpen))

                time.sleep(3)
    finally:
        # Do not leave the browser process running, even after an error.
        driver.quit()

    with open("./ieee/ERRORS.txt", 'w', encoding="utf-8") as error_file:
        error_file.writelines(f"{path} | {url}\n" for path, url in urls)

In [None]:
# Run the IEEE crawl for all keyword groups, searching title and abstract.
# NOTE(review): the search mode is hard-coded here; the `titlesearch`
# setting is not used by this call.
crawl(keywords_list=keywords, library="IEEE", searchWhere="titAbs")

# ACM


In [None]:
def loadACMBib(toOpen, driver):
    """Open one ACM result page and click through its export controls.

    Nothing is returned; the caller expects a file ``acm/acm.bib`` to exist
    afterwards.

    Args:
        toOpen: URL of the result page.
        driver: Selenium WebDriver to use.
    """
    driver.get(toOpen)

    # Tick the checkbox of the result list.
    select_box = driver.find_element_by_class_name("item-results__checkbox")
    select_box.click()
    time.sleep(5)

    # First element inside the button bar that becomes visible afterwards
    # -- presumably the export button (TODO confirm).
    button_bar = driver.find_element_by_class_name("item-results__buttons.visible")
    bar_children = button_bar.find_elements_by_xpath(".//*")
    bar_children[0].click()
    time.sleep(20)

    # Second element inside the inline list -- presumably the download
    # link (TODO confirm).
    inline_list = driver.find_element_by_class_name("rlist--inline.separator")
    list_children = inline_list.find_elements_by_xpath(".//*")
    list_children[1].click()
    time.sleep(20)
def saveACMBib(keywords_list, dl_folder, titleOnly):
    """Crawl ACM for every keyword group and store one .bib file per page.

    Each result page is exported with a fresh browser instance; the
    downloaded ``acm/acm.bib`` is then renamed to
    ``acm/acm_<name>_page<i>.bib``.

    Args:
        keywords_list: List of keyword groups (lists of strings).
        dl_folder: Download-folder key handed to ``setupCrawler``.
        titleOnly: Search mode ("tit", "titAbs" or anything else for the
            default search); despite its name it is not a boolean.
    """
    driver = setupCrawler(dl_folder)
    try:
        for keywords in keywords_list:
            print(f"Search for: {keywords}")
            ACM_URL = getURL(keywords, "ACM", titleOnly)
            driver.get(ACM_URL)
            time.sleep(3)

            name = "".join(str(word) for word in keywords)
            if titleOnly == "tit":
                name += "_TitleOnly"
            elif titleOnly == "titAbs":
                name += "_TitleAbstract"
            if DEBUG > 0:
                driver.save_screenshot(f"./acm_{name}.png")

            # Get the amount of results for the page loop.
            try:
                count_text = driver.find_element_by_class_name("result__count").text
                results = int(count_text.split(" ")[0].replace(",", ""))
            except (NoSuchElementException, ValueError):
                # No counter on the page, or its first token is not a number.
                results = 0
            pages = int(min(math.ceil(results / 50), acm_maxpage))

            # Loop through all pages and save the resulting bib files.
            for i in tqdm.tqdm(range(pages)):
                toOpen = ACM_URL + str(i)
                # A fresh browser per page, as before -- but it is now kept
                # separate from the outer driver and quit after use, so the
                # Chrome processes no longer pile up.
                page_driver = setupCrawler(dl_folder)
                try:
                    loadACMBib(toOpen, page_driver)
                finally:
                    page_driver.quit()
                try:
                    os.rename('acm/acm.bib', f'acm/acm_{name}_page{i}.bib')
                except FileNotFoundError:
                    print("Only 1 bib entry in that file.")
    finally:
        driver.quit()

In [None]:
# Run the ACM crawl for all keyword groups, searching title and abstract.
# NOTE(review): the search mode is hard-coded here; the `titlesearch`
# setting is not used by this call.
crawl(keywords_list=keywords, library="ACM", searchWhere="titAbs")

# ScienceDirect


In [None]:
def loginScienceDirect(driver):
    """Sign in to ScienceDirect through the institutional login.

    Uses the module-level ``UNI_MAIL``, ``UNI_USER`` and ``UNI_PWD``
    credentials. Missing page elements raise Selenium exceptions, which
    the caller is expected to handle.

    Args:
        driver: Selenium WebDriver to log in with.

    Returns:
        The same ``driver``.
    """
    Login_URL = "https://www.sciencedirect.com/"
    driver.get(Login_URL)
    time.sleep(5)
    if DEBUG > 1:
        driver.save_screenshot("init.png")
    driver.find_element_by_link_text("Sign in").click()
    time.sleep(10)

    # Enter the mail address and submit it.
    mail = driver.find_element_by_id("bdd-email")
    mail.send_keys(UNI_MAIL)
    time.sleep(1)
    mail.send_keys(Keys.ENTER)
    time.sleep(1)
    if DEBUG > 1:
        driver.save_screenshot("login.png")
    time.sleep(1)
    driver.find_element_by_id("bdd-elsPrimaryBtn").click()
    time.sleep(1)

    # Fill in user name and password and submit.
    driver.find_element_by_id("username").send_keys(UNI_USER)
    time.sleep(1)
    pwd = driver.find_element_by_id("password")
    pwd.send_keys(UNI_PWD)
    time.sleep(1)
    pwd.send_keys(Keys.ENTER)
    time.sleep(2)

    try:
        driver.find_element_by_id("institution-button").click()
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit. Take a screenshot for diagnosis and retry once; a
        # second failure propagates to the caller.
        print("intitution button apparently no accessable")
        driver.save_screenshot("StaleElement.png")
        driver.find_element_by_id("institution-button").click()
    time.sleep(2)
    return driver
def loadScienceDirectBib(toOpen, driver):
    """Open one ScienceDirect result page and click through its export.

    Nothing is returned.

    Args:
        toOpen: URL of the result page.
        driver: Selenium WebDriver to use.
    """
    step_screenshots = DEBUG > 1

    driver.get(toOpen)
    time.sleep(5)
    if step_screenshots:
        driver.save_screenshot("sciencedirect.png")

    # Select all results of the page.
    select_all = driver.find_element_by_id("select-all-results")
    select_all.click()
    time.sleep(1)
    if step_screenshots:
        driver.save_screenshot("sciencedirect_clickall.png")

    # Open the export menu.
    export_link = driver.find_element_by_class_name(
        "button-link.export-all-link-button.button-link-primary"
    )
    export_link.click()
    time.sleep(5)

    # Third export option -- presumably BibTeX (TODO confirm).
    export_options = driver.find_elements_by_class_name(
        "button-link.button-link-primary.export-option.u-display-block"
    )
    export_options[2].click()
    time.sleep(10)
def saveScienceDirectBib(keywords_list, dl_folder, titleOnly):
    """Crawl ScienceDirect for every keyword group and export each page.

    Logs in once, then runs one search per keyword group and triggers the
    export on every result page (50 results per page, at most
    ``sd_maxpage`` pages).

    Args:
        keywords_list: List of keyword groups (lists of strings); may be
            empty.
        dl_folder: Download-folder key handed to ``setupCrawler``.
        titleOnly: Search mode ("tit", "titAbs" or anything else for the
            default search); despite its name it is not a boolean.
    """
    driver = setupCrawler(dl_folder)
    try:
        driver.get('https://www.sciencedirect.com/')
        if DEBUG > 1:
            driver.save_screenshot("SD.png")
        try:
            driver = loginScienceDirect(driver)
        except NoSuchElementException:
            print("Already logged in or wrong credentials")
        # Fix: the screenshot used to be named after the full search URL,
        # which is not a usable file name, and building that URL required
        # keywords_list[0] to exist.
        if DEBUG > 1:
            driver.save_screenshot("SD_afterLogin.png")

        for keywords in keywords_list:
            print(f"Search for: {keywords}")
            SD_URL = getURL(keywords, "ScienceDirect", titleOnly)
            driver.get(SD_URL)
            time.sleep(3)
            try:
                count_text = driver.find_element_by_class_name("search-body-results-text").text
                results = int(count_text.split(" ")[0].replace(",", ""))
            except (NoSuchElementException, ValueError):
                # No counter on the page, or its first token is not a number.
                results = 0

            pages = int(min(math.ceil(results / 50), sd_maxpage))
            for i in tqdm.tqdm(range(pages)):
                # The URL takes a result offset, not a page number.
                toOpen = SD_URL + str(i * 50)
                loadScienceDirectBib(toOpen, driver)
    finally:
        # Do not leave the browser process running, even after an error.
        driver.quit()

In [None]:
# Run the ScienceDirect crawl for all keyword groups, searching titles only.
# NOTE(review): the search mode is hard-coded here; the `titlesearch`
# setting is not used by this call.
crawl(keywords_list=keywords, library="ScienceDirect", searchWhere="tit")