# Requirements:
* https://selenium-python.readthedocs.io/locating-elements.html
* https://sites.google.com/a/chromium.org/chromedriver/downloads

In [12]:
from urllib.request import urlopen, Request
# from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

from html.parser import HTMLParser
import json
import requests
import sqlite3
import tqdm
import math


from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException

from pybtex.database.input import bibtex
from pybtex.database import parse_string
import pybtex.errors
pybtex.errors.set_strict_mode(False)

import glob

import itertools
from itertools import permutations 

import os
import time
import config
import BibFilesMerge as bfm

In [4]:
# Needed for IEEE
UNI_MAIL = config.UNI_MAIL
UNI_PWD = config.UNI_PWD
UNI_USER = config.UNI_USER
# Needed for ScienceDirect
ScienceDirect_MAIL = config.ScienceDirect_MAIL
ScienceDirect_PWD = config.ScienceDirect_PWD

IEEE_bib_files = []

# 
ieee_maxpage = math.inf
acm_maxpage = 39
sd_maxpage = 19

GLOBAL_ERROR_LIST = []
urls = []

# Settings for crawling

In [6]:
fullsearch = False # Search keywords everywhere
titlesearch = True # Search keywords in title
abstractsearch = True # Search keywords in abstract
year_min = 1900 # Set to earliest year which should be crawled
year_max = 2022 # Set to latest year whichh should be crawled

LIBS = ["ScienceDirect", "ACM", "IEEE"]

# Search Keyword combinations
pres = [
    'pointing','selection',  'mid-air pointing', 'target-selection', 'target selection', 'gesture'
        ]
sufs = ['virtual environment','virtual reality', 'vr', 'large screen', 'large display', 'augmented reality'
        ]
keywords = list(itertools.product(pres, sufs))
# Keywords should be a list of lists of strings. 
# The strings will be connected with AND for the search query.
keywords = [list(item) for item in keywords]



# Setup for crawler
## function to crawl: crawl(keywords, LIBRARY, titlesearch)

In [7]:
# Change paths for dl-folders (dl) to folders for each library [line 9,11,13]
def setupCrawler(dl_folder):
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    dl = "C:\\Users\\Jan\\Documents\\GitHub\\MA-Thesis\\PyLitReview"
    if dl_folder == "acm":
        dl = config.downloadfolder_acm
    elif dl_folder == "ieee":
        dl = config.downloadfolder_ieee
    elif dl_folder == "sd":
        dl = config.downloadfolder_sd
    p = {"download.default_directory": dl}
    options.add_experimental_option("prefs", p)
    driver = webdriver.Chrome(executable_path = "./chromedriver.exe",   chrome_options=options)
    print("Driver setup complete.")
    return driver
# Method to crawl one single library: Keywords: [], library: String, title: bool, dl_folder: string
def crawl(keywords_list, library, titleOnly):
    print(f"Start crawling {library}")
    if library == "ACM":
        keywords = [[item.replace(" ", "+") for item in keywords] for keywords in keywords_list]
        saveACMBib(keywords, "acm", titleOnly)
    elif library == "IEEE":
        keywords = [[item.replace(" ", "%20") for item in keywords] for keywords in keywords_list]
        saveIEEEBib(keywords, "ieee", titleOnly)
    elif library == "ScienceDirect":
        keywords = [[item.replace(" ", "%20") for item in keywords] for keywords in keywords_list]
        saveScienceDirectBib(keywords, "sd", titleOnly)
    else:
        print(f"Library {library} not yet supported")
def getURL(keywords, library, titleOnly, concatentation="AND"):
    URL = ""
    search = ""
    if library == "ACM":
        titleSearch = "doSearch?AllField="
        for i, keyword in enumerate(keywords):
            search += f"%22{keyword}%22"
            if (i < len(keywords)-1):
                search += f"+{concatentation}+"
        if titleOnly == True:
            titleSearch = "doSearch?fillQuickSearch=false&expand=dl&field1=Title&text1="
        URL = f"https://dl.acm.org/action/{titleSearch}{search}&pageSize=50&AfterYear={year_min}&BeforeYear={year_max}&startPage="
        return URL
    elif library == "IEEE":
        titleSearch = "doSearch?AllField="
        for i, keyword in enumerate(keywords):
            if titleOnly == True:
                key = f'("Document%20Title":"{keyword}")'
            else:
                key = f"%22{keyword}%22"
            search += key
            if (i < len(keywords)-1):
                search += f"+{concatentation}+" 
        URL = f"https://ieeexplore.ieee.org/search/searchresult.jsp?&queryText={search}&highlight=true&returnFacets=ALL&returnType=SEARCH&matchPubs=true&ranges={year_min}_{year_max}_Year&rowsPerPage=50&pageNumber="
        return URL
    elif library == "ScienceDirect":
        titleSearch = "&tak="
        if titleOnly == True:
            titleSearch = "?title="
        for i, keyword in enumerate(keywords):
            search += f"%22{keyword}%22"
            if (i < len(keywords)-1):
                search += f"%20{concatentation}%20" 
        URL = f"https://www.sciencedirect.com/search?date={year_min}-{year_max}{titleSearch}{search}&show=50&offset="
        print(URL)
        return URL
    else:
        print(f"Library {library} not yet supported")
    return URL

# IEEE 

In [8]:
def loadIEEEBib (toOpen, driver):
    driver.get(toOpen)
    time.sleep(25)
    driver.find_element_by_class_name("main-section").find_elements_by_xpath(".//*")[5].click() #Click SELECT ALL
    time.sleep(5)
    # for en, child in enumerate(driver.find_element_by_class_name("ng-Dashboard").find_elements_by_xpath(".//*")[:21]):
    #     print("ng-Dashboard", en, child.text)
    export = driver.find_element_by_class_name("ng-Dashboard").find_elements_by_xpath(".//*")[10] # Find EXPORT
    if "Export" not in export.text:
        export = driver.find_element_by_class_name("ng-Dashboard").find_elements_by_xpath(".//*")[20] # Find EXPORT
    export.click()
    time.sleep(5)
    driver.find_element_by_class_name("tooltip-inner").find_elements_by_xpath(".//*")[4].click() # CLick Citations
    time.sleep(5)
    # for en, child in enumerate(driver.find_element_by_class_name("tooltip-inner").find_elements_by_xpath(".//*")):
    #     print("tooltip-inner", en, child.text)
    driver.find_element_by_class_name("tooltip-inner").find_elements_by_xpath(".//*")[25].click() # Click BibTex
    time.sleep(5)
    driver.find_element_by_class_name("tooltip-inner").find_elements_by_xpath(".//*")[40].click()# Click Citations & Abstract 
    time.sleep(5)
    elem = driver.find_element_by_class_name("tooltip-inner").find_elements_by_xpath(".//*")[44] #Click Export
    print("Text:",elem.text)
    driver.save_screenshot("ieee_dl.png")
    # window_after = driver.window_handles[1]
    # driver.switch_to.window(window_after)
    # driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.TAB)
    

    curWindowHndl = driver.current_window_handle
    assert len(driver.window_handles) == 1
    elem.send_keys(Keys.CONTROL + Keys.ENTER) #open link in new tab keyboard shortcut
    time.sleep(10)
    # Wait for the new window or tab
    # wait.until(EC.number_of_windows_to_be(2))
    for window_handle in driver.window_handles:
        if window_handle != curWindowHndl:
            driver.switch_to.window(window_handle)
            break
    
    time.sleep(10) #wait until new tab finishes loading
    # driver.switch_to.window(driver.window_handles[1])
    # driver.switch_to_window(driver.window_handles[1]) #assuming new tab is at index 1
    
    bib = driver.find_element_by_xpath("/html/body").text
    driver.close() #closes new tab
    driver.switch_to.window(curWindowHndl)
    time.sleep(20)
    return bib

def saveIEEEBib(keywords_list, dl_folder, titleOnly):
    driver = setupCrawler(dl_folder)
    bib_datas = []
    glob_bib = ""
    for keywords in keywords_list:
        print(f"Search for: {keywords}")
        IEEE_URL = getURL(keywords, "IEEE", titleOnly)
        driver.get(IEEE_URL)#put here the adress of your page
        
        print(IEEE_URL)
        time.sleep(7)
        name = ""
        for word in keywords:
            name += f"{word}"
        if titleOnly:
            name += "_Title"
        driver.save_screenshot(f"IEEE_{name}.png")
        try:
            # for en, child in enumerate(driver.find_element_by_class_name("ng-Dashboard").find_elements_by_xpath(".//*")):
            #     print(en, child.text)
            results = driver.find_element_by_class_name("ng-Dashboard").find_elements_by_xpath(".//*")[31]
            if results.text == "Set Search Alerts":
                results = driver.find_element_by_class_name("ng-Dashboard").find_elements_by_xpath(".//*")[41]
            results = results.text.split(" ")[0].split("-")[-1]
            if "," in results:
                results = results.replace(",", "")
        except NoSuchElementException:
            results = 0
        except IndexError:
            results = 0
        if results == "":
            results = 0
        
        results = int(results)
        r = int(np.min([math.ceil(results / 50), ieee_maxpage]))
        # Loop through all pages and save resulting bib files

        for i in tqdm.tqdm(range(r)):
            toOpen = IEEE_URL + str(i+1)
            bib = loadIEEEBib(toOpen, driver)
            try: 
                print("trying to save bib file")
                bib_data = parse_string(bib, "bibtex")
                bib_data.to_file(f"./PostReview/ieee/ieee_{name}_page{i}.bib")
            except:
                print(f"Something went wrong with: {name}_{i}") 
                IEEE_bib_files.append(bib)  
                urls.append(toOpen)
            time.sleep(3)

# ACM

In [9]:
def loadACMBib (toOpen, driver):
    driver.get(toOpen)#put here the adress of your page
    delay = 3 # seconds
    driver.find_element_by_class_name("item-results__checkbox").click()
    time.sleep(5)
    driver.find_element_by_class_name("item-results__buttons.visible").find_elements_by_xpath(".//*")[0].click()
    time.sleep(20)
    driver.find_element_by_class_name("rlist--inline.separator").find_elements_by_xpath(".//*")[1].click()
    time.sleep(20)
def saveACMBib(keywords_list, dl_folder, titleOnly):
    driver = setupCrawler(dl_folder)
    for keywords in keywords_list:
        print(f"Search for: {keywords}")
        ACM_URL = getURL(keywords, "ACM", titleOnly)
        driver.get(ACM_URL)#put here the adress of your page
        time.sleep(3)
        name = ""
        for word in keywords:
            name += f"{word}"
        if titleOnly:
            name += "_TitleOnly"
        driver.save_screenshot(f"./acmpost_{name}.jpg")
        # get amount of results for for-loop
        try:
            results = driver.find_element_by_class_name("result__count")
            results = results.text.split(" ")[0]
            if "," in results:
                results = results.replace(",", "")
            results = int(results)
        except NoSuchElementException:
            results = 0
        r = np.min([math.ceil(results / 50), acm_maxpage])
        # Loop through all pages and save resulting bib files
        for i in tqdm.tqdm(range(r)):
            toOpen = ACM_URL + str(i)
            driver = setupCrawler(dl_folder)
            loadACMBib(toOpen, driver)
            try:
                os.rename('PostReview/acm/acm.bib', f'PostReview/acm/acm_{name}_page{i}.bib')
            except FileNotFoundError:
                print("Only 1 bib entry in that file.")


# ScienceDirect

In [10]:
def loginScienceDirect(driver):
    # Login_URL = "https://www.sciencedirect.com/"
    # driver.get(Login_URL)
    mail = driver.find_element_by_id("bdd-email")
    mail.send_keys(UNI_MAIL)
    time.sleep(1)
    mail.send_keys(Keys.ENTER)
    time.sleep(1)
    driver.save_screenshot("login.jpg")
    time.sleep(1)
    driver.find_element_by_id("bdd-elsPrimaryBtn").click()
    time.sleep(1)
    driver.find_element_by_id("username").send_keys(UNI_USER)
    time.sleep(1)
    pwd = driver.find_element_by_id("password")
    pwd.send_keys(UNI_PWD)
    time.sleep(1)
    pwd.send_keys(Keys.ENTER)
    time.sleep(2)
    attemps = 0
    try:
        driver.find_element_by_id("institution-button").click()
    except StaleElementReferenceException:
        print("intitution button apparently no accessable")
        driver.save_screenshot("StaleElement.png")
        driver.find_element_by_id("institution-button").click()
    time.sleep(2)
    # mail = driver.find_element_by_id("bdd-email")
    # mail.send_keys(ScienceDirect_MAIL)
    # mail.send_keys(Keys.ENTER)
    # pwd = driver.find_element_by_id("bdd-password")
    # pwd.send_keys(ScienceDirect_PWD)
    # pwd.send_keys(Keys.ENTER)
    # time.sleep(1)
    # driver.find_element_by_id("institution-button").click()
    # pwd = driver.find_element_by_id("bdd-password")
    # pwd.send_keys(ScienceDirect_PWD)
    # pwd.send_keys(Keys.ENTER)
def loadScienceDirectBib(toOpen, driver):
    driver.get(toOpen)
    time.sleep(5)
    try:
        loginScienceDirect(driver)
    except NoSuchElementException:
        print("Already logged in or wrong credentials") 
    driver.save_screenshot("sciencedirect.png")
    driver.find_element_by_class_name("checkbox-check.partial.checkbox-small").click()
    time.sleep(1)
    driver.find_element_by_class_name("button-link.export-all-link-button.button-link-primary").click()
    time.sleep(5)
    driver.find_elements_by_class_name("button-link.button-link-primary.export-option.u-display-block")[2].click()
    time.sleep(10)
def saveScienceDirectBib(keywords_list, dl_folder, titleOnly):
    driver = setupCrawler(dl_folder)
    SD_URL = getURL(keywords_list[0], "ScienceDirect", titleOnly)
    driver.get('https://www.sciencedirect.com/')
    driver.save_screenshot("SD.png")
    try:
        loginScienceDirect(driver)
    except NoSuchElementException:
        print("Already logged in or wrong credentials") 
    # loginScienceDirect(driver)
    driver.save_screenshot(f"{SD_URL}.png")
    for keywords in keywords_list:
        print(f"Search for: {keywords}")
        SD_URL = getURL(keywords, "ScienceDirect", titleOnly)
        driver.get(SD_URL)#put here the adress of your page
        time.sleep(3)
        try:
            results = driver.find_element_by_class_name("search-body-results-text")
            results = results.text.split(" ")[0]
            if "," in results:
                results = results.replace(",", "")
            results = int(results)
        except NoSuchElementException:
            results = 0
        
        r = np.min([math.ceil(results / 50), sd_maxpage])
        for i in tqdm.tqdm(range(r)):
            driver = setupCrawler(dl_folder)
            toOpen = SD_URL + str(i*50)
            loadScienceDirectBib(toOpen, driver)

In [13]:
crawl(keywords, "ScienceDirect", True)

  driver = webdriver.Chrome(executable_path = "./chromedriver.exe",   chrome_options=options)


https://www.sciencedirect.com/search?date=2021?title=%22pointing%22%20AND%20%22virtual%20environment%22&show=50&offset=
Already logged in or wrong credentials
Search for: ['pointing', 'virtual%20environment']
https://www.sciencedirect.com/search?date=2021?title=%22pointing%22%20AND%20%22virtual%20environment%22&show=50&offset=


0it [00:00, ?it/s]


Search for: ['pointing', 'virtual%20reality']
https://www.sciencedirect.com/search?date=2021?title=%22pointing%22%20AND%20%22virtual%20reality%22&show=50&offset=


0it [00:00, ?it/s]


Search for: ['pointing', 'vr']
https://www.sciencedirect.com/search?date=2021?title=%22pointing%22%20AND%20%22vr%22&show=50&offset=


0it [00:00, ?it/s]


Search for: ['pointing', 'large%20screen']
https://www.sciencedirect.com/search?date=2021?title=%22pointing%22%20AND%20%22large%20screen%22&show=50&offset=


0it [00:00, ?it/s]


Search for: ['pointing', 'large%20display']
https://www.sciencedirect.com/search?date=2021?title=%22pointing%22%20AND%20%22large%20display%22&show=50&offset=


0it [00:00, ?it/s]


Search for: ['pointing', 'augmented%20reality']
https://www.sciencedirect.com/search?date=2021?title=%22pointing%22%20AND%20%22augmented%20reality%22&show=50&offset=


0it [00:00, ?it/s]


Search for: ['selection', 'virtual%20environment']
https://www.sciencedirect.com/search?date=2021?title=%22selection%22%20AND%20%22virtual%20environment%22&show=50&offset=


0it [00:00, ?it/s]


Search for: ['selection', 'virtual%20reality']
https://www.sciencedirect.com/search?date=2021?title=%22selection%22%20AND%20%22virtual%20reality%22&show=50&offset=


0it [00:00, ?it/s]


Search for: ['selection', 'vr']
https://www.sciencedirect.com/search?date=2021?title=%22selection%22%20AND%20%22vr%22&show=50&offset=


KeyboardInterrupt: 