## <center> Related Scientific Work Search </center>

Bla bla bla

### Objective
1. Search for the most relevant scientific research papers based on a query 
1. Spell-check the query 

In [39]:
import time, os, selenium, requests, re 

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from abc import ABCMeta, abstractmethod

In [40]:
class DataCrawler(metaclass=ABCMeta):
    """Abstract interface shared by the academic-database crawlers.

    Holds the output directory, the site's base URL and an initially empty
    ``results`` dict. Concrete crawlers must implement ``search``,
    ``filter_by_year`` and ``save``.
    """

    def __init__(self, output_directory, base_url):
        self.results = {}
        self.base_url = base_url
        self.output_directory = output_directory

    @abstractmethod
    def search(self):
        """Run a query against ``base_url``; implemented by subclasses."""

    @abstractmethod
    def filter_by_year(self):
        """Year-filtering hook; implemented by subclasses.

        NOTE(review): presumably narrows ``results`` by publication year --
        the previous docstring described raw-data processing; confirm intent.
        """

    @abstractmethod
    def save(self):
        """Save the processed data; implemented by subclasses."""
    

In [41]:
class IEEESeach(DataCrawler):
    """IEEE Xplore searcher driven through a Chrome WebDriver session."""

    def __init__(self, output_dir):
        """Start Chrome and register ``<output_dir>/IEEE`` as the output directory.

        Args:
            output_dir: Parent directory; ``IEEE`` is joined onto it.
        """
        self.url = "https://ieeexplore.ieee.org"
        # NOTE(review): ``executable_path`` is the older Selenium constructor
        # argument; newer releases expect a ``Service`` object -- confirm
        # against the installed Selenium version before changing it.
        self.browser = webdriver.Chrome(executable_path='./chromedriver')
        super().__init__(os.path.join(output_dir, 'IEEE'), self.url)

    @staticmethod
    def _parse_year(lines):
        """Return the year read from the fourth result line, or None.

        The line is expected to look like ``"<label>: <year> | ..."``; any
        other layout yields None instead of raising.
        """
        try:
            return int(lines[3].split("|")[0].split(":")[1].strip())
        except (IndexError, ValueError):
            return None

    def search(self, query="A Lightweight Autoencoder"):
        """Find documents related to ``query`` and store them in ``self.results``.

        Each entry is keyed by document title and holds ``"year"`` (int, or
        None when it cannot be parsed), ``"link"`` and ``"Abstract"``.

        The browser is quit when this method finishes, whether it succeeds
        or raises, so one instance supports a single search.

        Args:
            query: Text typed into the site's search box.
        """
        try:
            self.browser.get(self.url)
            # Element lookups below poll for up to 10 s before failing.
            self.browser.implicitly_wait(10)
            search_box = self.browser.find_element(By.CLASS_NAME, "Typeahead-input")
            search_box.send_keys(query)

            # Submit the search by pressing ENTER.
            ActionChains(self.browser).send_keys(Keys.ENTER).perform()

            # Collect title, year and link for every listed result.
            for item in self.browser.find_elements(By.CLASS_NAME, "List-results-items"):
                lines = item.text.split("\n")
                title = lines[0].strip()
                link = item.find_element(By.TAG_NAME, "a").get_attribute("href")
                self.results[title] = {"year": self._parse_year(lines), "link": link}

            # Visit each document page to retrieve its abstract.
            for doc_info in self.results.values():
                self.browser.get(doc_info["link"])
                time.sleep(0.05)
                abstract_text = self.browser.find_element(By.CLASS_NAME, "abstract-text").text
                parts = abstract_text.split("\n")
                # The second line is used as the abstract body; fall back to
                # the whole text when the block has only one line.
                doc_info["Abstract"] = parts[1] if len(parts) > 1 else abstract_text
        finally:
            # Always release the browser, even when a lookup fails midway.
            self.browser.quit()

    def filter_by_year(self):
        """Placeholder: only prints a "coming soon" notice."""
        print("Coming soon!!")

    def save(self):
        """Placeholder: only prints a "saving" notice; nothing is written."""
        print("Saving....")
    

In [89]:
# Smoke run: IEEE crawler (output directory "./IEEE") searching with the
# default query. Constructing the crawler launches a Chrome session.
test = IEEESeach("./")
test.search()

In [42]:
class arXiv(DataCrawler):
    """arXiv.org searcher driven through a Chrome WebDriver session."""

    def __init__(self, output_dir):
        """Start Chrome and register ``<output_dir>/arXiv`` as the output directory.

        Args:
            output_dir: Parent directory; ``arXiv`` is joined onto it.
        """
        self.url = "https://arxiv.org"
        # NOTE(review): ``executable_path`` is the older Selenium constructor
        # argument; newer releases expect a ``Service`` object -- confirm
        # against the installed Selenium version before changing it.
        self.browser = webdriver.Chrome(executable_path='./chromedriver')
        super().__init__(os.path.join(output_dir, 'arXiv'), self.url)

    def search(self, query="A Lightweight Autoencoder"):
        """Find documents related to ``query`` and store them in ``self.results``.

        ``self.results`` is replaced with a dict keyed by title; each value
        holds ``"year"``, ``"abstract"``, ``"pdf_link"`` and ``"title"``.
        ``"year"`` carries the full text of the result's date paragraph,
        not a parsed year.

        The browser is quit when this method finishes, whether it succeeds
        or raises, so one instance supports a single search.

        Args:
            query: Text typed into the site's search box.
        """
        try:
            self.browser.get(self.url)
            # Element lookups below poll for up to 10 s before failing.
            self.browser.implicitly_wait(10)
            self.browser.find_element(By.NAME, "query").send_keys(query)

            # Submit the search by pressing ENTER.
            ActionChains(self.browser).send_keys(Keys.ENTER).perform()

            # Expand every result so the full abstract text is shown.
            for toggle in self.browser.find_elements(By.PARTIAL_LINK_TEXT, '▽ More'):
                toggle.click()

            # Read title, abstract, PDF link and date text from each result.
            self.results = {}
            for entry in self.browser.find_elements(By.CLASS_NAME, "arxiv-result"):
                title = entry.find_element(By.CLASS_NAME, "title").text
                abstract = entry.find_element(By.CLASS_NAME, "abstract").text
                pdf_link = entry.find_element(By.PARTIAL_LINK_TEXT, "pdf").get_attribute("href")
                dates = entry.find_element(By.CSS_SELECTOR, "p.is-size-7").text
                self.results[title] = {
                    "year": dates,
                    "abstract": abstract,
                    "pdf_link": pdf_link,
                    "title": title,
                }
        finally:
            # Always release the browser, even when a lookup fails midway.
            self.browser.quit()

    def filter_by_year(self):
        """Placeholder: only prints a "coming soon" notice."""
        print("Coming soon!!")

    def save(self):
        """Placeholder: only prints a "saving" notice; nothing is written."""
        print("Saving....")

In [91]:
# Smoke run: arXiv crawler (output directory "./arXiv") searching for
# "Autoencoder". Constructing the crawler launches a Chrome session.
test2 = arXiv("./")
test2.search("Autoencoder")

In [43]:
class ScienceDirect(DataCrawler):
    """sciencedirect.com searcher driven through a Chrome WebDriver session."""

    # Matches "<Month name><space or hyphen><2-4 digits>", e.g. "March 2020".
    _DATE_PATTERN = re.compile(
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)[\s-]\d{2,4}'
    )

    def __init__(self, output_dir):
        """Start Chrome and register ``<output_dir>/sciencedirect`` as the output directory.

        Args:
            output_dir: Parent directory; ``sciencedirect`` is joined onto it.
        """
        self.url = "https://www.sciencedirect.com"
        # NOTE(review): ``executable_path`` is the older Selenium constructor
        # argument; newer releases expect a ``Service`` object -- confirm
        # against the installed Selenium version before changing it.
        self.browser = webdriver.Chrome(executable_path='./chromedriver')
        super().__init__(os.path.join(output_dir, 'sciencedirect'), self.url)

    def search(self, query="A Lightweight Autoencoder"):
        """Find documents related to ``query`` and store them in ``self.results``.

        ``self.results`` is replaced with a dict keyed by title; each value
        holds ``"year"``, ``"abstract"``, ``"pdf_link"`` and ``"title"``.
        ``"year"`` is the list of "Month YYYY" style matches found in the
        result's sub-type line (possibly empty), not a parsed year.

        The browser is quit when this method finishes, whether it succeeds
        or raises, so one instance supports a single search.

        Args:
            query: Text typed into the site's search box.
        """
        try:
            self.browser.get(self.url)
            # Element lookups below poll for up to 10 s before failing,
            # which also covers waiting for the result list to appear.
            self.browser.implicitly_wait(10)
            self.browser.find_element(By.NAME, "qs").send_keys(query)

            # Submit the search by pressing ENTER.
            ActionChains(self.browser).send_keys(Keys.ENTER).perform()

            self.results = {}
            for item in self.browser.find_elements(By.CLASS_NAME, "ResultItem"):
                # Open the abstract preview and wait for its container to
                # appear inside this result item.
                item.find_element(By.CSS_SELECTOR, "[aria-label=Abstract]").click()
                preview = WebDriverWait(item, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'preview-body-container'))
                )

                abstract = preview.find_element(By.TAG_NAME, "p").text
                title = item.find_element(By.CLASS_NAME, "result-list-title-link").text
                pdf_link = item.find_element(By.PARTIAL_LINK_TEXT, "Download PDF").get_attribute("href")

                # Join the sub-type spans and pull out the month/year parts.
                spans = item.find_element(By.CLASS_NAME, "SubType").find_elements(By.TAG_NAME, 'span')
                date = self._DATE_PATTERN.findall(" ".join(span.text for span in spans))

                self.results[title] = {
                    "year": date,
                    "abstract": abstract,
                    "pdf_link": pdf_link,
                    "title": title,
                }
        finally:
            # Always release the browser, even when a lookup fails midway.
            self.browser.quit()

    def filter_by_year(self):
        """Placeholder: only prints a "coming soon" notice."""
        print("Coming soon!!")

    def save(self):
        """Placeholder: only prints a "saving" notice; nothing is written."""
        print("Saving....")

In [44]:
# Smoke run: ScienceDirect crawler (output directory "./sciencedirect")
# searching for "sugar". Constructing the crawler launches a Chrome session.
test3 = ScienceDirect("./")
test3.search("sugar")

In [34]:
# Science Direct: open the site and submit a search (scratch cell).
# The `browser` session is left open for the following cell to read results.

browser = webdriver.Chrome(executable_path='./chromedriver')

browser.get("https://www.sciencedirect.com")
browser.implicitly_wait(60)  # element lookups poll for up to 60 s
input_element = browser.find_element(By.NAME, "qs")  # find the query input field
# Alternative query used during testing: "A lightweight autoencoder"
input_element.send_keys("sugar")  # pass the query to the input field

action = ActionChains(browser).send_keys(Keys.ENTER)
action.perform()  # press ENTER to submit the search

In [46]:
# Print title, date, authors, PDF link and abstract for each ScienceDirect
# result on the page opened by the previous cell, then close the browser.
from selenium.common.exceptions import WebDriverException

res_items = browser.find_elements(By.CLASS_NAME, "ResultItem")
res_dict = {}

# Matches "<Month name><space or hyphen><2-4 digits>", e.g. "March 2020".
date_pattern = re.compile(
    r'(?:January|February|March|April|May|June|July|August|September|October|November|December)[\s-]\d{2,4}'
)

try:
    for i in res_items:
        try:
            # Open the abstract preview, then read every field of this item.
            i.find_element(By.CSS_SELECTOR, "[aria-label=Abstract]").click()
            title = i.find_element(By.CLASS_NAME, "result-list-title-link").text
            text = i.find_element(By.CLASS_NAME, "preview-body-container").find_element(By.TAG_NAME, "p").text
            pdf_link = i.find_element(By.PARTIAL_LINK_TEXT, "Download PDF").get_attribute("href")
            dates = " ".join(j.text for j in i.find_element(By.CLASS_NAME, "SubType").find_elements(By.TAG_NAME, 'span'))
            date = date_pattern.findall(dates)
            authors = i.find_element(By.CLASS_NAME, "Authors").text
        except WebDriverException:
            # Skip items with a missing or unclickable field, so that values
            # left over from an earlier item are never printed.
            print("expect")
            continue

        print(title)
        print(date)
        print(authors)
        print(pdf_link)
        print(text)
        print()
finally:
    # Close the browser even if an unexpected error interrupts the loop.
    browser.quit()

In [None]:
#Researchgate - maybe

In [55]:
# MDPI: open the site and submit a search (scratch cell).
# The `browser` session is left open for the following cells.

browser = webdriver.Chrome(executable_path='./chromedriver')

browser.get("https://www.mdpi.com")
browser.implicitly_wait(60)  # element lookups poll for up to 60 s
input_element = browser.find_element(By.NAME, "q")  # find the query input field
# Alternative query used during testing: "A lightweight autoencoder"
input_element.send_keys("sugar")  # pass the query to the input field

action = ActionChains(browser).send_keys(Keys.ENTER)
action.perform()  # press ENTER to submit the search

In [63]:
# Collect every element with class "article-item" from the current page.
items = browser.find_elements(By.CLASS_NAME, "article-item")

In [73]:
# Title text of the first article item.
print(items[0].find_element(By.CLASS_NAME, "title-link").text)
# NOTE(review): this prints the WebElement object of the *second* item, not
# its text -- presumably an existence check; confirm whether the abstract
# text of items[0] was intended.
print(items[1].find_element(By.CLASS_NAME, "abstract-full"))

Effects of Non-Leguminous Cover Crops on Yield and Quality of Baby Corn (Zea mays L.) Grown under Subtropical Conditions
<selenium.webdriver.remote.webelement.WebElement (session="c57b64efa80c3b25b1e46746686f0de4", element="bf5669c9-379c-4eb8-a81c-dcc595f98d37")>


In [88]:
# Locate the "Read more." link inside the eleventh article item
# (raises IndexError when fewer than 11 items were found).
a = items[10].find_element(By.PARTIAL_LINK_TEXT, "Read more.")