#  Challenge
Do a little scraping or API-calling of your own. Pick a new website and see what you can get out of it. Expect that you'll run into bugs and blind alleys, and rely on your mentor to help you get through.

Formally, your goal is to write a scraper that will:

1) Return specific pieces of information (rather than just downloading a whole page)
2) Iterate over multiple pages/queries
3) Save the data to your computer

Once you have your data, compute some statistical summaries and/or visualizations that give you some new insights into your scraping topic of interest. Write up a report from scraping code to summary and share it with your mentor.

## Aim

Scrape all current abstracts from Nature Microbiology, including authors, affiliations and date.

In [None]:
# Resources
#https://stackoverflow.com/questions/23585992/making-post-request-using-scrapy
#https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess
import json




# Get the PubMed IDs of all Nature Microbiology papers
class PubmedIDSpider(scrapy.Spider):
    """Collect the PubMed IDs of Nature Microbiology papers via NCBI ESearch.

    Query parameters used in the start URL:

    * ``retmode=json`` -- return JSON instead of XML.
    * ``retmax=1000`` -- return 1000 results instead of 20.

    Only this one page is requested. If there are more than 1000 results,
    the next page is reached by adding ``&retstart=1000`` to the URL.

    Related endpoints:

    * ``eutils/esummary.fcgi?`` gives the article information without the
      abstract, e.g.
      http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id=25081398,24792655
    * ``eutils/efetch.fcgi?`` gives the abstract itself, e.g.
      http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=text&rettype=abstract&id=25081398
    """

    name = "PubmedIDSpider"
    start_urls = [
        "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&retmode=json&retmax=1000&term=Nat+Microbiol%5BJOURNAL%5D",
    ]

    def parse(self, response):
        """Yield a single item whose ``'IDs'`` value is the returned ID list."""
        payload = json.loads(response.body)
        pubmed_ids = payload["esearchresult"]["idlist"]

        print('Number of IDs', len(pubmed_ids))
        yield {'IDs': pubmed_ids}
            
             
# Crawler configuration for the ID spider: scraped items are exported as CSV to PubmedIDs.csv.
process = CrawlerProcess({
    'FEED_FORMAT': 'csv',
    'FEED_URI': 'PubmedIDs.csv',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'NadineRuecker(nadineruecker@gmail.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False,
    # The page-count limit is 10 (not 100).
    # NOTE(review): per Scrapy's CloseSpider extension this should close the spider after
    # 10 crawled responses -- confirm against the Scrapy settings documentation.
    'CLOSESPIDER_PAGECOUNT' : 10
})

In [None]:
# Starting the crawler with our spider.
# crawl() registers the spider on the process and start() runs it; the
# confirmation below is printed once start() has returned.
# NOTE(review): start() presumably runs the Twisted reactor, which cannot be
# restarted in the same interpreter -- re-running this cell likely needs a
# kernel restart; confirm.
process.crawl(PubmedIDSpider)
process.start()
print('Sucess')

In [3]:
import pandas as pd
# Read the exported CSV, take the cell in the first row / first column and
# split it on commas to get the list of PubMed ID strings.
# NOTE(review): presumably the CSV export wrote the whole ID list into that
# single cell -- verify against PubmedIDs.csv.
IDs= pd.read_csv('PubmedIDs.csv').iloc[0,0].split(",")  # the export comes in an odd format, so the ID list has to be extracted from the first cell

In [5]:
# ESummary endpoint; appending a PubMed ID gives the request URL for that article.
start_url = (
    'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    '?db=pubmed&retmode=json&rettype=abstract&id='
)
# One URL per collected PubMed ID.
start_urls = [start_url + ID for ID in IDs]
# Preview the first three URLs as a sanity check.
print(start_urls[:3])

['http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id=30787480', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id=30787479', 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id=30787478']


In [6]:
# Now use those IDs to call another Spider and read the abstract information
import scrapy
from scrapy.crawler import CrawlerProcess
import json

# Get the summary information (publication date, authors, title) for each PubMed ID in IDs
class PubmedAbstractSpider(scrapy.Spider):
    """Fetch the ESummary record of every PubMed ID in the module-level ``IDs``.

    One request is issued per ID. ``retmode=json`` makes ESummary answer in
    JSON instead of XML. ESummary returns the article information without
    the abstract text; the abstract itself is served by
    ``eutils/efetch.fcgi?``, e.g.
    http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=text&rettype=abstract&id=25081398

    Several IDs can also be requested at once by comma-separating them, e.g.
    http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id=25081398,24792655
    ``parse`` handles both the single-ID and the multi-ID response.
    """

    name = "PubmedAbstractSpider"
    # ESummary endpoint; a PubMed ID is appended to build each request URL.
    start_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&retmode=json&rettype=abstract&id='

    def __init__(self, *args, **kwargs):
        """Build ``start_urls`` from the module-level ``IDs`` list."""
        super().__init__(*args, **kwargs)
        # Built here instead of in a class-level comprehension: a comprehension
        # inside a class body cannot see class attributes such as ``start_url``
        # and would silently depend on a global variable of the same name.
        self.start_urls = [self.start_url + pubmed_id for pubmed_id in IDs]

    def parse(self, response):
        """Yield one item per article contained in an ESummary JSON response.

        The IDs are taken from the response itself (``result['uids']``), so
        the method does not rely on any variable outside the response.
        Fields missing from a record are emitted as ``None``.
        """
        result = json.loads(response.body).get('result', {})

        for uid in result.get('uids', []):
            record = result.get(uid, {})
            yield {
                'IDs': uid,
                'Pubdate': record.get('pubdate'),
                'Authors': record.get('authors'),
                'LastAuthor': record.get('lastauthor'),
                # ESummary uses lowercase keys; 'Title' does not exist.
                'Title': record.get('title'),
            }
            
             
# Crawler configuration for the abstract spider: scraped items are exported as JSON to ArticleData.json.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'ArticleData.json',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'NadineRuecker(nadineruecker@gmail.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    # NOTE(review): with logging disabled, exceptions raised inside the spider are presumably
    # not shown -- consider enabling logging while debugging.
    'LOG_ENABLED': False,
    # The page-count limit is 10 (not 100).
    # NOTE(review): the spider issues one request per PubMed ID, so this limit presumably stops
    # the crawl after about 10 articles -- raise or remove it to collect every article; confirm.
    'CLOSESPIDER_PAGECOUNT' : 10
})

In [7]:
# Starting the crawler with our spider.
# crawl() registers the spider on the process and start() runs it; the
# confirmation below is printed once start() has returned, whether or not
# any item was scraped.
# NOTE(review): start() presumably runs the Twisted reactor, which cannot be
# restarted in the same interpreter -- if another crawl already ran in this
# kernel, a restart is likely needed first; confirm.
process.crawl(PubmedAbstractSpider)
process.start()
print('Sucess')

Sucess


In [None]:
class PubmedSpider(scrapy.Spider):
    """Scrape titles and links from the PubMed HTML result page.

    ``parse`` extracts every ``<div class="rslt">`` entry of the result page
    and then POSTs the pager form once to request the next page, as long as
    ``cur_page`` is below ``max_page``. The response to that POST is only
    dumped to disk by ``parse2``.

    Only names reachable through ``import scrapy`` are used, and items are
    plain dicts, so the class does not depend on a separately defined item
    class or on ``lxml`` being imported.
    """

    name = "pubmed"
    cur_page = 1
    max_page = 3
    start_urls = [
            "https://www.ncbi.nlm.nih.gov/pubmed/?term=Nat+Microbiol%5BJOURNAL%5D"
    ]

    def parse(self, response):
        """Yield one ``{'title', 'link'}`` item per result, then the pager POST."""
        pubmed_results = response.xpath('//div[@class="rslt"]')
        self.cur_page = self.cur_page + 1
        print('cur_page ', '*' * 30, self.cur_page)

        # NOTE(review): several keys/values below are already percent-encoded
        # ('%5B', '%24', '+'); FormRequest presumably url-encodes form data
        # again -- confirm what the server expects. The 'email_subj' values
        # still refer to a different search ('cancer drug toxic').
        form_data = {'term':'Nat+Microbiol%5BJOURNAL%5D',
                    'EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page':'results',
                    'email_subj':'cancer+drug+toxic+-+PubMed',
                    'EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.CurrPage':str(self.cur_page),
                    'email_subj2':'cancer+drug+toxic+-+PubMed',
                    'EntrezSystem2.PEntrez.DbConnector.LastQueryKey':'2',
                    'EntrezSystem2.PEntrez.DbConnector.Cmd':'PageChanged',
                    'p%24a':'EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page',
                    'p%24l':'EntrezSystem2',
                    'p%24':'pubmed',
                    }

        for pubmed_result in pubmed_results:
            yield {
                # string() returns the text content of the first matching <a>.
                'title': pubmed_result.xpath('string(.//a)').extract_first(),
                'link': pubmed_result.xpath('.//p[@class="title"]/a/@href').extract_first(),
            }

        # Request the next page once per result page, not once per result.
        if pubmed_results and self.cur_page < self.max_page:
            yield scrapy.FormRequest(self.start_urls[0], formdata=form_data,
                                     callback=self.parse2, method="POST")

    def parse2(self, response):
        """Dump the raw body of the pager response to ``response_html``."""
        # response.body is bytes, so the file must be opened in binary mode.
        with open('response_html', 'wb') as f:
            f.write(response.body)

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess


class NCBISpider(scrapy.Spider):
    """Collect PubMed IDs for a search term from the NCBI ESearch XML API.

    ESearch answers with an ``<eSearchResult>`` document holding the total
    hit count (``<Count>``), the page size (``<RetMax>``), the page offset
    (``<RetStart>``) and the IDs of the current page (``<IdList><Id>``).
    Further pages are requested by appending ``&retstart=<offset>`` to the
    start URL.
    NOTE(review): the absolute XPaths assume Scrapy parses the response as
    XML (XML content type) -- confirm.
    """

    name = "NCBI-Spi"

    # Here is where we insert our API call.
    start_urls = [
        'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+toxic+drug'
        ]

    def parse(self, response):
        """Yield ``{'id': <PubMed ID>}`` per hit, then the next-page request."""
        for pubmed_id in response.xpath('/eSearchResult/IdList/Id/text()').extract():
            yield {'id': pubmed_id}

        # Paging information; anchored at the root because <Count> also
        # appears nested elsewhere in the document.
        total = self._int_at(response, '/eSearchResult/Count/text()')
        page_size = self._int_at(response, '/eSearchResult/RetMax/text()')
        offset = self._int_at(response, '/eSearchResult/RetStart/text()')

        # Continue with the next page while there are hits left.
        next_offset = offset + page_size
        if page_size > 0 and next_offset < total:
            next_page = '{}&retstart={}'.format(self.start_urls[0], next_offset)
            yield scrapy.Request(next_page, callback=self.parse)

    @staticmethod
    def _int_at(response, xpath):
        """Return the integer found at *xpath*, or 0 if absent or malformed."""
        text = response.xpath(xpath).extract_first(default='0')
        try:
            return int(text)
        except ValueError:
            return 0
            
    
# Crawler configuration for the NCBI spider: scraped items are exported as JSON to PythonLinks.json.
process = CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'PythonLinks.json',
    # Note that because we are doing API queries, the robots.txt file doesn't apply to us.
    'ROBOTSTXT_OBEY': False,
    'USER_AGENT': 'ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'LOG_ENABLED': False,
    # The page-count limit is 10.
    # NOTE(review): per Scrapy's CloseSpider extension this should close the spider after
    # 10 crawled responses (pages, not links) -- confirm against the Scrapy settings documentation.
    'CLOSESPIDER_PAGECOUNT' : 10
})
                                         

# Starting the crawler with our spider.
# The spider defined in this cell is NCBISpider; no WikiSpider exists in this
# notebook, so referring to it raised a NameError.
process.crawl(NCBISpider)
process.start()
print('First 100 links extracted!')