In [None]:
# 1. Search PubMed

Search PubMed for papers

https://www.ncbi.nlm.nih.gov/pubmed/

https://www.ncbi.nlm.nih.gov/books/NBK25499/

In [1]:
import lcp.reuse as reuse
from Bio import Entrez
from IPython.display import display

### Trying to refine search query

For example, searching with the most general query, `mimic-ii OR mimic-iii`, returns this false positive: https://www.ncbi.nlm.nih.gov/pubmed/12403307, "*Synthesis of a new antischistosomally active and toxicologically tolerant C-12 monothione surrogate of the universal antihelmintic praziquantel*".

In [8]:
# Contact e-mail that is passed, together with the queries, to
# reuse.search_list in a later cell.
entrez_email = 'mimic-support@physionet.org'

search_strings = [
    # --- Refinement series: the bare MIMIC-II/III query, then the same query
    # --- AND-ed with progressively shorter lists of context keywords.
    'mimic-ii OR mimic-iii',
    # Not run:
    # '(mimic-ii OR mimic-iii OR "mimic 2" OR "mimic 3") AND (database OR clinical OR waveform OR icu OR physionet)',
    '(mimic-ii OR mimic-iii) AND (database OR clinical OR waveform OR icu OR physionet)',
    '(mimic-ii OR mimic-iii) AND (database OR clinical OR waveform OR icu)',
    '(mimic-ii OR mimic-iii) AND (database OR clinical OR waveform)',
    '(mimic-ii OR mimic-iii) AND (database OR clinical)',
    '(mimic-ii OR mimic-iii) AND (database)',
    '(mimic-ii OR mimic-iii) AND (clinical)',
] + [
    # --- Spelling-variant series: deliberately over-capturing, so that the
    # --- results can be reviewed by hand for false positives.
    '(mimic-ii OR mimic-iii OR mimicii OR mimiciii OR mimic-2 OR mimic-3 OR mimic2 OR mimic3)',
    '(mimic-ii OR mimic-iii OR mimicii OR mimiciii OR mimic-2 OR mimic-3 OR mimic2 OR mimic3) AND (icu OR "intensive care" OR physionet)',
    '(mimic-ii OR mimic-iii OR mimicii OR mimiciii OR mimic-2 OR mimic-3 OR mimic2 OR mimic3) AND (icu OR physionet)',
]



In [9]:
# Run every query through the project helper. The result is indexed by the
# query string in the cells below, and each entry exposes `.search_string`
# and `.count`.
# NOTE(review): presumably this queries PubMed via Bio.Entrez using
# entrez_email as the contact address — confirm in lcp.reuse.
search_results = reuse.search_list(search_strings, entrez_email)

In [10]:
# Report the hit count of every query, in the order the queries were defined.
for query_text in search_strings:
    hits = search_results[query_text]
    summary = '%s:\n - Count: %s' % (hits.search_string, hits.count)
    print(summary)
    print('\n')

mimic-ii OR mimic-iii:
 - Count: 124


(mimic-ii OR mimic-iii) AND (database OR clinical OR waveform OR icu OR physionet):
 - Count: 121


(mimic-ii OR mimic-iii) AND (database OR clinical OR waveform OR icu):
 - Count: 120


(mimic-ii OR mimic-iii) AND (database OR clinical OR waveform):
 - Count: 117


(mimic-ii OR mimic-iii) AND (database OR clinical):
 - Count: 117


(mimic-ii OR mimic-iii) AND (database):
 - Count: 102


(mimic-ii OR mimic-iii) AND (clinical):
 - Count: 76


(mimic-ii OR mimic-iii OR mimicii OR mimiciii OR mimic-2 OR mimic-3 OR mimic2 OR mimic3):
 - Count: 169


(mimic-ii OR mimic-iii OR mimicii OR mimiciii OR mimic-2 OR mimic-3 OR mimic2 OR mimic3) AND (icu OR "intensive care" OR physionet):
 - Count: 105


(mimic-ii OR mimic-iii OR mimicii OR mimiciii OR mimic-2 OR mimic-3 OR mimic2 OR mimic3) AND (icu OR physionet):
 - Count: 85




In [None]:
# Diff each of the first six queries against the query that follows it in
# search_strings, i.e. the index pairs (0, 1) through (5, 6).
for idx in range(6):
    current_result = search_results[search_strings[idx]]
    following_result = search_results[search_strings[idx + 1]]
    reuse.showdiff(current_result, following_result)
    

### Conclusion

Only the most general query results in any number of false positives. 

Between the most and second most general queries, the false positives are:
- 'HIV fusion peptide penetrates, disorders, and softens T-cell membrane mimics.' https://www.ncbi.nlm.nih.gov/pubmed/20655315
- 'Synthesis of a new antischistosomally active and toxicologically tolerant C-12 monothione surrogate of the universal antihelmintic praziquantel.' https://www.ncbi.nlm.nih.gov/pubmed/12403307

The false negatives (missed results) are:
- 'Automated Diagnosis Coding with Combined Text Representations.' https://www.ncbi.nlm.nih.gov/pubmed/28423783

# Conclusion 2

We are not using these queries to construct the Google Scholar (GS) queries.

For PubMed, manually review the results of the most general query, '*(mimic-ii OR mimic-iii OR mimicii OR mimiciii OR mimic-2 OR mimic-3 OR mimic2 OR mimic3)*', which returns 169 results.

Of these, 41 were false positives (FPs).

**Total of 128 mimic papers on pubmed.**

# Attempting to parse Google Scholar (GS) automatically failed. The cells below are kept only as evidence of that failure and can be ignored.

# 2. Search Google Scholar

Packages found online:
- https://github.com/ckreibich/scholar.py
- https://github.com/venthur/gscholar
- https://github.com/adeel/google-scholar-scraper
- http://code.activestate.com/recipes/523047-search-google-scholar/
- https://github.com/erdiaker/torrequest
- https://github.com/NikolaiT/GoogleScraper


- https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python


Query: `("mimic ii" OR "mimic iii") AND ("database" OR "clinical" OR "waveform" OR ICU)`

https://scholar.google.com/scholar?q=%28mimic-ii+OR+mimic-iii%29&btnG=&hl=en&as_sdt=1%2C22&as_vis=1

https://scholar.google.com/scholar/help.html


https://superuser.com/questions/565722/how-to-config-tor-to-use-a-http-socks-proxy

## Requirements

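1. Browse with JavaScript enabled. The `requests` library only issues plain HTTP requests and does not execute JavaScript, so without a JS-capable browser Google will (correctly) conclude that you are a robot.
2. Change the IP address for every request; otherwise Google will block you.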

In [None]:
#from torrequest import TorRequest
from bs4 import BeautifulSoup
import urllib2
import getpass
import sys

import stem
import stem.connection

from stem.control import Controller

In [None]:
# Show the IP address that the remote server sees for a request sent through
# the TorRequest session, then ask Tor for a new identity.
with TorRequest(proxy_port=9050, ctrl_port=9051, password=None) as tr:
    response = tr.get('http://ipecho.net/plain')
    print(response.text)
    # The method has to be *called*: a bare `tr.reset_identity` only looks the
    # attribute up, so no new circuit was ever requested.
    tr.reset_identity()


In [None]:
# Show the externally visible IP address again (for comparison with the
# previous run), then ask Tor for a new identity.
with TorRequest(proxy_port=9050, ctrl_port=9051, password=None) as tr:
    response = tr.get('http://ipecho.net/plain')
    print(response.text)
    # Call the method; `tr.reset_identity` without parentheses is a no-op
    # attribute lookup and never signals Tor.
    tr.reset_identity()

In [None]:
# Fetch one Google Scholar result page for "(mimic-ii OR mimic-iii)" with
# as_ylo=2017 and as_yhi=2017 and dump the raw HTML. `resp` is reused by the
# following cells.
with TorRequest(proxy_port=9050, ctrl_port=9051, password=None) as tor_client:
    resp = tor_client.get(
        'https://scholar.google.com/scholar?q=%28mimic-ii+OR+mimic-iii%29&hl=en&as_sdt=1%2C22&as_vis=1&as_ylo=2017&as_yhi=2017'
    )
    print(resp.text)

    # Ask Tor for a fresh circuit, which will likely change the observed IP.
    tor_client.reset_identity()



In [None]:
# Inspect the type of the response body fetched in the previous cell
# (displayed as the cell output).
type(resp.text)

In [None]:
# Parse the fetched HTML with the 'html.parser' backend so the page can be
# queried for tags in the next cell.
soup = BeautifulSoup(resp.text,'html.parser')

In [None]:
# List the target of every <a> tag in the parsed page; anchors that have no
# href attribute are reported as '/'.
for link_tag in soup.find_all('a'):
    target = link_tag.get('href', '/')
    print(target)

In [None]:
from bs4 import BeautifulSoup
import urllib2

# Sanity check of the BeautifulSoup link extraction on a page that does not
# block scripted clients.
webpage = urllib2.urlopen('http://en.wikipedia.org/wiki/Main_Page')
try:
    # The response is handed to the parser while it is still open.
    soup = BeautifulSoup(webpage,'html.parser')
finally:
    # Release the HTTP connection even if parsing fails; the original cell
    # never closed the response.
    webpage.close()

# Print every link target; anchors without an href are reported as '/'.
for anchor in soup.find_all('a'):
    print(anchor.get('href', '/'))

In [None]:
# Same IP echo check, using TorRequest's default constructor arguments.
with TorRequest() as tor_client:
    ip_reply = tor_client.get('http://ipecho.net/plain')
    # Expected to differ from this machine's own address.
    print(ip_reply.text)


In [None]:
# `cookielib` was used below without ever being imported (NameError). Import
# it here, falling back to its Python 3 name.
try:
  import cookielib                    # Python 2
except ImportError:
  import http.cookiejar as cookielib  # Python 3

# Walk-through of the TorRequest API: fetch a page, rotate the circuit, then
# poke at the underlying Stem controller and Requests session.
with TorRequest(proxy_port=9050, ctrl_port=9051, password=None) as tr:

  # Specify HTTP verb and url.
  resp = tr.get('https://scholar.google.com/scholar?q=%28mimic-ii+OR+mimic-iii%29&hl=en&as_sdt=1%2C22&as_vis=1&as_ylo=2017&as_yhi=2017')
  print(resp.text)

#   # Send data. Use basic authentication.
#   resp = tr.post('https://api.example.com',
#     data={'foo': 'bar'}, auth=('user', 'pass'))
#   print(resp.json)

  # Change your Tor circuit,
  # and likely your observed IP address.
  tr.reset_identity()

  # TorRequest object also exposes the underlying Stem controller
  # and Requests session objects for more flexibility.

  print(type(tr.ctrl))            # a stem.control.Controller object
  tr.ctrl.signal('CLEARDNSCACHE') # see Stem docs for the full API

  print(type(tr.session))         # a requests.Session object
  c = cookielib.CookieJar()
  tr.session.cookies.update(c)    # see Requests docs for the full API

In [None]:
# URLs used by the scraping experiments in this notebook.

# Test page served from this machine (127.0.0.1, port 81).
test_js_url = 'http://127.0.0.1:81/test-js.html'

# Endpoint used to check which address the remote side sees.
echo_ip_url = 'https://www.atagar.com/echo.php'

# Google Scholar query for "sepsis mimic-iii".
scholar_url = 'https://scholar.google.com/scholar?as_vis=1&q=sepsis+mimic-iii&hl=en&as_sdt=1,22'

In [None]:
# Repeat of the Google Scholar fetch: request the result page for
# "(mimic-ii OR mimic-iii)" with as_ylo=2017 and as_yhi=2017, print the raw
# HTML, then rotate the Tor circuit.
with TorRequest(proxy_port=9050, ctrl_port=9051, password=None) as tor_client:
    resp = tor_client.get(
        'https://scholar.google.com/scholar?q=%28mimic-ii+OR+mimic-iii%29&hl=en&as_sdt=1%2C22&as_vis=1&as_ylo=2017&as_yhi=2017'
    )
    print(resp.text)

    # Unused example of sending data with basic authentication:
    #   resp = tor_client.post('https://api.example.com',
    #     data={'foo': 'bar'}, auth=('user', 'pass'))
    #   print(resp.json)

    # A new circuit will likely change the IP address observed by the server.
    tor_client.reset_identity()

In [None]:
import io
import pycurl

import stem.process

from stem.util import term

SOCKS_PORT = 9000

def query(url):
  """
  Fetch ``url`` with pycurl through the SOCKS5 proxy on localhost:SOCKS_PORT.

  Returns the raw response body (the bytes collected in an ``io.BytesIO``)
  on success. If the transfer fails with ``pycurl.error``, a human-readable
  "Unable to reach ..." string is returned instead of raising.
  """

  output = io.BytesIO()

  # Named `curl` rather than `query` so the local no longer shadows this
  # function's own name.
  curl = pycurl.Curl()
  try:
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.PROXY, 'localhost')
    curl.setopt(pycurl.PROXYPORT, SOCKS_PORT)
    # SOCKS5_HOSTNAME: host names are passed to the proxy for resolution.
    curl.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5_HOSTNAME)
    curl.setopt(pycurl.WRITEFUNCTION, output.write)

    try:
      curl.perform()
      return output.getvalue()
    except pycurl.error as exc:
      return "Unable to reach %s (%s)" % (url, exc)
  finally:
    # Always release the curl handle; the original never closed it.
    curl.close()


# Start an instance of Tor configured to only exit through Russia. This prints
# Tor's bootstrap information as it starts. Note that this likely will not
# work if you have another Tor instance running.

def print_bootstrap_lines(line):
  """Echo ``line`` in blue when it contains "Bootstrapped "; ignore all other lines.

  Passed as ``init_msg_handler`` to ``launch_tor_with_config`` in this cell.
  """
  if "Bootstrapped " not in line:
    return
  highlighted = term.format(line, term.Color.BLUE)
  print(highlighted)


print(term.format("Starting Tor:\n", term.Attr.BOLD))

# Launch a private Tor instance listening on SOCKS_PORT and restricted to
# the '{ru}' exit nodes; bootstrap progress is echoed by print_bootstrap_lines.
tor_process = stem.process.launch_tor_with_config(
  config = {
    'SocksPort': str(SOCKS_PORT),
    'ExitNodes': '{ru}',
  },
  init_msg_handler = print_bootstrap_lines,
)

try:
  print(term.format("\nChecking our endpoint:\n", term.Attr.BOLD))
  print(term.format(query("https://www.atagar.com/echo.php"), term.Color.BLUE))
finally:
  # Stop Tor even if the endpoint check raises, so that no stray process is
  # left holding SOCKS_PORT.
  tor_process.kill()  # stops tor

In [None]:
# Re-run the endpoint check and keep the result for inspection.
# NOTE(review): the Tor process started in the previous cell has already been
# killed there, so unless another proxy is listening on SOCKS_PORT this
# presumably yields query()'s "Unable to reach ..." string — confirm.
q = query("https://www.atagar.com/echo.php")

In [None]:
import dryscrape
# Create a dryscrape browser session and point it at a proxy on port 9050
# (the same port number the TorRequest cells use as proxy_port).
s = dryscrape.Session()
s.set_proxy(port=9050)

In [1]:
import stem
from stem.control import Controller
from stem.process import launch_tor_with_config
import requests
import dryscrape
import time

class TorRequest(object):
  """HTTP client whose ``requests`` session is routed through a Tor SOCKS proxy.

  On construction the object connects to a Tor control port (launching a Tor
  process first when no controller is reachable on that port), authenticates,
  and creates a ``requests.Session`` that uses ``socks5://localhost:<proxy_port>``
  for both http and https URLs.

  Usable as a context manager; leaving the ``with`` block calls ``close()``.

  Attributes:
    proxy_port: port of the Tor SOCKS proxy.
    ctrl_port: port of the Tor control interface.
    ctrl: the connected ``stem.control.Controller``.
    session: the proxied ``requests.Session``.
  """

  def __init__(self,
      proxy_port=9050,
      ctrl_port=9051,
      password=None):
    """Connect to (or launch) Tor and build the proxied session.

    Args:
      proxy_port: SOCKS port to send traffic through.
      ctrl_port: control port used for authentication and signals.
      password: value handed to ``Controller.authenticate``.

    Any exception raised while connecting or authenticating propagates to
    the caller after everything acquired so far has been released.
    """
    self.proxy_port = proxy_port
    self.ctrl_port = ctrl_port

    # Defined up front so close() is safe even if setup fails part-way.
    self.ctrl = None
    self.session = None

    self._tor_proc = None
    if not self._tor_process_exists():
      self._tor_proc = self._launch_tor()

    ready = False
    try:
      self.ctrl = Controller.from_port(port=self.ctrl_port)
      self.ctrl.authenticate(password=password)

      self.session = requests.Session()
      proxy_url = 'socks5://localhost:%d' % self.proxy_port
      # The scheme keys must be exactly 'http' and 'https'. The previous
      # 'https:' key (stray colon) never matched, so HTTPS requests were
      # sent directly instead of through Tor.
      self.session.proxies.update({
        'http': proxy_url,
        'https': proxy_url,
      })
      ready = True
    finally:
      if not ready:
        # Do not leak the control socket or a Tor process we launched.
        self.close()

  def _tor_process_exists(self):
    """Return True if a controller connection to ``ctrl_port`` can be opened."""
    try:
      ctrl = Controller.from_port(port=self.ctrl_port)
      ctrl.close()
      return True
    except Exception:
      # Any connection failure is treated as "no Tor listening here".
      return False

  def _launch_tor(self):
    """Start a Tor process on the configured SOCKS and control ports."""
    return launch_tor_with_config(
      config={
        'SocksPort': str(self.proxy_port),
        'ControlPort': str(self.ctrl_port)
      },
      take_ownership=True)

  def close(self):
    """Release the session and controller and stop Tor if this object launched it.

    Closing is best-effort: errors from closing the session or the
    controller are ignored so that the remaining cleanup still runs.
    """
    for resource in (self.session, self.ctrl):
      if resource is None:
        continue
      try:
        resource.close()
      except Exception:
        pass  # best-effort cleanup

    if self._tor_proc:
      self._tor_proc.terminate()
      # Forget the handle so a second close() does not terminate again.
      self._tor_proc = None

  def reset_identity_async(self):
    """Send the NEWNYM signal to Tor without waiting."""
    self.ctrl.signal(stem.Signal.NEWNYM)

  def reset_identity(self):
    """Send NEWNYM, then sleep for the wait time reported by the controller."""
    self.reset_identity_async()
    time.sleep(self.ctrl.get_newnym_wait())

  def get(self, *args, **kwargs):
    """Forward to ``session.get``."""
    return self.session.get(*args, **kwargs)

  def post(self, *args, **kwargs):
    """Forward to ``session.post``."""
    return self.session.post(*args, **kwargs)

  def put(self, *args, **kwargs):
    """Forward to ``session.put``."""
    return self.session.put(*args, **kwargs)

  def patch(self, *args, **kwargs):
    """Forward to ``session.patch``."""
    return self.session.patch(*args, **kwargs)

  def delete(self, *args, **kwargs):
    """Forward to ``session.delete``."""
    return self.session.delete(*args, **kwargs)

  def __enter__(self):
    """Return this object for use in a ``with`` block."""
    return self

  def __exit__(self, *args):
    """Close on leaving the ``with`` block; exceptions are not suppressed."""
    self.close()

In [2]:
# Show the externally visible IP address, authenticating to the Tor control
# port with a password.
import os

# Do not commit control-port credentials to the notebook: read the password
# from the environment (None when the variable is unset).
# NOTE(review): the value previously hard-coded here started with "16:",
# which looks like a *hashed* control password (as written in torrc) rather
# than the plain-text password that authentication needs — confirm.
tor_control_password = os.environ.get('TOR_CONTROL_PASSWORD')

with TorRequest(proxy_port=9050, ctrl_port=9051, password=tor_control_password) as tr:
    response = tr.get('http://ipecho.net/plain')
    print(response.text)
    # Call the method; without parentheses no new identity is requested.
    tr.reset_identity()

UnreadableCookieFile: Authentication failed: '/var/run/tor/control.authcookie' doesn't exist

In [None]:
import dryscrape
import sys
from bs4 import BeautifulSoup
import time

# Pages visited by the dryscrape experiments below.
test_js_url = 'http://127.0.0.1:81/test-js.html'
echo_ip_url = 'http://ipecho.net/plain'
scholar_url = 'https://scholar.google.com/scholar?as_vis=1&q=sepsis+mimic-iii&hl=en&as_sdt=1,22'

# On Linux, bring up a virtual X server (xvfb) in case no X display is
# running. xvfb must already be installed for this to work.
if 'linux' in sys.platform:
    dryscrape.start_xvfb()

In [None]:
# Open a fresh dryscrape session, load the local test page and show the page
# body as the cell output.
s = dryscrape.Session()
s.visit(test_js_url)
s.body()

# Disabled follow-up: visit Google Scholar, wait for a specific element to
# appear, then parse the rendered body.
#s.visit('https://scholar.google.com/scholar?as_vis=1&q=sepsis+mimic-iii&hl=en&as_sdt=1,22')
# waiting for the first data row in a table to be present
# s.wait_for(lambda: s.at_css("tr.data-row0"))

# soup = BeautifulSoup(s.body(), 'lxml')

In [None]:
# New dryscrape session that sends its traffic through a proxy on
# localhost:8118, then loads the IP echo page.
# NOTE(review): 8118 is presumably a local HTTP proxy in front of Tor —
# confirm which service is expected to listen there.
s = dryscrape.Session()

s.set_proxy(host = "localhost", port = 8118)
#time.sleep(20)
s.visit(echo_ip_url)
# The page body is displayed in the next cell.
#s.body()

In [None]:
# Show the body of the page loaded by the session in the previous cell.
s.body()