In [427]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from lxml.html import fromstring
from itertools import cycle
import traceback
import string
import time
import pickle

In [None]:
# Load the two raw inputs from ../data/raw: the NADAC file is comma-separated
# (read_csv default), while the drugsComTrain file is tab-separated.
price_data = pd.read_csv(
    filepath_or_buffer='../data/raw/NADAC__National_Average_Drug_Acquisition_Cost_.csv',
)
drug_data = pd.read_csv(
    filepath_or_buffer='../data/raw/drugsComTrain_raw.tsv',
    sep='\t',
)

In [None]:
"""
URL format on askapatient: 
    https://www.askapatient.com/viewrating.asp?drug=NUM&page=PAGE
    NUM ranges up to at least 200000, but a page does not exist for every integer in that range
    PAGE ranges up to about 50 or 100; for any PAGE greater than the last real page, the site repeats the last page
    
Issues: 
    Access to page blocked when using 'requests' library
"""

In [45]:
def get_proxies(timeout=30):
    """Scrape https://free-proxy-list.net/ and return candidate proxies.

    Only table rows whose 7th cell contains the text "yes" are kept
    (presumably the site's HTTPS-support column -- TODO confirm against the
    live page layout).

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the proxy-list site before ``requests`` raises.
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    list of str
        Proxies formatted as ``"IP:PORT"``; empty if no row qualifies.
    """
    url = 'https://free-proxy-list.net/'
    response = requests.get(url, timeout=timeout)
    parser = fromstring(response.text)
    proxies = []
    for row in parser.xpath('//tbody/tr'):
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        # Grab IP and corresponding PORT; skip rows where either cell has no
        # text instead of failing with IndexError on the [0] lookup.
        ip = row.xpath('.//td[1]/text()')
        port = row.xpath('.//td[2]/text()')
        if ip and port:
            proxies.append(":".join([ip[0], port[0]]))
    return proxies

# Pool of desktop browser User-Agent strings that random_headers() samples from.
desktop_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
]


def random_headers():
    """Return HTTP request headers with a randomly drawn User-Agent.

    The User-Agent is sampled from ``desktop_agents`` with
    ``np.random.choice``; the Accept header is a fixed string.
    """
    chosen_agent = np.random.choice(desktop_agents)
    accepted_types = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    return {'User-Agent': chosen_agent, 'Accept': accepted_types}
                

In [343]:
"""
Primary code ripped from https://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/
Proxy rotation ripped from https://www.scrapehero.com/how-to-rotate-proxies-and-ip-addresses-using-python-3/
"""


      

In [541]:
def parse_for_page_count(url, max_retries=None, timeout=30):
    """Fetch a drug's review page through a random proxy and count its pages.

    The count starts at 1 and is incremented for every anchor whose href
    starts with ``viewrating.asp?drug=`` and contains ``page``. A response
    without any ``<table>`` element is treated as a block page and retried.

    Retries run in a loop rather than by recursion, so a long streak of bad
    proxies cannot end in ``RecursionError``.

    Parameters
    ----------
    url : str
        Review URL of the drug.
    max_retries : int or None, optional
        Number of retries allowed after the first attempt. ``None`` (default)
        keeps retrying until a request succeeds.
    timeout : float, optional
        Seconds to wait on each request before treating it as a failure.

    Returns
    -------
    int
        Page count as described above (at least 1).

    Raises
    ------
    RuntimeError
        If ``max_retries`` is set and is exhausted.
    """
    link_prefix = 'viewrating.asp?drug='
    attempt = 0
    while True:
        # A fresh proxy list is fetched for every attempt.
        proxy = str(np.random.choice(get_proxies()))
        try:
            response = requests.get(url, proxies={"http": proxy, "https": proxy},
                                    headers=random_headers(), timeout=timeout)
        except Exception:
            # Deliberately broad (free proxies fail in many ways) but, unlike
            # a bare ``except:``, lets KeyboardInterrupt/SystemExit through.
            failure = "Failed to connect"
        else:
            soup = BeautifulSoup(response.text, 'lxml')
            if soup.find_all('table'):
                to_return = 1
                for a in soup.find_all('a'):
                    # Anchors without an href yield None; treat them as ''.
                    href = a.get('href') or ''
                    if href.startswith(link_prefix) and 'page' in href:
                        to_return += 1
                print("Successful page count!")
                return to_return
            failure = 'Got blocked'

        print(failure)
        attempt += 1
        if max_retries is not None and attempt > max_retries:
            raise RuntimeError('Giving up on ' + url + ' after ' + str(attempt) + ' attempts')
        # Random pause of up to 3 seconds before trying another proxy.
        time.sleep(np.random.rand()*3)
        
def parse_drug_list(url, max_retries=None, timeout=30):
    """Fetch a drug index page through a random proxy and extract drug ids.

    For every anchor whose href starts with ``viewrating.asp?drug=``, the text
    between that prefix and the first ``&`` is collected. A page that yields
    no such ids is treated as a block page and retried.

    Retries run in a loop rather than by recursion, so a long streak of bad
    proxies cannot end in ``RecursionError``.

    Parameters
    ----------
    url : str
        URL of the index page to scrape.
    max_retries : int or None, optional
        Number of retries allowed after the first attempt. ``None`` (default)
        keeps retrying until a request succeeds.
    timeout : float, optional
        Seconds to wait on each request before treating it as a failure.

    Returns
    -------
    list of str
        Extracted id strings, in document order (never empty).

    Raises
    ------
    RuntimeError
        If ``max_retries`` is set and is exhausted.
    """
    link_prefix = 'viewrating.asp?drug='
    attempt = 0
    while True:
        # A fresh proxy list is fetched for every attempt.
        proxy = str(np.random.choice(get_proxies()))
        try:
            response = requests.get(url, proxies={"http": proxy, "https": proxy},
                                    headers=random_headers(), timeout=timeout)
        except Exception:
            # Deliberately broad (free proxies fail in many ways) but, unlike
            # a bare ``except:``, lets KeyboardInterrupt/SystemExit through.
            failure = "Failed to connect"
        else:
            soup = BeautifulSoup(response.text, 'lxml')
            # Anchors without an href return None, which cannot be sliced;
            # substitute '' so they are simply filtered out.
            hrefs = [a.get('href') or '' for a in soup.find_all('a')]
            to_return = [href[len(link_prefix):].split('&')[0]
                         for href in hrefs if href.startswith(link_prefix)]
            if to_return:
                print("Successful drug list parse!")
                return to_return
            failure = 'Got blocked'

        print(failure)
        attempt += 1
        if max_retries is not None and attempt > max_retries:
            raise RuntimeError('Giving up on ' + url + ' after ' + str(attempt) + ' attempts')
        # Random pause of up to 3 seconds before trying another proxy.
        time.sleep(np.random.rand()*3)
    
def parse_html_table(table):
    """Convert a reviews ``<table>`` element into a DataFrame of strings.

    The first 16 ``<td>`` cells are skipped (presumably header/layout cells --
    TODO confirm against the page markup). The remaining cells are read as
    rows of 8, one cell per output column. The first character of each RATING
    and REASON cell is dropped (presumably a leading whitespace character --
    TODO confirm).

    Parameters
    ----------
    table : object
        Anything exposing ``find_all('td')`` whose items have ``get_text()``,
        e.g. a BeautifulSoup ``<table>`` tag.

    Returns
    -------
    pandas.DataFrame
        Columns RATING, REASON, SIDE EFFECTS, COMMENTS, SEX, AGE,
        DURATION/DOSAGE and DATE ADDED, one row per review.
    """
    column_names = ['RATING', 'REASON', 'SIDE EFFECTS', 'COMMENTS',
                    'SEX', 'AGE', 'DURATION/DOSAGE', 'DATE ADDED']
    row_width = len(column_names)

    body_cells = table.find_all('td')[16:]

    columns = {}
    for offset, name in enumerate(column_names):
        texts = [cell.get_text() for cell in body_cells[offset::row_width]]
        if offset < 2:
            # RATING and REASON: drop the first character of the cell text.
            texts = [text[1:] for text in texts]
        columns[name] = texts

    return pd.DataFrame(columns)

def parse_url_for_tables(url, max_retries=None, timeout=30):
    """Fetch one review page through a random proxy and parse its reviews.

    A response without any ``<table>`` element is treated as a block page and
    retried. Otherwise the drug name is the first space-separated word of the
    first ``<h1>`` inside the first table, and the reviews come from passing
    the second table to ``parse_html_table``.

    Retries run in a loop rather than by recursion, so a long streak of bad
    proxies cannot end in ``RecursionError``.

    Parameters
    ----------
    url : str
        URL of a single review page.
    max_retries : int or None, optional
        Number of retries allowed after the first attempt. ``None`` (default)
        keeps retrying until a request succeeds.
    timeout : float, optional
        Seconds to wait on each request before treating it as a failure.

    Returns
    -------
    tuple
        ``(drug, reviews)``: the drug name string and the reviews DataFrame.

    Raises
    ------
    RuntimeError
        If ``max_retries`` is set and is exhausted.
    IndexError
        If the page has tables but fewer than two, or the first table has no
        ``<h1>`` (unchanged from the original behaviour).
    """
    attempt = 0
    while True:
        # A fresh proxy list is fetched for every attempt.
        proxy = str(np.random.choice(get_proxies()))
        try:
            response = requests.get(url, proxies={"http": proxy, "https": proxy},
                                    headers=random_headers(), timeout=timeout)
        except Exception:
            # Deliberately broad (free proxies fail in many ways) but, unlike
            # a bare ``except:``, lets KeyboardInterrupt/SystemExit through.
            failure = "Failed to connect"
        else:
            soup = BeautifulSoup(response.text, 'lxml')
            all_tables = soup.find_all('table')
            if all_tables:
                # Only the first word of the heading is kept as the drug name.
                drug = all_tables[0].find_all('h1')[0].get_text().split(' ')[0]
                reviews = parse_html_table(all_tables[1])
                print("Successful review table parse!")
                return drug, reviews
            failure = 'Got blocked'

        print(failure)
        attempt += 1
        if max_retries is not None and attempt > max_retries:
            raise RuntimeError('Giving up on ' + url + ' after ' + str(attempt) + ' attempts')
        # Random pause of up to 3 seconds before trying another proxy.
        time.sleep(np.random.rand()*3)
    
# Crawl askapatient.com: for each letter A-Z fetch the drug index, then for
# each drug fetch every review page and collect (drug name, reviews DataFrame)
# tuples in drug_table_tuples.
drug_table_tuples = []
for letter in string.ascii_uppercase:
    drug_indices = parse_drug_list('https://www.askapatient.com/drugalpha.asp?letter='+letter)
    for drug_id in drug_indices:
        # Only ids ending in a digit are crawled. Slicing with [-1:] keeps an
        # empty id from raising IndexError; it is simply skipped.
        if not drug_id[-1:].isdigit():
            continue
        num_pages = parse_for_page_count('https://www.askapatient.com/viewrating.asp?drug='+drug_id)
        for page in range(1, num_pages+1):
            review_url = 'https://www.askapatient.com/viewrating.asp?drug='+drug_id+'&page='+str(page)
            drug, reviews = parse_url_for_tables(review_url)
            drug_table_tuples.append((drug, reviews))
            print('Added reviews for '+drug+", page "+str(page))
            # Checkpoint after every page so an interrupted crawl keeps
            # everything scraped so far (the whole list is rewritten each time).
            with open('test.pkl', 'wb') as f:
                pickle.dump(drug_table_tuples, f)

Failed to connect
Failed to connect
Failed to connect
Successful drug list parse!
Failed to connect
Failed to connect
Failed to connect
Got blocked
Got blocked
Got blocked
Failed to connect
Failed to connect
Failed to connect
Successful page count!
Successful review table parse!
Added reviews for A/T/S, page 1
Successful page count!
Failed to connect
Failed to connect
Failed to connect
Failed to connect
Got blocked
Got blocked
Got blocked
Failed to connect
Got blocked
Failed to connect
Got blocked
Failed to connect
Successful review table parse!
Added reviews for ABILIFY, page 1
Failed to connect
Got blocked
Successful review table parse!
Added reviews for ABILIFY, page 2
Successful review table parse!
Added reviews for ABILIFY, page 3
Failed to connect
Failed to connect
Got blocked
Successful review table parse!
Added reviews for ABILIFY, page 4
Failed to connect
Successful review table parse!
Added reviews for ABILIFY, page 5
Successful review table parse!
Added reviews for ABILIFY, 

KeyboardInterrupt: 

In [542]:
# Reload the checkpoint written by the crawl loop.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('test.pkl', mode='rb') as checkpoint_file:
    aaaa = pickle.load(checkpoint_file)