In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys
import re
import fire

In [2]:
#%%writefile ../pyscrap_url.py

def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up; passed
            straight to `requests.get`. Pass None to wait indefinitely.

    Returns:
        The raw response body (`resp.content`, i.e. bytes) when
        `is_good_response` accepts the response, otherwise None. None is
        also returned when the request raises a `RequestException`
        (including a timeout); the error is reported through `log_error`.
    """
    try:
        # Without a timeout a stalled server would block this call forever.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if not is_good_response(resp):
                return None
            return resp.content

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response is accepted when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A missing or
    empty Content-Type header yields False instead of raising KeyError.
    """
    # `.get` avoids a KeyError when the server sends no Content-Type header;
    # `or ''` also covers a header that is present but set to None.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error by printing it to standard output.

    `e` may be any object; its string form is what gets printed. Swap the
    body for a real logger if printing is not enough.
    """
    message = str(e)
    print(message)
    
def get_elements(url, tag='', search=None, fname=None):
    """
    Downloads (or accepts) an HTML page and returns a list of the items
    extracted from it.

    Parameters:
        url: a string is fetched with `simple_get`; any other value is
            treated as already-loaded HTML and handed to BeautifulSoup as is.
        tag: CSS selector passed to `select`. The text of every matching
            element is split on newlines and each non-blank line is added
            to the result, stripped of surrounding whitespace.
        search: optional dict with the keys 'find' and/or 'find_all', each
            holding the keyword arguments for the BeautifulSoup method of
            the same name. 'find' runs first and narrows the scope that
            'find_all' searches.
        fname: currently unused; kept so existing callers keep working.

    Returns:
        A list with the lines collected through `tag` followed by the
        items collected through `search`.

    Raises:
        Exception: when no content could be retrieved for `url`.
    """
    # Avoid a shared mutable default argument; None means "no search".
    if search is None:
        search = {}

    if isinstance(url, str):
        response = simple_get(url)
    else:
        # if it is already a loaded html page
        response = url

    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    res = []

    if tag:
        for element in html.select(tag):
            for line in element.text.split('\n'):
                # Strip before testing so whitespace-only lines are skipped
                # instead of being stored as empty strings.
                line = line.strip()
                if line:
                    res.append(line)

    if search:
        soup = html
        matches = None

        if 'find' in search:
            print('finding', search['find'])
            soup = soup.find(**search['find'])
            matches = soup

        # `find` returns None when nothing matches; searching inside None
        # would raise AttributeError, so skip `find_all` in that case.
        if 'find_all' in search and soup is not None:
            print('findaing all of', search['find_all'])
            matches = soup.find_all(**search['find_all'])

        if matches:
            for node in list(matches):
                if len(node) > 0:
                    # NOTE(review): extend() adds the *children* of each node
                    # (and the individual characters of a text node), not the
                    # node itself. Kept as in the original; confirm intent.
                    res.extend(node)

    return res
    
    
# Expose `get_elements` as a command-line tool, but only when this code runs
# as a plain Python script. Inside IPython/Jupyter `get_ipython` is defined
# and `__name__` is also '__main__', so the interpreter is checked explicitly
# to avoid launching the CLI from a notebook cell.
try:
    get_ipython
    _in_ipython = True
except NameError:
    _in_ipython = False

if __name__ == '__main__' and not _in_ipython:
    # `fire` is the module; its callable entry point is `fire.Fire`.
    fire.Fire(get_elements)

In [4]:
# Scrape the <li> entries of the Avance Media top-50 Ghanaian journalists article.
JOURNALISTS_URL = 'http://www.businessghana.com/site/news/General/168602/Avance-Media-Announces-2018-Top-50-Ghanaian-Journalist-List'
res = get_elements(JOURNALISTS_URL, tag='li')

In [9]:
# Split every scraped entry into its runs of word characters, spaces and dots.
journ = [re.findall(r"[\w .]+", entry) for entry in res]

# Keep only the entries that split into exactly two parts, then drop the
# first of them. Slicing (rather than pop(0)) keeps the cell from raising
# IndexError when no entry matched.
# NOTE(review): the first pair is presumably not a journalist row — confirm.
journ_names = [parts for parts in journ if len(parts) == 2][1:]

# converting list into a dataframe
journ_df = pd.DataFrame(journ_names)
journ_df
# Uncomment to save the table to disk:
# journ_df.to_csv("journ.csv")

Unnamed: 0,0,1
0,Abdul K. Malik Baako,The New Crusading Guide
1,Afia Adutwumwaa Morosa,Peace FM
2,Afia Pokuaa,Adom FM
3,Akosua Sarpong,UTV
4,Akwasi Sarpong,BBC
5,Ameyaw Debrah,AmeyawDebrah.com
6,Anas Aremeyaw Anas,Tiger Eye PI
7,Bernard Avle,Citi FM
8,Bismark Brown,Atinka TV
9,Bright Nana Amfom,TV3
