## Web scraping using Python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [1]:
# $ python3 -m venv venv
# $ . ./venv/bin/activate

In [2]:
# #Better
# !pip install requests BeautifulSoup4 fire

In [3]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys

import fire

In [4]:
#%%writefile ../pyscrap_url.py

def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET request and return the response body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple or None, optional
        Passed straight to `requests.get`. Without a timeout a server that
        stops responding would block this call indefinitely; pass `None` to
        restore that unbounded behaviour.

    Returns
    -------
    The response's `content` attribute when `is_good_response` accepts the
    response, otherwise `None`. `None` is also returned (after logging via
    `log_error`) when the request raises a `RequestException`, which
    includes timeouts.
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # the response is rejected or reading it fails.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Return True if the response looks like a successful HTML page.

    A response qualifies when its status code is 200 and its
    `Content-Type` header contains the substring 'html' (case-insensitive).
    A response without a `Content-Type` header is rejected rather than
    raising `KeyError`.
    """
    # .get() instead of [] so a missing header yields None instead of raising.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error.

    The current implementation simply prints the string form of `e` to
    standard output; swap the body for real logging if needed.
    """
    message = str(e)
    print(message)
    
def get_elements(url, tag='', search=None, fname=None):
    """
    Load an HTML page and collect elements from it.

    Parameters
    ----------
    url : str or markup
        A `str` is always treated as an address and fetched with
        `simple_get`; any other value is handed to BeautifulSoup unchanged,
        as already-loaded markup.
    tag : str, optional
        Selector passed to `BeautifulSoup.select`. The text of every match
        is split on newlines and each non-blank line is added, stripped of
        surrounding whitespace.
    search : dict, optional
        Only the keys 'find' and 'find_all' are read; any other key is
        ignored. Each value is a dict of keyword arguments forwarded to
        `find` / `find_all`. When both are given, `find_all` runs on the
        node returned by `find`.
    fname : optional
        Accepted but not used by this function.

    Returns
    -------
    list
        Lines collected through `tag`, followed by whatever the `search`
        step contributed.

    Raises
    ------
    Exception
        If no content could be obtained for `url`.
    """
    if isinstance(url, str):
        response = simple_get(url)
    else:
        # Not a string: assume the caller passed an already-loaded page.
        response = url

    # Fail early if there is nothing to parse.
    if response is None:
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    res = []

    if tag:
        for element in html.select(tag):
            for line in element.text.split('\n'):
                # Strip before testing so whitespace-only lines are skipped
                # instead of being stored as empty strings.
                line = line.strip()
                if line:
                    res.append(line)

    if search:
        soup = html
        r = ''

        if 'find' in search:
            print('findaing', search['find'])
            soup = soup.find(**search['find'])
            r = soup

        if 'find_all' in search:
            print('findaing all of', search['find_all'])
            # find() yields None when nothing matched; searching inside None
            # would raise AttributeError, so treat it as "no results".
            r = soup.find_all(**search['find_all']) if soup is not None else []

        if r:
            # NOTE(review): extend() adds whatever iterating each node yields
            # (for a tag, presumably its child nodes), not the node's text.
            # Confirm this is the intended output.
            for x in list(r):
                if len(x) > 0:
                    res.extend(x)

    return res
    
    
# Expose get_elements as a command-line tool when this code is run as a plain
# script (e.g. after exporting the cell with %%writefile). Inside
# IPython/Jupyter `get_ipython` exists and `__name__` is '__main__' too, so the
# interactive case is detected explicitly and the CLI is not started there.
try:
    get_ipython()
    _running_interactively = True
except NameError:
    _running_interactively = False

if __name__ == '__main__' and not _running_interactively:
    fire.Fire(get_elements)

## Getting the Top 10 Influencers

In [5]:
# Scrape every <h2> heading from the "100 most influential Twitter users" article
influential_users_url = 'https://africafreak.com/100-most-influential-twitter-users-in-africa'
influential_users_tag = 'h2'

influential_users = get_elements(
    influential_users_url,
    tag=influential_users_tag,
)
influential_users[:5]

['100. Jeffrey Gettleman (@gettleman)',
 '99. Africa24 Media (@a24media)',
 '98. Scapegoat (@andiMakinana)',
 '97. Africa Check (@AfricaCheck)',
 '96. James Copnall (@JamesCopnall)']

In [6]:
# Load the scraped headings into a one-column DataFrame (the column is labelled 0)
users_df = pd.DataFrame(data=influential_users)
users_df.head()

Unnamed: 0,0
0,100. Jeffrey Gettleman (@gettleman)
1,99. Africa24 Media (@a24media)
2,98. Scapegoat (@andiMakinana)
3,97. Africa Check (@AfricaCheck)
4,96. James Copnall (@JamesCopnall)


In [7]:
# Split each heading of the form "<rank>. <user> (<handle>)" into three columns.
# n = 1 splits on the first '.' only, so a period inside the user name
# (e.g. an initial) no longer cuts the row short and loses its handle.
users_split = users_df[0].str.split('.', n = 1, expand = True)
users_df['rank'] = users_split[0].str.strip()

# Split on the last '(' so the handle is found even if the name has parentheses;
# strip() removes the spaces left around the name by the two splits.
users_split = users_split[1].str.rsplit('(', n = 1, expand = True)
users_df['user'] = users_split[0].str.strip()

users_split = users_split[1].str.split(')', n = 1, expand = True)
users_df['handle'] = users_split[0].str.strip()
users_df.head()

Unnamed: 0,0,rank,user,handle
0,100. Jeffrey Gettleman (@gettleman),100,Jeffrey Gettleman,@gettleman
1,99. Africa24 Media (@a24media),99,Africa24 Media,@a24media
2,98. Scapegoat (@andiMakinana),98,Scapegoat,@andiMakinana
3,97. Africa Check (@AfricaCheck),97,Africa Check,@AfricaCheck
4,96. James Copnall (@JamesCopnall),96,James Copnall,@JamesCopnall


In [8]:
# The raw heading (column 0) has been split into its parts, so remove it
users_df = users_df.drop(labels=0, axis='columns')
users_df.head()

Unnamed: 0,rank,user,handle
0,100,Jeffrey Gettleman,@gettleman
1,99,Africa24 Media,@a24media
2,98,Scapegoat,@andiMakinana
3,97,Africa Check,@AfricaCheck
4,96,James Copnall,@JamesCopnall


In [9]:
# Inspect the final five rows of the frame
users_df.tail(5)

Unnamed: 0,rank,user,handle
100,Subscribe to the list,,
101,Tweet about Africa?,,
102,Celebrate Wild Africa With Us!,,
103,You have successfully subscribed,Thank you!,
104,11 Comments,,


In [10]:
# Remove every row that has at least one missing value
users_df = users_df.dropna(axis='index', how='any')
users_df.tail()

Unnamed: 0,rank,user,handle
94,6,Helen Zille,@helenzille
95,5,Julius Sello Malema,@Julius_S_Malema
96,4,News24,@News24
98,2,Gareth Cliff,@GarethCliff
99,1,Trevor Noah,@Trevornoah


In [11]:
# Cast 'rank' to 64-bit integers so it orders numerically, then show the dtypes
rank_as_int = users_df['rank'].astype('int64')
users_df['rank'] = rank_as_int
users_df.dtypes

rank       int64
user      object
handle    object
dtype: object

In [12]:
# Display the users ordered from the lowest rank number upwards
users_df.sort_values(by='rank', ascending=True)

Unnamed: 0,rank,user,handle
99,1,Trevor Noah,@Trevornoah
98,2,Gareth Cliff,@GarethCliff
96,4,News24,@News24
95,5,Julius Sello Malema,@Julius_S_Malema
94,6,Helen Zille,@helenzille
...,...,...,...
4,96,James Copnall,@JamesCopnall
3,97,Africa Check,@AfricaCheck
2,98,Scapegoat,@andiMakinana
1,99,Africa24 Media,@a24media


In [13]:
# Keep the ten rows with the lowest rank numbers present in the frame
users_top_10 = users_df.sort_values(by='rank').head(10)
users_top_10

Unnamed: 0,rank,user,handle
99,1,Trevor Noah,@Trevornoah
98,2,Gareth Cliff,@GarethCliff
96,4,News24,@News24
95,5,Julius Sello Malema,@Julius_S_Malema
94,6,Helen Zille,@helenzille
93,7,mailandguardian,@mailandguardian
92,8,5FM,@5FM
91,9,loyiso gola,@loyisogola
90,10,Computicket,@Computicket
89,11,MTV Base Africa,@MTVbaseAfrica


In [14]:
# Write the selection to disk, leaving out the DataFrame index
users_top_10.to_csv(path_or_buf='Karen_top_10.csv', index=False)

## Getting the list of Government Officials

In [15]:
# get data on government officials
government_officials_url = "https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa"
government_officials_tag = "blockquote"
government_officials_search = {"class" : "twitter-tweet"}

# get_elements only reads the 'find' / 'find_all' keys of `search`, so passing
# {"class": ...} there was silently ignored and every <blockquote> was returned.
# `tag` is handed to BeautifulSoup's select(), which takes CSS selectors, so the
# class filter is expressed as "blockquote.twitter-tweet" instead.
government_officials_selector = "{}.{}".format(government_officials_tag,
                                               government_officials_search["class"])

government_officials = get_elements(government_officials_url,
                                    tag = government_officials_selector)
government_officials[0]

"The Deputy Prime Minister Themba Masuku has today met representatives of the private sector and employees' unions to map a collaborative effort in the fight against #COVID19. pic.twitter.com/EIYNGOEKRN— Eswatini Government (@EswatiniGovern1) March 20, 2020"

In [16]:
# Keep only the attribution that follows the tweet text ("— Name (@handle) date").
# rsplit on the LAST em dash, so a dash inside the tweet text itself does not
# yield a fragment of the tweet. Items without any em dash carry no attribution
# and are skipped instead of raising IndexError.
gov_split = [
    item.rsplit('—', 1)[1]
    for item in government_officials
    if '—' in item
]

gov_split[:3]

[' Eswatini Government (@EswatiniGovern1) March 20, 2020',
 ' Malawi Government (@MalawiGovt) March 18, 2020',
 ' Hage G. Geingob (@hagegeingob) March 18, 2020']

In [17]:
# Load the attribution strings into a one-column DataFrame (the column is labelled 0)
gov_df = pd.DataFrame(data=gov_split)
gov_df.head()

Unnamed: 0,0
0,Eswatini Government (@EswatiniGovern1) March ...
1,"Malawi Government (@MalawiGovt) March 18, 2020"
2,"Hage G. Geingob (@hagegeingob) March 18, 2020"
3,Seychelles Ministry of Finance (@FinanceSC) M...
4,"PresidencyZA (@PresidencyZA) March 19, 2020"


In [18]:
# Split "<name> (<handle>) <date>" into Gov. Official and Handle.
# Splitting on the LAST '(' keeps names that themselves contain parentheses
# intact; strip() removes the spaces that surround the name in the raw text,
# so they do not end up in the saved CSV.
gov_split_further = gov_df[0].str.rsplit('(', n = 1, expand = True)
gov_df['Gov. Official'] = gov_split_further[0].str.strip()

# Everything before the first ')' of the remainder is the handle.
gov_split_further = gov_split_further[1].str.split(')', n = 1, expand = True)
gov_df['Handle'] = gov_split_further[0].str.strip()

gov_df = gov_df.drop(columns = 0)
gov_df.head()

Unnamed: 0,Gov. Official,Handle
0,Eswatini Government,@EswatiniGovern1
1,Malawi Government,@MalawiGovt
2,Hage G. Geingob,@hagegeingob
3,Seychelles Ministry of Finance,@FinanceSC
4,PresidencyZA,@PresidencyZA


In [19]:
# Inspect the final five rows of the officials frame
gov_df.tail(5)

Unnamed: 0,Gov. Official,Handle
31,Issoufou Mahamadou,@IssoufouMhm
32,Muhammadu Buhari,@MBuhari
33,Macky Sall,@Macky_Sall
34,President Julius Maada Bio,@PresidentBio
35,Ministère de la Santé et de l'hygiène Publique,@MSPS_Togo


In [20]:
# Take the first ten rows of the frame, in their existing order
gov_top_10 = gov_df.iloc[:10]
gov_top_10

Unnamed: 0,Gov. Official,Handle
0,Eswatini Government,@EswatiniGovern1
1,Malawi Government,@MalawiGovt
2,Hage G. Geingob,@hagegeingob
3,Seychelles Ministry of Finance,@FinanceSC
4,PresidencyZA,@PresidencyZA
5,Ministry of Health Zambia,@mohzambia
6,President of Zimbabwe,@edmnangagwa
7,MinSantédj,@MinSantedj
8,Yemane G. Meskel,@hawelti
9,State House Kenya,@StateHouseKenya


In [21]:
# Write the selection to disk, leaving out the DataFrame index
gov_top_10.to_csv(path_or_buf="Karen_government_officials.csv", index=False)