# More Scraping

## Introduction
Scraping twitter handles of Kenyan governors and other generally popular people from [Soko Directory](https://sokodirectory.com/2019/03/here-are-the-official-twitter-handles-for-all-the-47-governors-in-kenya), [Travel Start](http://www.travelstart.co.ke/blog/the-50-best-kenyans-to-follow-on-twitter/) and  [Royal Trendia](https://royaltrendia.com/top-twitter-profiles-kenya/) sites.

## Table of Contents
1. [Libraries](#Libraries)
2. [Governors](#Governors)
3. [Travel Start](#Travel-Start)
4. [Royal Trendia](#Royal-Trendia)
5. [Mega DataFrame](#Mega-DF)


### Libraries

In [1]:
from requests  import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys

import fire

import sys
import os
import json
import matplotlib.pyplot as plt
import re
import string

import matplotlib.dates as mdates
import seaborn as sns

# to view all columns
pd.set_option("display.max.columns", None)

import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API
from tweepy import Cursor
from datetime import datetime, date, time, timedelta
from collections import Counter
import sys


import preprocessor as p

In [2]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  #.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res
    
def get_tag_elements(url, tag='h2'):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    response = simple_get(url)

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for li in html.select(tag):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url)) 
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

### Governors

In [18]:
#governors
url= 'https://sokodirectory.com/2019/03/here-are-the-official-twitter-handles-for-all-the-47-governors-in-kenya/'
url = simple_get(url)
gs = get_elements(url, tag = 'ol', search  = {})
gs

['Ali Hassan Joho (Mombasa County) – @HassanAliJohoSalim Mvurya (Kwale County) – @GovernorMvuryaAmason Kingi (Kilifi County) – @governorkingiDhadho Godhana (Tana River County) – @EGodhanaFahim Twaha (Lamu County) – @fahimyasintwahaGranton Samboja (Taita Taveta County) – @SambojagrantonBunow Korane (Garissa County) – @HonAliKoraneMohamed Abdi Mohamud (Wajir County) – @GovernorWajirAli Ibrahim Roba (Mandera County) – @aliirobaMohamud Ali (Marsabit County) – @GovMohamudAliMohammed Kuti (Isiolo County) – @GovernorkutiKiraitu Murungi (Meru County) – @GovernorKiraituOnesimus Njuki (Tharaka Nithi County) – @MuthomiNjukiMartin Wambora (Embu County) – @WamboraCharity Ngilu (Kitui County) – @mamangiluAlfred Mutua (Machakos County) – @DrAlfredMutuaKivutha Kibwana (Makueni County) – @governorkibwanaFrancis Kimemia (Nyandarua County) – @GovFKimemiaMutahi Kahiga (Nyeri County) – @GovernorKahigaAnne Waiguru (Kirinyaga County) – @AnneWaiguruMwangi wa Iria (Murang’a County) – @MwangiWaIriaFerdinand Wai

In [24]:
governors = []
for element in gs:
    for string in element.split(' '):
        if string.startswith('@'):
            governors.append(string)
        
governors

['@HassanAliJohoSalim',
 '@GovernorMvuryaAmason',
 '@governorkingiDhadho',
 '@EGodhanaFahim',
 '@fahimyasintwahaGranton',
 '@SambojagrantonBunow',
 '@HonAliKoraneMohamed',
 '@GovernorWajirAli',
 '@aliirobaMohamud',
 '@GovMohamudAliMohammed',
 '@GovernorkutiKiraitu',
 '@GovernorKiraituOnesimus',
 '@MuthomiNjukiMartin',
 '@WamboraCharity',
 '@mamangiluAlfred',
 '@DrAlfredMutuaKivutha',
 '@governorkibwanaFrancis',
 '@GovFKimemiaMutahi',
 '@GovernorKahigaAnne',
 '@AnneWaiguruMwangi',
 '@MwangiWaIriaFerdinand',
 '@GovernorBabayaoJosphat',
 '@GovernorNanokJohn',
 '@JohnlonyangapuoMoses',
 '@HELenolkulalPatrick',
 '@GovP_KhaembaJackson',
 '@GvnMandagoAlex',
 '@Governor_TolgosStephen',
 '@Araap_SangStanley',
 '@GovernorKiptisNdiritu',
 '@NdirituMuriithiLee',
 '@GovLeeKinyanjuiSamuel',
 '@SamuelTunaiJoseph',
 '@joelenkuPaul',
 '@GovChepkwonyJoyce',
 '@LabosoJoyceWycliffe',
 '@GovWOparanyaWilber',
 '@GovernorVihigaWycliffe',
 '@GovWWangamatiSospeter',
 '@GovOjaamongCornel',
 '@Rasanga_CornelAnya

In [25]:
governors_df = pd.DataFrame({'handles':governors})
governors_df.to_csv('governors.csv')
governors_df.head()

Unnamed: 0,handles
0,@HassanAliJohoSalim
1,@GovernorMvuryaAmason
2,@governorkingiDhadho
3,@EGodhanaFahim
4,@fahimyasintwahaGranton


### Travel Start

In [28]:
#random kenyans
url= 'http://www.travelstart.co.ke/blog/the-50-best-kenyans-to-follow-on-twitter/'
url = simple_get(url)
ks = get_elements(url, tag = 'ol', search  = {})
ks

['NTV Kenya @ntvkenya',
 'Uhuru Kenyatta @UKenyatta',
 'Julius Kanyi @kanyicool',
 'Jeff Koinange MBS @KoinangeJeff',
 'Lempuris @majani_',
 'Bob Collymore @bobcollymore',
 'Dr Willy M Mutunga @WMutunga',
 'Larry Madowo @LarryMadowo',
 'Aly-Khan Satchu @alykhansatchu',
 'Erik Hersman @whiteafrican',
 'John-Allan Namu @johnallannamu',
 'Yvonne Okwara @@YvonneOkwara',
 'Robert Alai @RobertAlai',
 'Dennis Itumbi @OleItumbi',
 'Dan Ndambuki aka Churchill @MwalimChurchill',
 'Carol Radull @CarolRadull',
 'Gina Din @gina_din',
 'Peter Nduati @PeterNduati',
 'Bankelele @bankelele',
 'Chris Kirubi @CKirubi',
 'Oliver Mathenge @OliverMathenge',
 'Mark Kaigwa @MKaigwa',
 'Charles Onyango-Obbo @cobbo3',
 'Patricia Kihoro @Misskihoro',
 'Julie Gichuru @juliegichuru',
 'Caroline Mutoko @CarolineMutoko',
 'EKODYDDA @ekodydda01',
 'Juliani @JulianiKenya',
 'Maina Kageni @ItsMainaKageni',
 'RAMpunzZyl @RamzZy_',
 'The Trend Setter @xtiandela',
 'NyaKundiH @C_NyaKundiH',
 'CN @crazynairobian',
 'Masaku

In [29]:
kenyans = []
for element in ks:
    for string in element.split(' '):
        if string.startswith('@'):
            kenyans.append(string)
        
kenyans

['@ntvkenya',
 '@UKenyatta',
 '@kanyicool',
 '@KoinangeJeff',
 '@majani_',
 '@bobcollymore',
 '@WMutunga',
 '@LarryMadowo',
 '@alykhansatchu',
 '@whiteafrican',
 '@johnallannamu',
 '@@YvonneOkwara',
 '@RobertAlai',
 '@OleItumbi',
 '@MwalimChurchill',
 '@CarolRadull',
 '@gina_din',
 '@PeterNduati',
 '@bankelele',
 '@CKirubi',
 '@OliverMathenge',
 '@MKaigwa',
 '@cobbo3',
 '@Misskihoro',
 '@juliegichuru',
 '@CarolineMutoko',
 '@ekodydda01',
 '@JulianiKenya',
 '@ItsMainaKageni',
 '@RamzZy_',
 '@xtiandela',
 '@C_NyaKundiH',
 '@crazynairobian',
 '@masaku_',
 '@Brianmbunde',
 '@ComedyCentralKE',
 '@SokoAnalyst',
 '@JoeWMuchiri',
 '@AKenyanGirl',
 '@Evabulence',
 '@PatNanyaro',
 '@bonifacemwangi',
 '@Ngendo87',
 '@nanciemwai',
 '@truthslinger',
 '@itsbuddhablaze',
 '@sunnysunwords',
 '@lwalubengo',
 '@AnneKiguta',
 '@DavidBurudi',
 '@blogs_kenya?',
 '@vinnieo',
 '@mslushmakeup',
 '@PeterQuotlant']

In [30]:
k_df = pd.DataFrame({'handles':kenyans})
k_df.to_csv('k_df.csv')
k_df.head()

Unnamed: 0,handles
0,@ntvkenya
1,@UKenyatta
2,@kanyicool
3,@KoinangeJeff
4,@majani_


### Royal Trendia

In [32]:
#more Kenyans
url= 'https://royaltrendia.com/top-twitter-profiles-kenya/'
url = simple_get(url)
k = get_elements(url, tag = 'a', search  = {})
k

['Home',
 'About Us',
 'Services',
 'Blog',
 'Contact Us',
 'Home',
 'About Us',
 'Services',
 'Blog',
 'Contact Us',
 'Get a quote',
 'Social Media',
 'Kenyans on Twitter',
 'Top 100 Most Followed Users on Twitter in Kenya',
 'Top Twitter users in Kenya',
 'Twitter Kenya',
 'Who has the most followers on twitter in Kenya?',
 'Twitter',
 'Uhuru Kenyatta (@UKenyatta)',
 'NTV Kenya (@ntvkenya)',
 'Citizen TV Kenya (@citizentvkenya)',
 'William Samoei Ruto (@WilliamsRuto)',
 'ktn (@KTNKenya)',
 'Raila Odinga (@RailaOdinga)',
 'Daily Nation (@dailynation)',
 'Larry Madowo, first of his name (@LarryMadowo)',
 'Cyprian, Is Nyakundi (@C_NyaKundiH)',
 'Jeff Koinange, MBS (@KoinangeJeff)',
 'Capital FM Kenya (@CapitalFMKenya)',
 'Thee Trend Setter ™ (@xtiandela)',
 'Bob Collymore (@bobcollymore)',
 'Safaricom Limited (@SafaricomLtd)',
 'Julie Gichuru (@JulieGichuru)',
 'Robert ALAI (Nyakwar Nyokwongi) (@RobertAlai)',
 'Mutahi Ngunyi (@MutahiNgunyi)',
 'Boniface Mwangi (@bonifacemwangi)',
 'K24 

In [36]:
k_1 = []
for element in k:
    for string in element.split(' '):
        if '@' in string:
            k_1.append(string)
        
k_1

['(@UKenyatta)',
 '(@ntvkenya)',
 '(@citizentvkenya)',
 '(@WilliamsRuto)',
 '(@KTNKenya)',
 '(@RailaOdinga)',
 '(@dailynation)',
 '(@LarryMadowo)',
 '(@C_NyaKundiH)',
 '(@KoinangeJeff)',
 '(@CapitalFMKenya)',
 '(@xtiandela)',
 '(@bobcollymore)',
 '(@SafaricomLtd)',
 '(@JulieGichuru)',
 '(@RobertAlai)',
 '(@MutahiNgunyi)',
 '(@bonifacemwangi)',
 '(@K24Tv)',
 '(@Ma3Route)',
 '(@KenyaRedCross)',
 '(@StandardKenya)',
 '(@VictorMochere)',
 '(@CKirubi)',
 '(@WMutunga)',
 '(@KideroEvans)',
 '(@Safaricom_Care)',
 '(@MarthaKarua)',
 '(@UNEP)',
 '(@johnallannamu)',
 '(@TheStarKenya)',
 '(@KenyaPower_Care)',
 '(@OleItumbi)',
 '(@KenyaAirways)',
 '(@Peter_Kenneth)',
 '(@StateHouseKenya)',
 '(@ItsMainaKageni)',
 '(@BD_Africa)',
 '(@sautisol)',
 '(@AMB_A_Mohammed)',
 '(@AnneKiguta)',
 '(@alykhansatchu)',
 '(@CarolRadull)',
 '(@Donsarigo)',
 '(@LinusKaikai)',
 '(@TerryanneChebet)',
 '(@lKeepltReal)',
 '(@WillisRaburu)',
 '(@lindahoguttu)',
 '(@IEBCKenya)',
 '(@MarkMasai)',
 '(@jamessmat)',
 '(@Sakaja

In [37]:
def trim_word(word, from_start=0, from_end=0):
    return word[from_start:len(word) - from_end]
fellow_kenyans = []
for element in k_1:
    element = trim_word(element, 1,1)
    fellow_kenyans.append(element)
    
fellow_kenyans

['@UKenyatta',
 '@ntvkenya',
 '@citizentvkenya',
 '@WilliamsRuto',
 '@KTNKenya',
 '@RailaOdinga',
 '@dailynation',
 '@LarryMadowo',
 '@C_NyaKundiH',
 '@KoinangeJeff',
 '@CapitalFMKenya',
 '@xtiandela',
 '@bobcollymore',
 '@SafaricomLtd',
 '@JulieGichuru',
 '@RobertAlai',
 '@MutahiNgunyi',
 '@bonifacemwangi',
 '@K24Tv',
 '@Ma3Route',
 '@KenyaRedCross',
 '@StandardKenya',
 '@VictorMochere',
 '@CKirubi',
 '@WMutunga',
 '@KideroEvans',
 '@Safaricom_Care',
 '@MarthaKarua',
 '@UNEP',
 '@johnallannamu',
 '@TheStarKenya',
 '@KenyaPower_Care',
 '@OleItumbi',
 '@KenyaAirways',
 '@Peter_Kenneth',
 '@StateHouseKenya',
 '@ItsMainaKageni',
 '@BD_Africa',
 '@sautisol',
 '@AMB_A_Mohammed',
 '@AnneKiguta',
 '@alykhansatchu',
 '@CarolRadull',
 '@Donsarigo',
 '@LinusKaikai',
 '@TerryanneChebet',
 '@lKeepltReal',
 '@WillisRaburu',
 '@lindahoguttu',
 '@IEBCKenya',
 '@MarkMasai',
 '@jamessmat',
 '@SakajaJohnson',
 '@MikeSonko',
 '@KTNNews',
 '@SmritiVidyarthi',
 '@VictorWanyama',
 '@BookLoversgate',
 '@iHub

In [38]:
fellow_kenyans_df = pd.DataFrame({'handles':fellow_kenyans})
fellow_kenyans_df.to_csv('fellow_kenyans_df.csv')
fellow_kenyans_df.head()

Unnamed: 0,handles
0,@UKenyatta
1,@ntvkenya
2,@citizentvkenya
3,@WilliamsRuto
4,@KTNKenya


### Mega DF

In [44]:
mega_df = pd.concat([fellow_kenyans_df, k_df, governors_df ])
mega_df = mega_df.drop_duplicates(subset = ['handles'])
mega_df.to_csv('mega2.csv')
mega_df.head()

Unnamed: 0,handles
0,@UKenyatta
1,@ntvkenya
2,@citizentvkenya
3,@WilliamsRuto
4,@KTNKenya


In [43]:
mega_df.shape

(183, 1)

#### [Back to top](#More-Scraping)