## Web Scraping Example

In [48]:
# package use
import requests
import urllib.request
import time
import re
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import copy
from collections import Counter

In [2]:
# Address of the MTA turnstile data page that this example scrapes
url = 'http://web.mta.info/developers/turnstile.html'
# Download the page with the requests library; `response.text` is parsed in a later cell
response = requests.get(url)
# Leave the response as the last expression so the notebook shows its status
response

<Response [200]>

In [3]:
# Parse the downloaded HTML with BeautifulSoup (built-in "html.parser" backend)
# so that the page can be navigated as a nested data structure
soup = BeautifulSoup(response.text, "html.parser")
# Uncomment the next line to display the whole parsed document
# soup

In [4]:
# We use the method .find_all to locate all of our <a> tags and show records 1 through 9
# (find_all is the current name of the method; findAll is only kept as a legacy alias)
soup.find_all('a')[1:10]

[<a href="http://www.mta.info"><img alt="Go to MTA homepage" src="/template/images/mta_info.gif"/></a>,
 <a href="/accessibility">Accessibility</a>,
 <a href="http://assistive.usablenet.com/tt/http://www.mta.info">Text-only</a>,
 <a href="/selfserve">Customer Self-Service</a>,
 <a href="/mta/employment/">Employment</a>,
 <a href="/faqs.htm">FAQs/Contact Us</a>,
 <a href="http://www.mta.info" style="padding-left:18px;">Home</a>,
 <a href="http://www.mta.info">MTA Home</a>,
 <a href="http://www.mta.info/nyct">NYC Subways and Buses</a>]

### Explanation for the tag records above:
- Explanation: This code returns every `<a>` tag (i.e. every link) found on the page.
- The links that we are interested in start at index 36 of that list. Not all of the links are relevant to what we want, but most of them are, so we can simply slice from index 36 onward.

In [5]:
# let’s extract the actual link that we want. Let’s test out the first link
# Notice that all the .txt files are inside the <a> tag following the line above
# (find_all is the current method name; it matches the calls used later in this notebook)
one_a_tag = soup.find_all('a')[36]
print(one_a_tag)
# extract the (relative) address of the txt file from the tag's href attribute
link = one_a_tag['href']

<a href="data/nyct/turnstile/turnstile_190323.txt">Saturday, March 23, 2019</a>


### Explanation for operations above:
- This code saves ‘data/nyct/turnstile/turnstile_190323.txt’ to our variable link. The full url to download the data is actually ‘http://web.mta.info/developers/data/nyct/turnstile/turnstile_190323.txt’, which I discovered by clicking on the first data file on the website as a test.
- We can use our urllib.request library to download this file to our computer. We provide urllib.request.urlretrieve with two parameters: the file url and the local filename. For my files, I named them “turnstile_180922.txt”, “turnstile_180901.txt”, etc.

In [6]:
# create full download url string by prefixing the relative link with the site's base address
download_url = 'http://web.mta.info/developers/'+ link
# save the file in the current directory; the slice starts one character after the
# '/' that precedes 'turnstile_', so only the bare file name is kept
urllib.request.urlretrieve(download_url,'./'+link[link.find('/turnstile_')+1:]) 

('./turnstile_190323.txt', <http.client.HTTPMessage at 0x16e82b723c8>)

In [7]:
# Last but not least, pause for one second after the download
# so that we are not spamming the website with requests.
# This helps us avoid getting flagged as a spammer.
time.sleep(1)

## Read Table

- useful tutorial websites: 

https://www.dataquest.io/blog/web-scraping-tutorial-python/

https://stackoverflow.com/questions/46015006/how-to-scrape-the-first-n-paragraphs-from-a-url

https://cfss.uchicago.edu/webdata005_scraping.html

In [2]:
# read the director table (the file is ISO-8859-1 encoded, not UTF-8)
director_table = pd.read_csv('director table.csv', encoding='ISO-8859-1')
# add an empty bio_url column, to be filled in by the scraping step
director_table['bio_url'] = np.nan
# show the first three rows
director_table.head(3)

Unnamed: 0,photoimage,actorname,actorimdb,dateofbirth,placeofbirth,minibio,trivia,race,gender,Domestic,bio_url
0,,A.R. Murugadoss,http://www.imdb.com/name/nm1436693/,,,,,,,0,
1,,Aanand Rai,http://www.imdb.com/name/nm2399862/,,,,,,,0,
2,,Aaron Schneider,http://www.imdb.com/name/nm0773689/,,,,,,,1,


In [16]:
# fill in
# director_table['Domestic'][0] = np.nan
# director_table.head(3)

In [17]:
# mean_ = np.mean(director_table[director_table['Domestic'] != np.nan])['Domestic']
# mean_

In [18]:
# director_table['Domestic'] = director_table['Domestic'].fillna(mean_)
# director_table.head()

### gender-guesser package
This package uses the underlying data from the program “gender” by Jörg Michael (see the package page linked below for details). 
Its use is pretty straightforward.

https://pypi.org/project/gender-guesser/

## Some hints on how to extract this information
- dateofbirth: extract from the profile
- placeofbirth: extract from the profile
- minibio: extract from the "view more bio" link
- trivia: extract from the "view more bio" link
- race: depends
- gender: deduce from the mini_bio by detecting "him" or "her"; otherwise use the gender-guesser package

In [27]:
# detect whether people have "view more bio" or not and record
def detect_bio_link(url, timeout=30):
    """Look for the "view more bio" link on a profile page.

    Parameters
    ----------
    url : str
        Address of the profile page to download.
    timeout : float, optional
        Seconds to wait for the server before requests gives up. Without a
        timeout a single stalled connection would block the scraping loop
        indefinitely.

    Returns
    -------
    (bool, str)
        ``(True, full_bio_url)`` when exactly one link whose href contains
        ``"bio_sm"`` is found; otherwise ``(False, " ")``. Pages with no such
        link, or with more than one, are both reported as not found.
    """
    response = requests.get(url, timeout=timeout)
    # call BeautifulSoup data structure to work
    soup = BeautifulSoup(response.text, "html.parser")

    # collect the href of every link that points at the mini bio page
    bio_links = [link['href']
                 for link in soup.find_all('a', href=True)
                 if "bio_sm" in link['href']]

    if len(bio_links) == 1:
        # the href is site-relative, so prefix it with the host
        return True, "https://www.imdb.com" + bio_links[0]
    return False, " "

In [None]:
# decide number of directors to scrape
length = len(director_table)
# work on a copy so that the original table is left untouched
director_table_bioAdd = director_table.copy()
# bio_url starts out as an all-NaN float column; make it an object column
# so that url strings can be stored in it without a dtype conflict
director_table_bioAdd['bio_url'] = director_table_bioAdd['bio_url'].astype(object)

# update and fill the new table
for i in range(length):
    judge, mini_bio_url_add = detect_bio_link(director_table['actorimdb'][i])
    if judge:
        # .at writes straight into the frame; the chained form ['bio_url'][i]
        # may assign to a temporary copy and silently lose the value
        director_table_bioAdd.at[i, 'bio_url'] = mini_bio_url_add
    # keep track of process
    if i%200 == 0:
        print(i)
# write csv to store first
director_table_bioAdd.to_csv('bio_url.csv')

In [1]:
# sanity check: show the first three rows of the table after the bio_url column was filled
director_table_bioAdd.head(3)

NameError: name 'director_table_bioAdd' is not defined

## Operation on director with bio urls first

In [253]:
# split the directors by whether a bio url was found for them
has_bio_url = director_table_bioAdd['bio_url'].notnull()

# directors with a bio url
bio_url_fill = director_table_bioAdd[has_bio_url].reset_index(drop=True)

# directors without a bio url: select them directly with the inverted mask
# (DataFrame.append no longer exists in pandas 2.x, and the former
# append + drop_duplicates(keep=False) trick would also discard rows
# that are genuinely duplicated in the table)
bio_url_notfill = director_table_bioAdd[~has_bio_url].reset_index(drop=True)
bio_url_fill.head(3)

Unnamed: 0,photoimage,actorname,actorimdb,dateofbirth,placeofbirth,minibio,trivia,race,gender,Domestic,bio_url
0,,A.R. Murugadoss,http://www.imdb.com/name/nm1436693/,,,,,,,0,https://www.imdb.com/name/nm1436693/bio?ref_=n...
1,,Aanand Rai,http://www.imdb.com/name/nm2399862/,,,,,,,0,https://www.imdb.com/name/nm2399862/bio?ref_=n...
2,,Aaron Schneider,http://www.imdb.com/name/nm0773689/,,,,,,,1,https://www.imdb.com/name/nm0773689/bio?ref_=n...


## Extract dateofbirth and placeofbirth information

In [254]:
# report how many directors have a bio url
print("length of bio url filled data set: ", len(bio_url_fill))

length of bio url filled data set:  1362


In [264]:
# extract birth date & place at least
# and minibio and trivia information
def extract_info(url, timeout=30):
    """Scrape birth details, mini bio and trivia from a bio page.

    Parameters
    ----------
    url : str
        Address of the bio page to download.
    timeout : float, optional
        Seconds to wait for the server before requests gives up, so that a
        stalled connection cannot block the scraping loop forever.

    Returns
    -------
    (str, str, str, str)
        ``(dateofbirth, placeofbirth, minibio, trivia)``. ``dateofbirth`` is
        the month/day text and the year text joined by a space. Every piece
        that is not found on the page is a single space ``" "``. Several
        mini bio / trivia records are joined with ``"/"``.
    """
    response_mini = requests.get(url, timeout=timeout)
    # still form the text
    soup_mini = BeautifulSoup(response_mini.text, "html.parser")

    # extract birth date and place first
    birth_monthday = " "
    birth_year = " "
    placeofbirth = " "
    for link in soup_mini.find_all('a', href=True):
        href = link['href']
        # get_text() always yields a plain str; .string is None when the
        # link contains nested markup, which would break the concatenation below
        if "birth_monthday" in href:
            birth_monthday = link.get_text()
        if "birth_year" in href:
            birth_year = link.get_text()
        if "birth_place" in href:
            placeofbirth = link.get_text()
    # form date of birth string
    dateofbirth = birth_monthday + " " + birth_year

    # extract all relevant information -- topics and content
    # soda odd and soda even for content, li_group for title
    table = soup_mini.find_all(True, {"class": {"soda odd", "soda even", "li_group"}})
    # extract all the text first
    table_text = [tag.get_text(strip=True) for tag in table]

    # extract mini_bio and trivia now
    mini_trivia_para = [" ", " "]
    search_title = ["Mini Bio", "Trivia"]
    for j, title in enumerate(search_title):
        for i, tag in enumerate(table):
            # only title elements (li_group) carry the record count; a content
            # paragraph that merely mentions the title must not be matched
            if "li_group" not in (tag.get("class") or []):
                continue
            if title not in table_text[i]:
                continue
            # extract the number of records to append after
            numOfRecord = re.sub(r"\D", "", table_text[i])
            if not numOfRecord:
                # title without a count: nothing reliable to slice, keep looking
                continue
            # fill in the information
            mini_trivia_para[j] = '/'.join(table_text[(i+1):(i+1+int(numOfRecord))])
            break

    return dateofbirth, placeofbirth, mini_trivia_para[0], mini_trivia_para[1]

In [None]:
# fill the information now
length = len(bio_url_fill)
# the target columns appear empty in the loaded table; store them as object
# columns so that the scraped text can be written without a dtype conflict
info_columns = ['dateofbirth', 'placeofbirth', 'minibio', 'trivia']
bio_url_fill[info_columns] = bio_url_fill[info_columns].astype(object)
for count in range(length):
    url = bio_url_fill.at[count, 'bio_url']
    birthday_fill, place_fill, mini_fill, trivia_fill = extract_info(url)
    # .at writes straight into the frame; the chained form df[col][row] = value
    # may assign to a temporary copy and silently lose the value
    bio_url_fill.at[count, 'dateofbirth'] = birthday_fill
    bio_url_fill.at[count, 'placeofbirth'] = place_fill
    bio_url_fill.at[count, 'minibio'] = mini_fill
    bio_url_fill.at[count, 'trivia'] = trivia_fill
    # keep track of progress
    if count % 100 == 0:
        print(count)
# write to csv
bio_url_fill.to_csv("bio_url_fill.csv")

## Operation on director without bio urls now

- Since they have no bio urls, minibio and trivia information would be absent
- Try to extract their birthplace and birthday information if possible

In [258]:
# report how many directors have no bio url
print("length of bio url not filled data set: ", len(bio_url_notfill))

length of bio url not filled data set:  75


In [259]:
# preview the first three directors without a bio url
bio_url_notfill.head(3)

Unnamed: 0,photoimage,actorname,actorimdb,dateofbirth,placeofbirth,minibio,trivia,race,gender,Domestic,bio_url
0,,Aaron Seltzer,http://www.imdb.com/name/nm0783536/,,,,,,,1,
1,,Abhishek Varman,http://www.imdb.com/name/nm2831530/,,,,,,,0,
2,,Adam Chapman,http://www.imdb.com/name/nm7920865/,,,,,,,1,


In [274]:
# extract birth information for notfill data set--function
def extract_birth_notfill(url, timeout=30):
    """Scrape the date and place of birth from a profile page.

    Parameters
    ----------
    url : str
        Address of the profile page to download.
    timeout : float, optional
        Seconds to wait for the server before requests gives up, so that a
        stalled connection cannot block the scraping loop forever.

    Returns
    -------
    (str, str)
        ``(dateofbirth, placeofbirth)``. ``dateofbirth`` is the month/day
        text and the year text joined by a space; every piece that is not
        found on the page is a single space ``" "``.
    """
    response_mini = requests.get(url, timeout=timeout)
    # still form the text
    soup_mini = BeautifulSoup(response_mini.text, "html.parser")

    birth_monthday = " "
    birth_year = " "
    placeofbirth = " "
    for link in soup_mini.find_all('a', href=True):
        href = link['href']
        # get_text() always yields a plain str; .string is None when the
        # link contains nested markup, which would break the concatenation below
        if "birth_monthday" in href:
            birth_monthday = link.get_text()
        if "birth_year" in href:
            birth_year = link.get_text()
        if "birth_place" in href:
            placeofbirth = link.get_text()
    # form date of birth string
    dateofbirth = birth_monthday + " " + birth_year

    return dateofbirth, placeofbirth

In [None]:
# extract birth information for notfill data set
length_notfill = len(bio_url_notfill)
# store the birth columns as object columns so that the scraped text
# can be written without a dtype conflict
birth_columns = ['dateofbirth', 'placeofbirth']
bio_url_notfill[birth_columns] = bio_url_notfill[birth_columns].astype(object)
for i in range(length_notfill):
    date, place = extract_birth_notfill(bio_url_notfill.at[i, 'actorimdb'])
    # .at writes straight into the frame; the chained form df[col][row] = value
    # may assign to a temporary copy and silently lose the value
    bio_url_notfill.at[i, 'dateofbirth'] = date
    bio_url_notfill.at[i, 'placeofbirth'] = place

In [325]:
# combine the total information
# (pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x)
bio_url_total = pd.concat([bio_url_fill, bio_url_notfill], ignore_index=True)
# clean messy string "Born Today"; regex=False makes it a plain literal replacement
bio_url_total['dateofbirth'] = bio_url_total['dateofbirth'].str.replace("Born Today", "", regex=False)
# write to csv without the index column
bio_url_total.to_csv("bio_url_total.csv", index=False)

## Deduce the gender information from previous columns

## Deduce the race information from previous columns

## Image Collection and bio/trivia text analysis
- Collect their images to code their demographic background, possibly from IMDB, by collecting data on observations that have a value of 1 in column 'Domestic' (1170 out of 1437). 
- Work out how to extract the necessary information from the bio / trivia text.

## Clean the data frame eventually