## EDSA Traffic Analysis and Visualization: Extraction
This script extracts the traffic data used for the analysis. The `target_runs` variable must be defined, since it determines the timeframe over which data is extracted. Because the MMDA website updates its live data every 15 minutes, the script must run at the same interval.

The following libraries were imported:
* For scheduling
    1. schedule - used to track the runtime of the script and to prompt extraction at 15 minute intervals
    2. time - used to take note of the time stamp.
    3. datetime - used to format time.
    4. os, os.path - used to count the number of extracted files, which determines whether the target number of runs has been reached.
    5. exit (from sys) - used to terminate the script once the target number of runs has been reached.

Technically, the interval in which the script extracts data from the MMDA website is flexible. Hence, the same script can be used for other scraping tasks. 

* For visualization
    1. numpy - in particular, numpy arrays were used for its flexibility.
    2. matplotlib - this is a powerful and reliable visualization tool. 

* For Data Extraction
    1. BeautifulSoup - used to process html data. 
    2. various selenium submodules - these were used to extract DYNAMIC web content.

In [None]:
# Run configuration: adjust these two values before starting the scraper.
interval = 15  # minutes between two consecutive extractions
target_runs = 672  # total number of extractions: 7 days x 24 hours x 4 runs per hour

#Libraries For Scheduling
import schedule
import time
import datetime
import os, os.path 
from sys import exit

#Libraries for visualization
%matplotlib notebook
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.animation as animation

#Libraries For Data Extraction
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import WebDriverException   

### Data Extraction
The flow is as follows:
1. The webdriver from selenium loads the page in question, so the url must be specified first. Note that to use webdriver, the appropriate driver must be installed in accordance with the browser being used. For example, Mozilla Firefox uses geckodriver, Google Chrome uses chromedriver, etc.
2. Once the dynamic web content has been extracted, the data can now be processed using Beautiful Soup. The appropriate data will be extracted according to the webpage's html structure. 
3. Finally, after some pre-processing, the data is ready to be written to a .csv file and used for visualization. 

In [None]:
def data_extraction():
    """Scrape the EDSA line view once and save it to a timestamped file.

    The page is loaded in a Firefox WebDriver session, which waits up to
    10 seconds for an element with class ``line-col`` to become visible.
    The rendered HTML is then parsed with BeautifulSoup, and one row per
    road segment is written to ``rawdata_<timestamp>.csv`` in the current
    working directory.

    Returns:
        list: ``[list_of_names, list_of_southbound, list_of_northbound,
        timestamp]``. Volumes are encoded as 0 (LIGHT), 1 (MODERATE) and
        2 (HEAVY); a status starting with "NO" is stored as 0 as a
        placeholder. Any other status token is kept as the original string.
        ``timestamp`` is a ``'%Y-%m-%d_%H_%M_%S'`` formatted string.

    Raises:
        WebDriverException: propagated from selenium when the browser
            cannot start, the page cannot be loaded, or the wait fails.
    """
    # specify the url
    url = 'http://mmdatraffic.interaksyon.com/line-view-edsa.php'

    # Start the WebDriver and load the page
    wd = webdriver.Firefox()
    try:
        wd.get(url)

        # Wait for the dynamically loaded elements to show up
        WebDriverWait(wd, 10).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "line-col")))

        # And grab the page HTML source
        html_page = wd.page_source
    finally:
        # Always close the browser, even when loading or waiting fails;
        # otherwise every failed run leaves a Firefox process behind.
        wd.quit()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html_page, 'html.parser')

    # set time stamp.
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H_%M_%S')

    # extract name of lines: the first word of the <p> inside each
    # 'line-name' div.
    list_of_names = []
    for name_div in soup.find_all('div', {'class': 'line-name'}):
        paragraph = name_div.findChild("p")
        words = paragraph.get_text(separator=' ').split(' ')
        list_of_names.append(words[0])
    # The first 'line-name' entry is discarded. Slicing (instead of pop)
    # also tolerates a page with no entries at all.
    list_of_names = list_of_names[1:]

    # extract southbound/northbound volume
    volume_codes = {"LIGHT": 0, "MODERATE": 1, "HEAVY": 2, "NO": 0}  # "NO": place holder first
    list_of_southbound = []
    list_of_northbound = []
    status_divs = soup.find_all('div', {'class': 'line-status'})
    for position, status_div in enumerate(status_divs, start=1):
        tokens = status_div.text.split()
        # The second token carries the status; unknown statuses are kept
        # unchanged so they remain visible in the raw data.
        level = volume_codes.get(tokens[1], tokens[1])

        # Status entries alternate: odd positions are southbound,
        # even positions are northbound.
        if position % 2 == 0:
            list_of_northbound.append(level)
        else:
            list_of_southbound.append(level)

    # save data into one variable (rows stop at the shortest of the lists)
    rows = [
        '{:>12}  {:>12}  {:>12}'.format(name, str(south), str(north)) + "\n"
        for name, south, north in zip(list_of_names, list_of_southbound, list_of_northbound)
    ]
    stringdata = "TIMESTAMP: " + str(timestamp) + "\n" + "\n" + "".join(rows)

    # Save raw data into csv file; the context manager closes the file
    # even if the write fails.
    filename = "rawdata_" + str(timestamp) + ".csv"
    with open(filename, "a") as raw_file:
        raw_file.write(stringdata)

    # return list containing data lists
    list_of_lists = [list_of_names, list_of_southbound, list_of_northbound, timestamp]
    return list_of_lists

### Visualization

In [None]:
# Visualization using Matplotlib
def visualize_overall(list_of_names, list_of_southbound, list_of_northbound,timestamp):
    """Plot southbound and northbound volume per road and save it as a PNG.

    Args:
        list_of_names: road names; only the length is used, to number the
            roads along the x axis.
        list_of_southbound: southbound volume value for each road.
        list_of_northbound: northbound volume value for each road.
        timestamp: string used in the plot title and in the output filename.

    The figure is written to ``EDSA TRAFFIC <timestamp>.png`` in the
    current working directory.
    """
    roadnum = range(len(list_of_names))

    # Draw on a fresh figure for every call. Relying on pyplot's implicit
    # current figure would overlay each run on top of all previous runs.
    fig = plt.figure()
    try:
        plt.scatter(roadnum, list_of_southbound, label="SOUTHBOUND", color="g", marker="s")
        plt.scatter(roadnum, list_of_northbound, label="NORTHBOUND", color="m", marker="|")

        plt.xlabel("Road")
        plt.ylabel("Volume")

        plt.title(label="EDSA TRAFFIC: "+timestamp)
        plt.legend(bbox_to_anchor=(0, 1), loc='lower center', ncol=1)
        plt.savefig("EDSA TRAFFIC "+timestamp+".png")  # finally, save the figure
    finally:
        # Release the figure so repeated scheduled runs do not accumulate
        # open figures in memory.
        plt.close(fig)

### Driver

In [None]:
def job():
    """Run one scheduled cycle: extract, visualize, and check the run count.

    A WebDriverException raised during extraction is reported and the cycle
    is skipped, so the scheduler keeps running. The number of completed runs
    is taken from the count of ``.png`` files in the current working
    directory. Once that count reaches ``target_runs`` the script terminates
    through ``exit(0)``.
    """
    try:  # In case of power/network interruption, keep running
        # extract data at current time stamp
        list_of_lists = data_extraction()
    except WebDriverException as error:
        # Report the failure instead of hiding it, then wait for the next
        # scheduled run.
        print("Extraction failed, skipping this run: " + str(error))
        return

    # visualize overall data (names, southbound, northbound, timestamp).
    visualize_overall(list_of_lists[0], list_of_lists[1], list_of_lists[2], list_of_lists[3])

    # track number of runs: one PNG is saved per successful run.
    num_of_runs = len([name for name in os.listdir('.') if name.endswith('.png')])
    print("Run number " + str(num_of_runs))

    # Stop once the target has been reached. ">=" also covers a directory
    # that already holds more PNG files than the target.
    if num_of_runs >= target_runs:
        exit(0)  # exit program!

# Run job() every `interval` minutes, using the constant defined at the top
# of the file rather than a hard-coded period.
schedule.every(interval).minutes.do(job)

# Poll the scheduler once per second; job() ends the script via exit(0).
while True:
    schedule.run_pending()
    time.sleep(1)