In [1]:
import pandas as pd
import json
import requests
import glob
import os
from sqlalchemy import create_engine
from bs4 import BeautifulSoup

# Hackathon #2 - Data Wrangling (Instructor solution)

### Data in Website

Looking at the contents of the webpage can give us helpful information on what tables we need to parse

![website](https://i.imgur.com/MMD2MXZ.png)

We have a `<table>` named `dataframe` in each page, in a total of 500 pages. We're going to need all of them. 

In [12]:
content = []
failed = []
base_url = 'https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/'
for i in range(1, 500):
    if i % 50 == 0:
        print(i)
        print(f'{base_url}{i}.html')
    response = requests.get(f'{base_url}{i}.html')
    if response.ok:
        content.append(response.content)
    else:
        failed.append(i)
        

50
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/50.html
100
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/100.html
150
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/150.html
200
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/200.html
250
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/250.html
300
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/300.html
350
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/350.html
400
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/400.html
450
https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2-batch6/450.html


In [13]:
len(failed)

0

##### Main Data

In [22]:
def tableDataText(table):
    """
    Function to convert an html table into an array 
    """
    rows = []
    trs = table.find_all('tr')
    headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')] # header row
    if headerow: # if there is a header row include first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs: # for every table row
        rows.append([td.get_text(strip=True) for td in tr.find_all('td')]) # data row
    return rows

In [60]:
def get_website_data(content):
    '''
    Get the data from all the pages in the website
    
    param content: a list of requests.get(...).content
    param columns: a list of the columns in each table of the website
    '''

    pages = []
    for i in range(len(content)):
        if i%100==0:
            print(i)

        soup = BeautifulSoup(content[i], 'html.parser')
        main_data_div = soup.find('table' , {'class': 'dataframe'})
        main_data_list_table = tableDataText(main_data_div)
        main_data_df = pd.DataFrame(main_data_list_table[1:], columns=main_data_list_table[0][1:],)

        pages.append(main_data_df)
    
    df = pd.concat(pages, axis=0, ignore_index=True)
    
    return df
    

In [65]:
df_website = get_website_data(content)

0
100
200
300
400


In [66]:
df_website = df_website.set_index("Timestamp")
df_website

Unnamed: 0_level_0,EMA200,MA200,RSI200,%K30
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1325803800,,,,98.039216
1325806500,,,,98.692810
1325806740,,,,100.000000
1325806860,,,,100.000000
1325806980,,,,100.000000
...,...,...,...,...
1351989660,10.625978,10.59670,50.050111,100.000000
1351991700,10.624227,10.59605,48.503938,0.000000
1351994460,10.624384,10.59565,49.791687,82.608696
1351994760,10.624440,10.59520,49.725912,78.260870


In [68]:
df_website.to_csv('website_data.csv')

In [69]:
! head -n 1 website_data.csv

Timestamp,EMA200,MA200,RSI200,%K30
