In [2]:
import pandas as pd
import json
import requests
import glob
import os
from sqlalchemy import create_engine
from bs4 import BeautifulSoup

# Hackathon #2 - Data Wrangling (Instructor solution)

## Get the Data

### Data in API

In [3]:
base_url = 'https://xauengvo49.execute-api.eu-west-1.amazonaws.com/dev/'
response = requests.get(base_url + 'missingdata/'+ '1')
print(response.text)

{"PopUpWindow": 0, "SubmitInfoToEmail": 0, "IframeOrFrame": 0, "MissingTitle": 0, "ImagesOnlyInForm": 1, "SubdomainLevelRT": 1, "UrlLengthRT": 0, "PctExtResourceUrlsRT": 0, "AbnormalExtFormActionR": 1, "ExtMetaScriptLinkRT": 0, "PctExtNullSelfRedirectHyperlinksRT": 0}



In [7]:
import aiohttp
import asyncio
import time
import pandas as pd

start_time = time.time()


async def get_website(session, url, id):
    async with session.get(url) as resp:
        response = {}
        if resp.status == 200:
            response = await resp.json()
            response['id'] = id
            return response
        return None

async def main():
    responses = {}
    async with aiohttp.ClientSession() as session:

        tasks = []
        for number in range(1, 10001):
            url = f'https://xauengvo49.execute-api.eu-west-1.amazonaws.com/dev/missingdata/{number}'
            tasks.append(asyncio.ensure_future(get_website(session, url, number)))

        websites = await asyncio.gather(*tasks)
        for website in websites:
            if website:
                responses[website['id']] = website
    
    df = pd.DataFrame.from_dict(responses, orient='index')
    df.to_csv('raw_api_data.csv', index=False)

#asyncio.run(main())
await main()
print("--- %s seconds ---" % (time.time() - start_time))



--- 9.930180072784424 seconds ---


In [8]:
! head raw_api_data.csv

PopUpWindow,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,id
0,0,0,0,1,1,0,0,1,0,0,1
0,0,0,0,0,1,-1,0,1,0,0,2
0,0,0,0,0,1,0,0,1,0,0,3
0,1,0,0,0,1,-1,0,1,0,0,4
0,0,1,0,0,1,1,0,0,0,0,5
0,0,1,1,0,1,1,0,1,0,0,6
0,0,0,0,0,1,0,0,1,0,0,7
0,0,0,0,0,1,1,0,1,0,0,8
0,0,0,0,0,-1,-1,0,1,0,0,9


In [None]:
%%time
content = pd.DataFrame()
failed = []
i = 0
while i < 10001:
    url = f'{base_url}/missingdata/{i}'
    # print progress
    if i % 100 == 0:
        print(i)
    
    # if request fails, just ignore that datapoint and move on
    try:
        response = requests.get(url)
        if response.ok:
            data = json.loads(response.text)
            data['id'] = i
            content = content.append(data, ignore_index=True)
        else:
            failed.append(i)
    except:
        pass
            
    i+=1
content.to_csv('raw_api_data.csv')

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300


In [None]:
print(len(failed))

In [9]:
content = pd.read_csv('raw_api_data.csv', index_col=0)

In [10]:
content

Unnamed: 0_level_0,SubmitInfoToEmail,IframeOrFrame,MissingTitle,ImagesOnlyInForm,SubdomainLevelRT,UrlLengthRT,PctExtResourceUrlsRT,AbnormalExtFormActionR,ExtMetaScriptLinkRT,PctExtNullSelfRedirectHyperlinksRT,id
PopUpWindow,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,0,0,1,1,0,0,1,0,0,1
0,0,0,0,0,1,-1,0,1,0,0,2
0,0,0,0,0,1,0,0,1,0,0,3
0,1,0,0,0,1,-1,0,1,0,0,4
0,0,1,0,0,1,1,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...
0,0,0,0,0,1,1,0,1,0,0,9996
0,0,1,0,0,1,0,0,1,0,0,9997
0,0,0,0,0,1,0,0,1,0,0,9998
0,0,1,0,0,1,1,0,1,0,0,9999


In [11]:
content.columns= content.columns.str.lower()
content = content[pd.to_numeric(content['id'], errors='coerce').notnull()].set_index('id')
content.index = content.index.astype(int)
content

Unnamed: 0_level_0,submitinfotoemail,iframeorframe,missingtitle,imagesonlyinform,subdomainlevelrt,urllengthrt,pctextresourceurlsrt,abnormalextformactionr,extmetascriptlinkrt,pctextnullselfredirecthyperlinksrt
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0,0,1,1,0,0,1,0,0
2,0,0,0,0,1,-1,0,1,0,0
3,0,0,0,0,1,0,0,1,0,0
4,1,0,0,0,1,-1,0,1,0,0
5,0,1,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
9996,0,0,0,0,1,1,0,1,0,0
9997,0,1,0,0,1,0,0,1,0,0
9998,0,0,0,0,1,0,0,1,0,0
9999,0,1,0,0,1,1,0,1,0,0


In [12]:
content.to_csv("api_data.csv")