In [12]:
import pandas as pd
import json
import requests
import glob
import os
from sqlalchemy import create_engine
from bs4 import BeautifulSoup

# Hackathon #2 - Data Wrangling (Instructor solution)

### Data in Website

First, we need to download the webpage and parse its contents into a "soup" (a BeautifulSoup object).

In [13]:
# Download the hackathon page and parse it into a BeautifulSoup tree.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on a 4xx/5xx reply instead of silently
# parsing an error page (which would only surface later as a missing <div>).
response = requests.get(
    'https://s02-infrastructure.s3.eu-west-1.amazonaws.com/hackathon-2/website.html',
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

Looking at the contents of the webpage tells us which tables we need to parse.

![website](https://i.imgur.com/MMD2MXZ.png)

We have a `<div>` with class `main-data` and a `<div>` with class `missing-ids`, each with a table inside. We're going to need both.

In [14]:
def tableDataText(table):
    """
    Convert an HTML table into a list of rows, each row a list of cell strings.

    Parameters
    ----------
    table : object exposing ``find_all`` (e.g. a BeautifulSoup ``<table>`` tag)
        The table element whose ``<tr>`` rows are read.

    Returns
    -------
    list of list of str
        One inner list per ``<tr>``. If the first row contains ``<th>`` cells,
        their texts form the first inner list (the header); every remaining row
        contributes the texts of its ``<td>`` cells. All texts are
        whitespace-stripped. A table with no ``<tr>`` at all yields ``[]``.
    """
    trs = table.find_all('tr')
    if not trs:  # empty table: nothing to index into, so return early
        return []

    rows = []
    headerow = [th.get_text(strip=True) for th in trs[0].find_all('th')]  # header row
    if headerow:  # if there is a header row, include it first
        rows.append(headerow)
        trs = trs[1:]
    # every remaining <tr> is a data row; only its <td> cells are collected
    rows.extend([td.get_text(strip=True) for td in tr.find_all('td')] for tr in trs)
    return rows

##### Main Data

In [15]:
# Locate the <div class="main-data"> container, then take the first <table> inside it.
main_data_div = soup.find('div', class_='main-data')
main_data_table = main_data_div.find(name='table')

In [16]:
# Flatten the parsed <table> into a list of rows (header row first, if the table has one).
main_data_list_table = tableDataText(table=main_data_table)

In [17]:
# The first parsed row holds the column names; the rest are the records.
main_data_header = main_data_list_table[0]
main_data_records = main_data_list_table[1:]
main_data_df = pd.DataFrame(main_data_records, columns=main_data_header)

# Remove the rows whose id is the string '999999'.
rows_to_drop = main_data_df.index[main_data_df['id'] == '999999']
main_data_df = main_data_df.drop(index=rows_to_drop)

# Quick inspection: dimensions, per-column dtypes, then the frame itself.
print(main_data_df.shape, main_data_df.dtypes, sep='\n')
display(main_data_df)

(2665, 39)
id                               object
NumDots                          object
SubdomainLevel                   object
PathLevel                        object
UrlLength                        object
NumDash                          object
NumDashInHostname                object
AtSymbol                         object
TildeSymbol                      object
NumUnderscore                    object
NumPercent                       object
NumQueryComponents               object
NumAmpersand                     object
NumHash                          object
NumNumericChars                  object
NoHttps                          object
RandomString                     object
IpAddress                        object
DomainInSubdomains               object
DomainInPaths                    object
HttpsInHostname                  object
HostnameLength                   object
PathLength                       object
QueryLength                      object
DoubleSlashInPath            

Unnamed: 0,id,NumDots,SubdomainLevel,PathLevel,UrlLength,NumDash,NumDashInHostname,AtSymbol,TildeSymbol,NumUnderscore,...,ExtFavicon,InsecureForms,RelativeFormAction,ExtFormAction,AbnormalFormAction,PctNullSelfRedirectHyperlinks,FrequentDomainNameMismatch,FakeLinkInStatusBar,RightClickDisabled,CLASS_LABEL
0,,1,0,3,116,0,0,0,0,3,...,1,1,0,0,0,0,0,0,0,0
1,,4,1,2,44,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,1
2,,2,0,2,108,0,0,0,0,2,...,0,1,0,0,0,0,0,0,0,0
3,,3,1,2,40,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,,2,1,3,42,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2660,688,2,0,1,26,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1
2661,3639,2,1,3,44,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2662,8270,3,1,3,60,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2663,8611,1,0,0,25,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


##### Missing Ids

In [18]:
# Find the <div class="missing-ids"> container, take the first <table> inside it,
# and flatten that table into a list of rows.
missing_ids_div = soup.find('div', class_='missing-ids')
missing_ids_table = missing_ids_div.find(name='table')
missing_ids_list_table = tableDataText(table=missing_ids_table)

In [19]:
# First parsed row is the header; the remaining rows are the records.
missing_ids_header = missing_ids_list_table[0]
missing_ids_records = missing_ids_list_table[1:]
missing_ids_df = pd.DataFrame(data=missing_ids_records, columns=missing_ids_header)
missing_ids_df

Unnamed: 0,id
0,7368
1,3881
2,5393
3,1401
4,4925
...,...
994,8768
995,783
996,4709
997,6746


In [20]:
# Peek at the id column for index labels 0 through 998 (label slices include both ends).
main_data_df['id'].loc[0:998]

0      <NA>
1      <NA>
2      <NA>
3      <NA>
4      <NA>
       ... 
994    <NA>
995    <NA>
996    <NA>
997    <NA>
998    <NA>
Name: id, Length: 999, dtype: object

In [21]:
# Show the id column scraped from the missing-ids table.
missing_ids_df['id']

0      7368
1      3881
2      5393
3      1401
4      4925
       ... 
994    8768
995     783
996    4709
997    6746
998    4868
Name: id, Length: 999, dtype: object

In [22]:
##### Combine tables
# Fill the id column of the leading rows of main_data_df from missing_ids_df.
# The slice end is derived from the number of scraped ids instead of a
# hard-coded 998, so the cell keeps working if the missing-ids table changes
# size. .loc slices are label-based and inclusive, hence the "- 1"; the
# assigned Series is aligned on index labels.
last_missing_label = len(missing_ids_df) - 1
main_data_df.loc[0:last_missing_label, 'id'] = missing_ids_df['id']

# Convert every column to a numeric dtype, index the frame by id, and
# lower-case the column names.
website_df = main_data_df.apply(pd.to_numeric).set_index('id')
website_df.columns = website_df.columns.str.lower()

print(website_df.shape)
print(website_df.dtypes)
display(website_df)

(2665, 38)
numdots                          int64
subdomainlevel                   int64
pathlevel                        int64
urllength                        int64
numdash                          int64
numdashinhostname                int64
atsymbol                         int64
tildesymbol                      int64
numunderscore                    int64
numpercent                       int64
numquerycomponents               int64
numampersand                     int64
numhash                          int64
numnumericchars                  int64
nohttps                          int64
randomstring                     int64
ipaddress                        int64
domaininsubdomains               int64
domaininpaths                    int64
httpsinhostname                  int64
hostnamelength                   int64
pathlength                       int64
querylength                      int64
doubleslashinpath                int64
numsensitivewords                int64
embeddedbrandn

Unnamed: 0_level_0,numdots,subdomainlevel,pathlevel,urllength,numdash,numdashinhostname,atsymbol,tildesymbol,numunderscore,numpercent,...,extfavicon,insecureforms,relativeformaction,extformaction,abnormalformaction,pctnullselfredirecthyperlinks,frequentdomainnamemismatch,fakelinkinstatusbar,rightclickdisabled,class_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7368,1,0,3,116,0,0,0,0,3,0,...,1,1,0,0,0,0,0,0,0,0
3881,4,1,2,44,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,1
5393,2,0,2,108,0,0,0,0,2,0,...,0,1,0,0,0,0,0,0,0,0
1401,3,1,2,40,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4925,2,1,3,42,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,2,0,1,26,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1
3639,2,1,3,44,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8270,3,1,3,60,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8611,1,0,0,25,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [23]:
# Count the missing values in each column of the combined table.
website_df.isnull().sum()

numdots                          0
subdomainlevel                   0
pathlevel                        0
urllength                        0
numdash                          0
numdashinhostname                0
atsymbol                         0
tildesymbol                      0
numunderscore                    0
numpercent                       0
numquerycomponents               0
numampersand                     0
numhash                          0
numnumericchars                  0
nohttps                          0
randomstring                     0
ipaddress                        0
domaininsubdomains               0
domaininpaths                    0
httpsinhostname                  0
hostnamelength                   0
pathlength                       0
querylength                      0
doubleslashinpath                0
numsensitivewords                0
embeddedbrandname                0
pctexthyperlinks                 0
pctextresourceurls               0
extfavicon          

In [24]:
# Persist the combined table to a CSV file in the working directory.
website_df.to_csv(path_or_buf='website_data.csv')