In [1]:
import glob
import json
import os
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy import text

# Hackathon #2 - Data Wrangling (Instructor solution)

## Table of Contents
- [Get the Data](#Get-the-Data)
    - [Data in files](#Data-in-files)
    - [Data in Database](#Data-in-Database)
    - [Data in Website](#Data-in-Website)
- [Merge the Data](#Merge-the-Data)
- [Data in API](#Data-in-API)
- [Create Train Dataset](#Create-Train-Dataset)

## Get the Data

### -> Data in Database

First, we need to check which tables the database contains.

In [2]:
# Connection settings for the course PostgreSQL instance. Each value can be
# overridden through an environment variable, so the notebook can be pointed at
# another database (or given other credentials) without editing the code. The
# fallbacks are the values originally hardcoded here, so a fresh
# Restart & Run All behaves as before when no variable is set.
username = os.environ.get('LDSA_DB_USER', 'ldsa_student')
# NOTE(review): a password literal in a shared notebook is a leak risk - set
# LDSA_DB_PASSWORD and drop this fallback once the environment is configured.
password = os.environ.get('LDSA_DB_PASSWORD', 'R4Fr4P3aAgMYBqqP')
host_name = os.environ.get(
    'LDSA_DB_HOST',
    'batch4-s02-db-instance.ctq2kxc7kx1i.eu-west-1.rds.amazonaws.com',
)
# Environment values are strings; keep `port` an int as it was originally.
port = int(os.environ.get('LDSA_DB_PORT', 5432))
db_name = os.environ.get('LDSA_DB_NAME', 'batch4_s02_db')
schema = os.environ.get('LDSA_DB_SCHEMA', 'hackathon_students')

In [3]:
# Build the SQLAlchemy database URL. The username and password are
# percent-encoded so that credentials containing URL-reserved characters
# (e.g. '@', ':', '/') cannot corrupt the URL; plain alphanumeric values are
# left unchanged by the encoding.
conn_str = 'postgresql://{}:{}@{}:{}/{}'.format(
    quote_plus(username), quote_plus(password), host_name, port, db_name
)
# Extra connection options: set the PostgreSQL search_path so that unqualified
# table names resolve inside `schema`.
conn_args = {'options': '-csearch_path={}'.format(schema)}

In [4]:
# Engine handed to every pd.read_sql_query call in this notebook.
engine = create_engine(
    conn_str,
    connect_args=conn_args,
)

In [5]:
# List the tables that live in the configured schema. The schema name is bound
# as a query parameter from `schema` instead of being repeated as a SQL
# literal, so this lookup always inspects the schema named in the settings.
query = """
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname = :schema;
"""
# text() gives driver-independent ":name" bind parameters.
df_db_table_info = pd.read_sql_query(text(query), engine, params={'schema': schema})
display(df_db_table_info)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,hackathon_students,missingdata,postgres,,True,False,False,False


We have a single table, named "missingdata". We'll select everything from that table and store it in a DataFrame.

In [6]:
def show_dtypes_na(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise each column of ``df`` by dtype and number of missing values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` (sorted by column label) with two
        columns: ``dtypes`` (the column's dtype) and ``na`` (count of NA
        entries in that column).
    """
    column_types = df.dtypes.sort_index()
    missing_counts = df.isna().sum().sort_index()

    # `keys` names the two resulting columns directly.
    return pd.concat(
        [column_types, missing_counts],
        axis=1,
        keys=['dtypes', 'na'],
    )

In [7]:
# Pull the whole `missingdata` table, discard its stored `index` column and
# normalise every column name to lower case.
query_missingdata = """
SELECT *
FROM missingdata
"""
df = (
    pd.read_sql_query(query_missingdata, engine)
    .drop(columns=['index'])
    .rename(columns=str.lower)
)

df

Unnamed: 0,id,numdots,subdomainlevel,pathlevel,urllength,numdash,numdashinhostname,atsymbol,tildesymbol,numunderscore,...,extfavicon,insecureforms,relativeformaction,extformaction,abnormalformaction,pctnullselfredirecthyperlinks,frequentdomainnamemismatch,fakelinkinstatusbar,rightclickdisabled,class_label
0,7859,2,1,3,100,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
1,1843,1,0,5,47,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
2,dit is poep,1,0,4,54,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
3,2685,2,0,7,74,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,6565,1,0,2,87,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2661,6187,5,1,2,121,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2662,9826,4,1,2,54,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2663,4403,4,1,3,50,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2664,3270,2,1,3,72,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [8]:
show_dtypes_na(df)

Unnamed: 0,dtypes,na
abnormalformaction,int64,0
atsymbol,int64,0
class_label,int64,0
domaininpaths,int64,0
domaininsubdomains,int64,0
doubleslashinpath,int64,0
embeddedbrandname,int64,0
extfavicon,int64,0
extformaction,int64,0
fakelinkinstatusbar,int64,0


In [9]:
# Keep only the rows whose `id` parses as a number, then use `id` as the index
# and cast that index to int.
df = (
    df
    .loc[lambda frame: pd.to_numeric(frame['id'], errors='coerce').notna()]
    .set_index('id')
)
df.index = df.index.astype(int)
df

Unnamed: 0_level_0,numdots,subdomainlevel,pathlevel,urllength,numdash,numdashinhostname,atsymbol,tildesymbol,numunderscore,numpercent,...,extfavicon,insecureforms,relativeformaction,extformaction,abnormalformaction,pctnullselfredirecthyperlinks,frequentdomainnamemismatch,fakelinkinstatusbar,rightclickdisabled,class_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7859,2,1,3,100,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
1843,1,0,5,47,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1
2685,2,0,7,74,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
6565,1,0,2,87,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
6228,2,0,1,85,0,1,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6187,5,1,2,121,0,0,0,0,0,7,...,0,1,0,0,0,0,0,0,0,0
9826,4,1,2,54,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4403,4,1,3,50,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
3270,2,1,3,72,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1


In [10]:
# Write the frame, index included, to a CSV file in the working directory.
df.to_csv(path_or_buf='database_data.csv')