In [21]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import sqlalchemy as db
import requests
import re
from sqlalchemy import text

In [22]:
username = "root"
password = "password"
server = "localhost"
database = "quakes"

engine = db.create_engine(f"mysql+pymysql://{username}:{password}@{server}/{database}")
metadata = db.MetaData()

<h2>Populate MagnitudeTypes Table</h2>

In [23]:
url = "https://www.usgs.gov/programs/earthquake-hazards/magnitude-types"
dfs = pd.read_html(requests.get(url).content)
dfs

[                                      Magnitude Type  Magnitude Range  \
 0          Mww (Moment W-phase)(generic notation Mw)  ~5.0 and larger   
 1                                     Mwc (centroid)  ~5.5 and larger   
 2                                    Mwb (body wave)     ~5.5 to ~7.0   
 3                                     Mwr (regional)     ~4.0 to ~6.5   
 4                    Ms20 or Ms (20sec surface wave)     ~5.0 to ~8.5   
 5                        mb (short-period body wave)     ~4.0 to ~6.5   
 6                          Mfa (felt-area magnitude)              any   
 7                               ML Ml, or ml (local)     ~2.0 to ~6.5   
 8   mb_Lg, mb_lg, or MLg (short-period surface wave)     ~3.5 to ~7.0   
 9                                Md or md (duration)    ~4 or smaller   
 10                     Mi or Mwp (integrated p-wave)     ~5.0 to ~8.0   
 11                                       Me (energy)  ~3.5 and larger   
 12                                   

In [24]:
mag_types = dfs[0].iloc[:, 0]
mag_types

0            Mww (Moment W-phase)(generic notation Mw)
1                                       Mwc (centroid)
2                                      Mwb (body wave)
3                                       Mwr (regional)
4                      Ms20 or Ms (20sec surface wave)
5                          mb (short-period body wave)
6                            Mfa (felt-area magnitude)
7                                 ML Ml, or ml (local)
8     mb_Lg, mb_lg, or MLg (short-period surface wave)
9                                  Md or md (duration)
10                       Mi or Mwp (integrated p-wave)
11                                         Me (energy)
12                                                  Mh
13                               Finite Fault Modeling
14                          Mint (intensity magnitude)
Name: Magnitude Type, dtype: object

In [25]:
#particular cases
mag_types[0] = 'Mww Mw'
mag_types[13] = 'FFM'
#remove 'or'
mag_types.replace('or', ' ', regex=True, inplace=True)
#remove content inside parentheses
mag_types.replace(r'\(.*\)|,', ' ', regex=True, inplace=True)

In [26]:
#populate MagnitudeTypes
with engine.connect() as con:
    for index, item in enumerate(mag_types):
        for mag_type in set(map(lambda x: x.lower(), filter(None, item.split(' ')))):
            con.execute(text(f"INSERT INTO MagnitudeTypes(id_type, name) VALUE ({index}, '{mag_type}');"))

In [27]:
#checking
with engine.connect() as con:
    print("id|id_type|name")
    for row in con.execute(text("SELECT * FROM MagnitudeTypes")):
        print(row)

id|id_type|name
(20, 0, 'mw')
(21, 0, 'mww')
(22, 1, 'mwc')
(23, 2, 'mwb')
(24, 3, 'mwr')
(25, 4, 'ms20')
(26, 4, 'ms')
(27, 5, 'mb')
(28, 6, 'mfa')
(29, 7, 'ml')
(30, 8, 'mlg')
(31, 8, 'mb_lg')
(32, 9, 'md')
(33, 10, 'mi')
(34, 10, 'mwp')
(35, 11, 'me')
(36, 12, 'mh')
(37, 13, 'ffm')
(38, 14, 'mint')


<h2>Populate Agencies table</h2>

In [28]:
url = "https://earthquake.usgs.gov/data/comcat/catalog/"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   Catalogs
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="/theme/site/earthquake/index.css" rel="stylesheet"/>
  <meta content="USGS Earthquake Hazards Program, responsible for monitoring, reporting, and researching earthquakes and earthquake hazards" name="description"/>
  <meta content="aftershock,earthquake,epicenter,fault,foreshock,geologist,geophysics,hazard,hypocenter,intensity,intensity scale,magnitude,magnitude scale,mercalli,plate,richter,seismic,seismicity,seismogram,seismograph,seismologist,seismology,subduction,tectonics,tsunami,quake,sismologico,sismologia" name="keywords"/>
  <script async="async" id="_fed_an_ua_tag" src="https://dap.digitalgov.gov/Universal-Federated-Analytics-Min.js?agency=DOI&amp;subagency=USGS">
  </script>
  <link href="https://fonts.googleapis.com/icon?family=Material+Icons|Merriweather:400,400italic,700|Source+Sans+Pro:400

In [29]:
divs = soup.find_all("div", {"class": "page-content"})
divs[0]

<div class="page-content">

In [30]:
url = 'https://www.emsc-csem.org/Earthquake/contributors.php'
dfs = pd.read_html(requests.get(url).content)
dfs

[    0                                                  1  \
 0 NaN  set_server_date(2022,3,28,14,3,4)  Current tim...   
 
                          2  
 0  Member access  Name Pwd  ,
                0              1
 0  Member access  Member access
 1            NaN            NaN
 2           Name            NaN
 3            NaN            NaN
 4            Pwd            NaN
 5            NaN            NaN
 6            NaN            NaN,
      Code                                          Institute  Country    City
 0      AE  Arizona Broadband Seismic Network, Arizona Geo...      USA     NaN
 1      AG  Arkansas Seismic Network, Arkansas Geological ...      USA     NaN
 2      AK  Alaska Regional Network, University of Alaska-...      USA     NaN
 3      AO  Arkansas Seismic Observatory, University of Ar...      USA     NaN
 4      AR  Northern Arizona Network, Arizona Earthquake I...      USA     NaN
 ..    ...                                                ...      ...     .

In [31]:
df = dfs[-1].loc[:, ['Code', 'Institute']]
df

Unnamed: 0,Code,Institute
0,AE,"Arizona Broadband Seismic Network, Arizona Geo..."
1,AG,"Arkansas Seismic Network, Arkansas Geological ..."
2,AK,"Alaska Regional Network, University of Alaska-..."
3,AO,"Arkansas Seismic Observatory, University of Ar..."
4,AR,"Northern Arizona Network, Arizona Earthquake I..."
...,...,...
156,WAR,Warsaw seismic network
157,WR,"California Division of Water Resources, Califo..."
158,WY,Yellowstone Volcano Observatory Seismic Networ...
159,ZAG,"Seismological Survey, University of Zagreb"


In [33]:
with engine.connect() as con:
    table = db.Table('Agencies', metadata, autoload_with=engine)
    
    for row in divs[0].find_all("li"):
        abb, details = map(str.strip, row.text.split('-', 1))
        con.execute(text(f"INSERT INTO Agencies(abbreviation, details) VALUE ('{abb}', '{details}')"))
    
    for index, row in df.iterrows():
        abb, details = map(str.strip, [row['Code'], row['Institute']])
        
        insert_stmt = db.dialects.mysql.insert(table).values(abbreviation=abb, details=details)
        con.execute(insert_stmt.on_duplicate_key_update(abbreviation=insert_stmt.inserted.abbreviation))
    
    #edge cases
    edge_cases = ['usauto', 'tx', 'ew', 'cgs', 'aacse', 'ew_dm']
    for edge_case in edge_cases:
        con.execute(text(f"INSERT INTO Agencies(abbreviation) VALUE ('{edge_case}')"))
    

In [34]:
#checking
with engine.connect() as con:
    print("id|abb|full_name")
    for row in con.execute(text("SELECT * FROM  Agencies")):
        print(row)

id|abb|full_name
(199, 'AK', 'Alaska Earthquake Center')
(201, 'ATLAS', 'ShakeMap Atlas')
(202, 'AV', 'Alaska Volcano Observatory')
(203, 'CHOY', 'Energy Magnitude and Broadband Depth')
(204, 'CI', 'California Integrated Seismic Network: Southern California Seismic Network (Caltech/USGS Pasadena and Partners) and Southern California Earthquake Data Center')
(205, 'DUPUTEL', 'Duputel et al. W phase catalog')
(206, 'EQH', 'EQH - Coffman, von Hake and Stover, Earthquake History of the United States')
(207, 'GCMT', 'Lamont-Doherty Earth Observatory Global CMT project, New York, USA')
(208, 'HV', 'Hawaii Volcano Observatory')
(209, 'ISCGEM', 'ISC-GEM Main Catalog')
(210, 'ISCGEMSUP', 'ISC-GEM Supplementary Catalog')
(211, 'ISMPKANSAS', 'USGS Induced Seismicity Project (Kansas)')
(212, 'LD', 'Lamont-Doherty Cooperative Seismographic Network')
(213, 'MB', 'Montana Bureau of Mines and Geology')
(214, 'NC', 'California Integrated Seismic Network: Northern California Seismic System (UC Berkeley,

<h2>Populate Status table</h2>

In [35]:
url = "https://earthquake.usgs.gov/data/comcat"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   ANSS Comprehensive Earthquake Catalog (ComCat) Documentation
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="/theme/site/earthquake/index.css" rel="stylesheet"/>
  <meta content="USGS Earthquake Hazards Program, responsible for monitoring, reporting, and researching earthquakes and earthquake hazards" name="description"/>
  <meta content="aftershock,earthquake,epicenter,fault,foreshock,geologist,geophysics,hazard,hypocenter,intensity,intensity scale,magnitude,magnitude scale,mercalli,plate,richter,seismic,seismicity,seismogram,seismograph,seismologist,seismology,subduction,tectonics,tsunami,quake,sismologico,sismologia" name="keywords"/>
  <script async="async" id="_fed_an_ua_tag" src="https://dap.digitalgov.gov/Universal-Federated-Analytics-Min.js?agency=DOI&amp;subagency=USGS">
  </script>
  <link href="https://fonts.googleapis.com/icon?family=Material+Icon

In [36]:
el = soup.find("dt", {"id": "status"})
info = el.next_sibling.next_sibling.find_all("dd")[1]
info

<dd>
        “automatic”,
        “reviewed”,
        “deleted”
      </dd>

In [43]:
with engine.connect() as con:
    for s in map(str.strip, info.text.split(",")):
        con.execute(text(f"INSERT INTO Status(name) VALUE ('{s}')"))

In [44]:
#checking
with engine.connect() as con:
    print("id|name")
    for row in con.execute(text("SELECT * FROM Status")):
        print(row)

id|name
(11, 'automatic')
(13, 'deleted')
(12, 'reviewed')


<h2>Populate Alerts table</h2>

In [45]:
url = "https://earthquake.usgs.gov/data/comcat"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
el = soup.find("dt", {"id": "alert"})
info = el.next_sibling.next_sibling.find_all("dd")[1]
info

<dd>
        “green”, “yellow”, “orange”,
        “red”.
      </dd>

In [49]:
with engine.connect() as con:
    for s in map(str.strip, info.text.replace('.', '').replace('“', '').replace('”', '').split(",")):
        con.execute(text(f"INSERT INTO Alerts(name) VALUE ('{s}')"))

IntegrityError: (pymysql.err.IntegrityError) (1062, "Duplicate entry 'green' for key 'alerts.name_UNIQUE'")
[SQL: INSERT INTO Alerts(name) VALUE ('green')]
(Background on this error at: https://sqlalche.me/e/14/gkpj)

In [50]:
#checking
with engine.connect() as con:
    print("id|name")
    for row in con.execute(text("SELECT * FROM Alerts")):
        print(row)

id|name
(9, 'green')
(11, 'orange')
(12, 'red')
(10, 'yellow')
