In [116]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import sqlalchemy as db
import requests
import re
from sqlalchemy import text
from sqlalchemy.orm import Session

<h4>Credentials:</h4>

In [117]:
protocol = "mysql+pymysql"
username = "root"
password = "password"
server = "localhost"
database = "quakes"

<h4>Connect to server</h4>

In [118]:
engine = db.create_engine(f"{protocol}://{username}:{password}@{server}/{database}")
metadata = db.MetaData()

<h4>Create table variables</h4>

In [119]:
engine.table_names()

  engine.table_names()


['Agencies',
 'Alerts',
 'Contributed',
 'Events',
 'IdMap',
 'MagnitudeTypes',
 'Status']

In [120]:
agencies_tbl = db.Table('Agencies', metadata, autoload_with=engine)
status_tbl = db.Table('Status', metadata, autoload_with=engine)
alerts_tbl = db.Table('Alerts', metadata, autoload_with=engine)
mag_types_tbl = db.Table('MagnitudeTypes', metadata, autoload_with=engine)

<h2>Populate MagnitudeTypes table</h2>

In [121]:
url = "https://www.usgs.gov/programs/earthquake-hazards/magnitude-types"
dfs = pd.read_html(requests.get(url).content)
dfs

[                                      Magnitude Type  Magnitude Range  \
 0          Mww (Moment W-phase)(generic notation Mw)  ~5.0 and larger   
 1                                     Mwc (centroid)  ~5.5 and larger   
 2                                    Mwb (body wave)     ~5.5 to ~7.0   
 3                                     Mwr (regional)     ~4.0 to ~6.5   
 4                    Ms20 or Ms (20sec surface wave)     ~5.0 to ~8.5   
 5                        mb (short-period body wave)     ~4.0 to ~6.5   
 6                          Mfa (felt-area magnitude)              any   
 7                               ML Ml, or ml (local)     ~2.0 to ~6.5   
 8   mb_Lg, mb_lg, or MLg (short-period surface wave)     ~3.5 to ~7.0   
 9                                Md or md (duration)    ~4 or smaller   
 10                     Mi or Mwp (integrated p-wave)     ~5.0 to ~8.0   
 11                                       Me (energy)  ~3.5 and larger   
 12                                   

In [122]:
mag_types = dfs[0].iloc[:, 0]
mag_types

0            Mww (Moment W-phase)(generic notation Mw)
1                                       Mwc (centroid)
2                                      Mwb (body wave)
3                                       Mwr (regional)
4                      Ms20 or Ms (20sec surface wave)
5                          mb (short-period body wave)
6                            Mfa (felt-area magnitude)
7                                 ML Ml, or ml (local)
8     mb_Lg, mb_lg, or MLg (short-period surface wave)
9                                  Md or md (duration)
10                       Mi or Mwp (integrated p-wave)
11                                         Me (energy)
12                                                  Mh
13                               Finite Fault Modeling
14                          Mint (intensity magnitude)
Name: Magnitude Type, dtype: object

In [123]:
#edge cases
mag_types[0] = 'Mww Mw'
mag_types[13] = 'FFM'
#remove 'or'
mag_types.replace('or', ' ', regex=True, inplace=True)
#remove content inside parentheses
mag_types.replace(r'\(.*\)|,', ' ', regex=True, inplace=True)
mag_types = [set(map(lambda x: x.lower(), filter(None, mag_types_coll.split(' ')))) for mag_types_coll in mag_types]

In [124]:
#populate MagnitudeTypes
with Session(engine) as session, session.begin():
    session.execute(mag_types_tbl.insert(),
        [{'id_type': index, 'name': mag_type} for index, mag_types_coll in enumerate(mag_types) 
                                              for mag_type in mag_types_coll])

In [125]:
#checking
with Session(engine) as session, session.begin():
    print("id|id_type|name")
    for row in session.execute(db.select(mag_types_tbl)):
        print(row)

id|id_type|name
(1, 0, 'mww')
(2, 0, 'mw')
(3, 1, 'mwc')
(4, 2, 'mwb')
(5, 3, 'mwr')
(6, 4, 'ms20')
(7, 4, 'ms')
(8, 5, 'mb')
(9, 6, 'mfa')
(10, 7, 'ml')
(11, 8, 'mb_lg')
(12, 8, 'mlg')
(13, 9, 'md')
(14, 10, 'mi')
(15, 10, 'mwp')
(16, 11, 'me')
(17, 12, 'mh')
(18, 13, 'ffm')
(19, 14, 'mint')


<h2>Populate Agencies table</h2>

<h4>First source</h4>

In [126]:
url = "https://earthquake.usgs.gov/data/comcat/catalog/"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
print(soup.prettify())

.................................
   <div class="page-content">
    <ul>
     <li>
      <a href="ak/">
       AK - Alaska Earthquake Center
      </a>
     </li>
     <li>
      <a href="at/">
      </a>
     </li>
     <li>
      <a href="atlas/">
       ATLAS - ShakeMap Atlas
      </a>
     </li>
     <li>
      <a href="av/">
       AV - Alaska Volcano Observatory
      </a>
     </li>
     <li>
      <a href="choy/">
       CHOY - Energy Magnitude and Broadband Depth
      </a>
     </li>
     <li>
      <a href="ci/">
.................................



In [127]:
divs = soup.find_all("div", {"class": "page-content"})
divs[0]

<div class="page-content">

<h4>Second source</h4>

In [128]:
url = 'https://www.emsc-csem.org/Earthquake/contributors.php'
dfs = pd.read_html(requests.get(url).content)
dfs

[    0                                                  1  \
 0 NaN  set_server_date(2022,4,1,0,11,50)  Current tim...   
 
                          2  
 0  Member access  Name Pwd  ,
                0              1
 0  Member access  Member access
 1            NaN            NaN
 2           Name            NaN
 3            NaN            NaN
 4            Pwd            NaN
 5            NaN            NaN
 6            NaN            NaN,
      Code                                          Institute  Country    City
 0      AE  Arizona Broadband Seismic Network, Arizona Geo...      USA     NaN
 1      AG  Arkansas Seismic Network, Arkansas Geological ...      USA     NaN
 2      AK  Alaska Regional Network, University of Alaska-...      USA     NaN
 3      AO  Arkansas Seismic Observatory, University of Ar...      USA     NaN
 4      AR  Northern Arizona Network, Arizona Earthquake I...      USA     NaN
 ..    ...                                                ...      ...     .

In [129]:
df = dfs[-1].loc[:, ['Code', 'Institute']]
df

Unnamed: 0,Code,Institute
0,AE,"Arizona Broadband Seismic Network, Arizona Geo..."
1,AG,"Arkansas Seismic Network, Arkansas Geological ..."
2,AK,"Alaska Regional Network, University of Alaska-..."
3,AO,"Arkansas Seismic Observatory, University of Ar..."
4,AR,"Northern Arizona Network, Arizona Earthquake I..."
...,...,...
156,WAR,Warsaw seismic network
157,WR,"California Division of Water Resources, Califo..."
158,WY,Yellowstone Volcano Observatory Seismic Networ...
159,ZAG,"Seismological Survey, University of Zagreb"


<h4>Populating</h4>

In [130]:
with Session(engine) as session, session.begin():
    
    for row in divs[0].find_all("li"):
        abb, details = map(str.strip, row.text.split('-', 1))
        session.execute(agencies_tbl.insert().values(abbreviation=abb, details=details))
    
    for index, row in df.iterrows():
        abb, details = map(str.strip, [row['Code'], row['Institute']])
        
        insert_stmt = db.dialects.mysql.insert(agencies_tbl).values(abbreviation=abb, details=details)
        session.execute(insert_stmt.on_duplicate_key_update(abbreviation=insert_stmt.inserted.abbreviation))
    
    #edge cases
    edge_cases = ['usauto', 'tx', 'ew', 'cgs', 'aacse', 'ew_dm']
    for edge_case in edge_cases:
        session.execute(text(f"INSERT INTO Agencies(abbreviation) VALUE ('{edge_case}')"))
    

<h4>Checking</h4>

In [131]:
with engine.connect() as con:
    print("id|abb|full_name")
    for row in con.execute(text("SELECT * FROM  Agencies")):
        print(row)

id|abb|full_name
(1, 'AK', 'Alaska Earthquake Center')
(3, 'ATLAS', 'ShakeMap Atlas')
(4, 'AV', 'Alaska Volcano Observatory')
(5, 'CHOY', 'Energy Magnitude and Broadband Depth')
(6, 'CI', 'California Integrated Seismic Network: Southern California Seismic Network (Caltech/USGS Pasadena and Partners) and Southern California Earthquake Data Center')
(7, 'DUPUTEL', 'Duputel et al. W phase catalog')
(8, 'EQH', 'EQH - Coffman, von Hake and Stover, Earthquake History of the United States')
(9, 'GCMT', 'Lamont-Doherty Earth Observatory Global CMT project, New York, USA')
(10, 'HV', 'Hawaii Volcano Observatory')
(11, 'ISCGEM', 'ISC-GEM Main Catalog')
(12, 'ISCGEMSUP', 'ISC-GEM Supplementary Catalog')
(13, 'ISMPKANSAS', 'USGS Induced Seismicity Project (Kansas)')
(14, 'LD', 'Lamont-Doherty Cooperative Seismographic Network')
(15, 'MB', 'Montana Bureau of Mines and Geology')
(16, 'NC', 'California Integrated Seismic Network: Northern California Seismic System (UC Berkeley, USGS Menlo Park, and P

<h2>Populate Status table</h2>

In [132]:
url = "https://earthquake.usgs.gov/data/comcat"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <title>
   ANSS Comprehensive Earthquake Catalog (ComCat) Documentation
  </title>
  <meta charset="utf-8"/>
.................................
     </dd>
     <dt id="status">
      status
     </dt>
     <dd>
      <dl>
       <dt>
        Data Type
       </dt>
       <dd class="datatype">
        String
       </dd>
       <dt>
        Typical Values
       </dt>
       <dd>
        “automatic”,
        “reviewed”,
        “deleted”
       </dd>
       <dt>
        Description
       </dt>
       <dd>
        Indicates whether the event has been reviewed by a human.
       </dd>
       <dt>
        Additional Information
       </dt>
       <dd>
        <p>
         Status is either automatic or reviewed. Automatic events
          are directly posted by automatic processing systems and have not
          been verified or altered by a human. Reviewed events have been
          looked at by a human. The level of review can range from a quick

In [133]:
el = soup.find("dt", {"id": "status"})
info = el.next_sibling.next_sibling.find_all("dd")[1]
info

<dd>
        “automatic”,
        “reviewed”,
        “deleted”
      </dd>

In [134]:
status_values = list(map(str.strip, info.text.replace('“', '').replace('”', '').split(",")))
status_values

['automatic', 'reviewed', 'deleted']

<h4>Populating</h4>

In [135]:
with Session(engine) as session, session.begin():
    
    session.execute(status_tbl.insert(), 
                    [{'name': name} for name in status_values])

<h4>Checking</h4>

In [136]:
with Session(engine) as session, session.begin():
    print("id|name")
    for row in session.execute(status_tbl.select()):
        print(row)

id|name
(1, 'automatic')
(3, 'deleted')
(2, 'reviewed')


<h2>Populate Alerts table</h2>

In [137]:
url = "https://earthquake.usgs.gov/data/comcat"
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
el = soup.find("dt", {"id": "alert"})
info = el.next_sibling.next_sibling.find_all("dd")[1]
info

<dd>
        “green”, “yellow”, “orange”,
        “red”.
      </dd>

In [138]:
alerts_values = list(map(str.strip, info.text.replace('.','').replace('“', '').replace('”', '').split(",")))
alerts_values

['green', 'yellow', 'orange', 'red']

<h4>Populate</h4>

In [139]:
with Session(engine) as session, session.begin():
    session.execute(alerts_tbl.insert(), [{'name': name} for name in alerts_values])

<h4>Check<h4>

In [140]:
with Session(engine) as session, session.begin():
    print("id|name")
    print(list(session.execute(alerts_tbl.select())))

id|name
[(1, 'green'), (3, 'orange'), (4, 'red'), (2, 'yellow')]
