In [139]:
import glob
import json
import os
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy
import pandas as pd
import seaborn as sns
import sqlalchemy as db

<h2>Read data from files</h2>

In [183]:
# List the quake export files in the working directory with pure Python rather
# than `!ls`, so the cell does not depend on a POSIX shell being available.
# Sorting keeps the load order deterministic.
all_files = sorted(glob.glob("quakes*"))
all_files

['quakes-00000-20000.json',
 'quakes-20000-40000.json',
 'quakes-40000-60000.json',
 'quakes-60000-80000.json',
 'quakes-80000-100000.json']

In [181]:
# Concatenate the `features` array of every export file into one flat list.
records = []
for file in all_files:
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding (which is not UTF-8 everywhere).
    with open(file, encoding="utf-8") as f:
        data = json.load(f)
    records.extend(data['features'])
len(records)

100000

In [182]:
# Peek at the top-level keys of the first loaded record.
records[0].keys()

dict_keys(['type', 'properties', 'geometry', 'id'])

In [184]:
# Peek at the keys of the first record's nested `properties` dict.
records[0]['properties'].keys()

dict_keys(['mag', 'place', 'time', 'updated', 'tz', 'url', 'detail', 'felt', 'cdi', 'mmi', 'alert', 'status', 'tsunami', 'sig', 'net', 'code', 'ids', 'sources', 'types', 'nst', 'dmin', 'rms', 'gap', 'magType', 'type', 'title'])

<h2>Set up MySQL server</h2>

In [165]:
# Connection settings are read from the environment so real credentials never
# have to be committed with the notebook. The fallbacks are the original
# local-development values and should not be used outside a local sandbox.
username = os.environ.get("QUAKES_DB_USER", "root")
password = os.environ.get("QUAKES_DB_PASSWORD", "password")
server = os.environ.get("QUAKES_DB_HOST", "localhost")
database = os.environ.get("QUAKES_DB_NAME", "quakes")

# URL-escape the credentials: characters such as "@", ":" or "/" in a user name
# or password would otherwise corrupt the connection URL.
engine = db.create_engine(
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}@{server}/{database}"
)

In [173]:
# Create the idMap table if it does not exist yet.
# The DDL is wrapped in `db.text(...)` because passing a bare string to
# `Connection.execute` is deprecated in SQLAlchemy 1.4 and rejected in 2.0.
create_id_map = db.text("""CREATE TABLE IF NOT EXISTS idMap (
                            Name VarChar(50) PRIMARY KEY,
                            AssignedIndex INT)""")

# `engine.begin()` commits on success and always releases the connection,
# even when the statement raises.
with engine.begin() as connection:
    connection.execute(create_id_map)

In [178]:
# List the tables in the connected database.
# `Engine.table_names()` is deprecated; the inspector is the supported API.
db.inspect(engine).get_table_names()

  engine.table_names()


['idMap']

ProgrammingError: 1046 (3D000): No database selected

In [65]:
def construct_ids_dict(data):
    """Map every local event id to a shared integer "global" id.

    Each item contributes its own ``item['id']`` plus every non-empty entry of
    the comma-separated ``item['properties']['ids']`` string. All local ids
    that appear together in one item receive the same global id. Global ids are
    handed out sequentially, starting at 0, in order of first appearance.

    Args:
        data: iterable of dicts, each with an ``'id'`` key and a
            ``'properties'`` dict. A missing, ``None`` or empty ``'ids'``
            entry is treated as "no additional ids".

    Returns:
        dict mapping each local id to its global id (int).

    Raises:
        ValueError: if one item ties together local ids that were already
            assigned to different global ids.
    """
    local_id_to_global_id = {}
    next_global_id = 0

    for item in data:
        # local_ids = the item's own id + every non-empty entry of `ids`
        raw_ids = item['properties'].get('ids') or ''
        local_ids = {local_id for local_id in raw_ids.split(',') if local_id}
        local_ids.add(item['id'])

        # Global ids already given to any of this item's local ids.
        assigned = {
            local_id_to_global_id[local_id]
            for local_id in local_ids
            if local_id in local_id_to_global_id
        }

        if len(assigned) > 1:
            # Two previously separate groups would have to be merged.
            raise ValueError(
                f"Dataset is inconsistent: record {item['id']!r} links "
                f"global ids {sorted(assigned)}"
            )

        if assigned:
            global_id = assigned.pop()
        else:
            # None of the local ids has been seen before: open a new group.
            global_id = next_global_id
            next_global_id += 1

        for local_id in local_ids:
            local_id_to_global_id[local_id] = global_id

    return local_id_to_global_id

In [119]:
# Build the local-id -> global-id mapping for every loaded record.
ids = construct_ids_dict(records)

In [185]:
# Tabulate the mapping: one row per local id, alongside its global id.
id_pairs = list(ids.items())
df = pd.DataFrame(id_pairs, columns=["local", "global"])
df.head(20)

Unnamed: 0,local,global
0,hv72961597,0
1,hv72961587,1
2,hv72961577,2
3,us7000gx6m,3
4,us7000gx6g,4
5,usauto7000gx6g,4
6,us7000gx6b,5
7,us7000gx64,6
8,us7000gx62,7
9,us7000gx5v,8


In [187]:
# Number of distinct primary ids.
# NOTE(review): `primaries` is only assigned in a later cell of this notebook,
# so on a fresh top-to-bottom run this cell raises NameError. Move it below the
# cell that defines `primaries`.
len(primaries)

99999

In [131]:
# Distinct top-level ids across all records.
primaries = {record["id"] for record in records}

In [137]:
# Count the distinct local ids in the mapping table.
unique_local_ids = set(df["local"])
len(unique_local_ids)

141652

In [138]:
# Persist the id mapping through the SQLAlchemy engine created above.
# - `flavor` was removed from `DataFrame.to_sql`; passing it raises TypeError.
# - `mydbs` is not defined anywhere in the visible notebook, so the engine is
#   used as the connectable instead.
df.to_sql(con=engine, name='table_name_for_df', if_exists='replace')

Unnamed: 0,local,global
0,hv72961597,0
1,hv72961587,1
2,hv72961577,2
3,us7000gx6m,3
4,us7000gx6g,4
...,...,...
141647,ak20281929,99970
141648,ak018d718yqc,99971
141649,ak20281904,99971
141650,at00pgm50s,99971


In [191]:
# Index the mapping by local id. Guarded so that re-running the cell is a no-op
# instead of raising KeyError once 'local' is no longer a column.
if df.index.name != 'local':
    df = df.set_index('local')

In [136]:
# Count local ids that never appear as a record's own top-level id.
# Computed from the `ids` mapping (the source of the table's `local` column)
# so the result does not depend on whether `local` is currently a column or the
# index of `df`; `set(primaries)` keeps the lookup fast if `primaries` is a list.
len(set(ids) - set(primaries))

41653

In [193]:
# Convert `primaries` to a list.
# NOTE(review): this rebinds `primaries` to a different type; any later
# `x in primaries` check becomes a linear scan. Consider a separate name
# (e.g. `primaries_list`) so re-running earlier cells stays cheap.
primaries = list(primaries)

In [49]:
# NOTE(review): neither `process_json` nor `gj` is defined in the visible part
# of this notebook; this cell presumably relies on state from a deleted cell
# and would fail on a fresh kernel. TODO confirm or remove.
result = process_json(gj)

In [51]:
# Take the first element of `result` for inspection.
item = result[0]

In [52]:
# Top-level keys of the inspected item.
item.keys()

dict_keys(['type', 'properties', 'geometry', 'id'])

In [54]:
# Keys of the inspected item's nested `properties` dict.
item['properties'].keys()

dict_keys(['mag', 'place', 'time', 'updated', 'tz', 'url', 'detail', 'felt', 'cdi', 'mmi', 'alert', 'status', 'tsunami', 'sig', 'net', 'code', 'ids', 'sources', 'types', 'nst', 'dmin', 'rms', 'gap', 'magType', 'type', 'title'])

In [58]:
# Split the comma-separated `ids` string and drop the empty entries.
list(filter(None, item['properties']['ids'].split(',' )))

['hv72961597']

In [201]:
# Number of distinct `code` values across all records.
distinct_codes = {record['properties']['code'] for record in records}
len(distinct_codes)

99997