In [1]:
from QuakeAPI.DBQueries import *
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Lining Up Data 

We have two data sources: EMSC and USGS. Many earthquakes are reported by both places. We need a way to line up the quakes that are the same between the two sources. Unfortunately the time can be off by as much as several seconds, so they aren't exact matches. 


## First thought: K-neighbors

for sake of experimentation I'm going to pull out 2000 quakes from USGS and EMSC and then compare them. 

In [2]:
USGS_QUAKES = query_all('SELECT id, time, latitude, longitude, magnitude FROM USGS where magnitude >= 0;')
EMSC_QUAKES = query_all('SELECT id, time, latitude, longitude, magnitude FROM EMSC ORDER BY TIME DESC LIMIT 20000;')
len(USGS_QUAKES), len(EMSC_QUAKES)

(22993, 20000)

Data is in the structure:

'id': quake[0]

'place': quake[1]

'time': quake[2]

'lat': quake[3]

'lon': quake[4]

'mag': quake[5]

In [11]:
#there were NA values to get rid of 
df = pd.DataFrame(USGS_QUAKES)
df = df.dropna()
df = df.set_index(0)
edf = pd.DataFrame(EMSC_QUAKES).set_index(0)
edf.head()


Unnamed: 0_level_0,1,2,3,4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1586237049900,37.8,16.91,2.9
2,1586236321100,19.2,-155.48,2.2
3,1586236127700,33.51,-116.52,2.1
4,1586234922400,38.42,25.88,2.1
5,1586234581600,38.27,38.82,2.1


In [4]:
# instanciate the classifier
classifier = NearestNeighbors(n_neighbors=1)

In [5]:
# train the classifier 
classifier.fit(df)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                 radius=1.0)

In [12]:
# run the classifier on all the data 
distance, index = classifier.kneighbors(edf)

In [30]:
#check the best match
distance.min()

6.348473394446884

In [77]:
# well damn. There aren't any found. I should come back and try alternate algorithms
times = []
for i, dist in enumerate(distance):
    if dist < 20000:
        if np.absolute(USGS_QUAKES[index[i][0]][1]-EMSC_QUAKES[i][1]) < 1000:
            print(USGS_QUAKES[index[i][0]])
            print(EMSC_QUAKES[i])
            print()

(20376, 1585986334270, 33.515667, -116.5168304, 1.15)
(718, 1585986335100, 44.27, -115.24, 2.7)

(19709, 1585961973790, 38.7888336, -122.7546692, 1.04)
(795, 1585961973900, 33.99, 60.27, 3.7)

(19362, 1585863761615, -6.9704, 126.6103, 5.2)
(1105, 1585863761000, 18.9, -68.88, 3.5)

(16965, 1585180991090, 19.208334, -155.4116669, 2.13)
(2981, 1585180992000, -35.73, -73.6, 3.1)

(16620, 1585092913640, 35.8293333, -117.6666667, 1.01)
(3237, 1585092914600, -18.32, 120.13, 3.2)

(16611, 1585091143900, 37.6368332, -118.9448318, 1.43)
(3242, 1585091144300, 17.9, -66.8, 2.5)

(16338, 1585027963110, 36.0853348, -117.8408356, 1.38)
(3402, 1585027963000, 15.04, -92.6, 4.3)

(15630, 1584842895430, 60.4937, -143.0373, 1.5)
(3817, 1584842895600, 19.22, -155.42, 2.1)

(15317, 1584764465310, 32.6508333, -115.895, 1.1)
(4004, 1584764466000, 63.92, -22.0, 3.0)

(15315, 1584764336060, 19.2066669, -155.4106598, 1.94)
(4006, 1584764337000, 18.69, -102.75, 3.6)

(15260, 1584754775980, 39.0653, -119.7273, 2.0

In [91]:
# attempting to do some lining up by hand
df[df[4]>.4].sort_values(by=4).head(30)

Unnamed: 0,0,1,2,3,4
7306,7872,1581901023650,33.501333,-116.786333,0.41
18135,18900,1585731600690,33.230167,-116.734167,0.41
21476,22242,1586174055600,33.506667,-116.5035,0.41
20436,21202,1586067639600,33.506167,-116.510333,0.41
3361,3627,1582829274500,35.857333,-117.6755,0.41
1643,1749,1583267723150,37.661167,-118.895167,0.41
3856,4180,1582739615010,37.6985,-118.877,0.41
16384,17147,1585231111840,35.866667,-117.700833,0.41
3073,3323,1582881115290,38.837666,-122.804169,0.41
202,203,1583661066350,38.832001,-122.80217,0.41


In [79]:
EMSC_df = pd.DataFrame(EMSC_QUAKES)

In [84]:
EMSC_df.sort_values(by=4).head(30)

Unnamed: 0,0,1,2,3,4
946,947,1585911218000,45.61,15.21,0.4
883,884,1585928095000,45.75,14.05,0.4
839,840,1585940360000,46.09,13.68,0.4
433,434,1586076338500,45.87,15.98,0.5
231,232,1586142907400,45.9,15.98,0.5
780,781,1585966547000,45.95,14.1,0.6
836,837,1585941571000,45.7,14.42,0.6
1286,1287,1585805621100,45.89,15.96,0.6
59,60,1586213070700,47.53,9.27,0.7
599,600,1586019353900,46.64,9.6,0.7


## thought 2, use the history

well nothing so far has produced usable results. I think I'm going to try to line up quakes using the history funciton. 

In [92]:
import requests

In [101]:
requests.get('http://quake-ds-staging.herokuapp.com/history/USGS/0,0,3000').json()

{'message': [{'Oceanic': False,
   'id': 3691,
   'lat': -13.794,
   'lon': -14.6731,
   'mag': 5.3,
   'place': 'Southern Mid-Atlantic Ridge',
   'time': 1582816698668},
  {'Oceanic': False,
   'id': 3697,
   'lat': -13.0278,
   'lon': -15.1094,
   'mag': 4.7,
   'place': 'Southern Mid-Atlantic Ridge',
   'time': 1582816227635},
  {'Oceanic': False,
   'id': 3912,
   'lat': -12.2781,
   'lon': -14.7808,
   'mag': 5.1,
   'place': 'Southern Mid-Atlantic Ridge',
   'time': 1582776671106}],
 'status_code': 200}

In [102]:
requests.get('http://quake-ds-staging.herokuapp.com/history/EMSC/0,0,3000').json()

{'message': [{'id': 8944,
   'lat': -13.79,
   'lon': -14.6,
   'mag': 5.3,
   'place': 'SOUTHERN MID-ATLANTIC RIDGE',
   'time': 1582845498700},
  {'id': 8950,
   'lat': -13.03,
   'lon': -15.11,
   'mag': 4.7,
   'place': 'SOUTHERN MID-ATLANTIC RIDGE',
   'time': 1582845027600},
  {'id': 9057,
   'lat': -12.29,
   'lon': -14.8,
   'mag': 5.1,
   'place': 'SOUTHERN MID-ATLANTIC RIDGE',
   'time': 1582805470900},
  {'id': 19970,
   'lat': -11.75,
   'lon': -13.78,
   'mag': 5.9,
   'place': 'ASCENSION ISLAND REGION',
   'time': 1579271872600},
  {'id': 20219,
   'lat': 0.11,
   'lon': -17.1,
   'mag': 5.5,
   'place': 'NORTH OF ASCENSION ISLAND',
   'time': 1579197498600},
  {'id': 22852,
   'lat': 0.18,
   'lon': -16.93,
   'mag': 5.6,
   'place': 'NORTH OF ASCENSION ISLAND',
   'time': 1578302960400},
  {'id': 23063,
   'lat': -1.2,
   'lon': -12.86,
   'mag': 4.8,
   'place': 'NORTH OF ASCENSION ISLAND',
   'time': 1578160115700},
  {'id': 26206,
   'lat': 1.71,
   'lon': 8.18,
   '

The first three quakes around the area are clearly the same quakes, so there are some in the database, the question is just finding them. 
I'm going to try to algorithmically go through all of the EMSC quakes and search for proximity of time. 

In [113]:
for quake in EMSC_QUAKES:
    url = f'http://quake-ds-staging.herokuapp.com/history/USGS/{quake[3]},{quake[4]},500'
    responses = requests.get(url).json()['message']
    if len(responses) != 0:
        for response in responses:
            if np.absolute(response['time'] - quake[1]) < 2000:
                print('EMSC:', quake)
                print(response)

This proved to be wayyyyy to slow. probably because its calling the api 20000 times 

I'm going to go back and try alternate algorithms now.

In [26]:
# instanciate the classifier
classifier = NearestNeighbors(n_neighbors=1, algorithm='kd_tree')
# train the classifier 
classifier.fit(df)
# run the classifier on all the data 
distance, index = classifier.kneighbors(edf)
#check the best match
distance.min()

6.348473394446884

In [28]:
# instanciate the classifier
classifier = NearestNeighbors(n_neighbors=1, algorithm='auto')
# train the classifier 
classifier.fit(df)
# run the classifier on all the data 
distance, index = classifier.kneighbors(edf)
#check the best match
distance.min()

6.348473394446884

In [29]:
# instanciate the classifier
classifier = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
# train the classifier 
classifier.fit(df)
# run the classifier on all the data 
distance, index = classifier.kneighbors(edf)
#check the best match
distance.min()

6.348473394446884

In [31]:
# instanciate the classifier
classifier = NearestNeighbors(n_neighbors=1, algorithm='brute')
# train the classifier 
classifier.fit(df)
# run the classifier on all the data 
distance, index = classifier.kneighbors(edf)
#check the best match
distance.min()

0.0

# Brute force seems to be the best algorithm 

I'm going to line up the results and see what happens

In [44]:
# merge distance and index into one array, then a DF
dist_df = pd.DataFrame(np.concatenate((distance, index), axis=1), columns=['distance', 'USGS_index'])
dist_df = dist_df.set_index(pd.Index(np.arange(1,20001)))

In [50]:
# merge the predictions back with the quake data
df2 = edf.reset_index().merge(dist_df.reset_index(), left_on=0, right_on='index')

In [84]:
#seperate everything with 0 distance, theoretically they are the same quake
zero_dist = df2[df2['distance'] == 0]
zero_dist.shape, low_dist.shape

((1423, 8), (1423, 8))

In [85]:
# rename columns for ease of reading
zero_dist = zero_dist.rename(columns={0: 'E_id', 1:'E_time', 2:'E_lat', 3:'E_lon', 4:'E_mag', 'index':'id(also)', 'distance':'distance_from_USGS', 'USGS_index':'USGS_id'})
low_dist = low_dist.rename(columns={0: 'E_id', 1:'E_time', 2:'E_lat', 3:'E_lon', 4:'E_mag', 'index':'id(also)', 'distance':'distance_from_USGS', 'USGS_index':'USGS_id'})
df2 = df2.rename(columns={0: 'E_id', 1:'E_time', 2:'E_lat', 3:'E_lon', 4:'E_mag', 'index':'id(also)', 'distance':'distance_from_USGS', 'USGS_index':'USGS_id'})

In [94]:
#checking to see if any match up, it doesn't look like they do 
merged = zero_dist.merge(df.reset_index(), left_on='USGS_id', right_on=0, how='left')
merged['time_matches'] = merged['E_mag'] == merged[4]
merged[merged['time_matches']]

Unnamed: 0,E_id,E_time,E_lat,E_lon,E_mag,id(also),distance_from_USGS,USGS_id,0,1,2,3,4,time_matches
28,234,1586142773000,45.68,14.2,1.4,234,0.0,21208.0,21208.0,1586069000000.0,35.592835,-117.40567,1.4,True
125,703,1585990532900,33.5,-116.51,2.5,703,0.0,19669.0,19669.0,1585951000000.0,44.2235,-115.1142,2.5,True
722,7811,1583287860400,39.77,41.56,2.0,7811,0.0,1563.0,1563.0,1583314000000.0,39.647,-120.0371,2.0,True
735,7895,1583256674000,30.55,130.95,4.8,7895,0.0,1693.0,1693.0,1583281000000.0,24.4544,96.1562,4.8,True


Thats weird. I would have really expected that to work. I'm going to see what I can do with the kneighbors algorithm to get better results

In [76]:
#I have no idea how this works 
classifier.radius_neighbors(edf)

(array([array([], dtype=float64), array([0.]), array([], dtype=float64),
        ..., array([], dtype=float64), array([], dtype=float64),
        array([], dtype=float64)], dtype=object),
 array([array([], dtype=int64), array([21862]), array([], dtype=int64),
        ..., array([], dtype=int64), array([], dtype=int64),
        array([], dtype=int64)], dtype=object))