In [2]:
from QuakeAPI.DBQueries import *
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Lining Up Data 

We have two data sources: EMSC and USGS. Many earthquakes are reported by both sources, so we need a way to line up the quakes that are the same between the two. Unfortunately the reported times can differ by as much as several seconds, so the records aren't exact matches.


## First thought: K-neighbors

For the sake of experimentation I'm going to pull out quakes from USGS and EMSC and then compare them. (The original plan was 2000 quakes from each; the queries below return every row.)

In [29]:
# USGS: every quake with a non-negative magnitude. EMSC: every quake, newest first.
usgs_sql = 'SELECT id, time, latitude, longitude, magnitude FROM USGS where magnitude >= 0;'
emsc_sql = 'SELECT id, time, latitude, longitude, magnitude FROM EMSC ORDER BY TIME DESC;'
USGS_QUAKES = query_all(usgs_sql)
EMSC_QUAKES = query_all(emsc_sql)
# Row counts as (USGS, EMSC)
(len(USGS_QUAKES), len(EMSC_QUAKES))

(23476, 101999)

Each row comes back as a tuple in the order of the `SELECT` columns (note that `place` is not selected by these queries):

'id': quake[0]

'time': quake[1]

'lat': quake[2]

'lon': quake[3]

'mag': quake[4]

In [20]:
# USGS rows contained NA values, so those rows are dropped; the id (column 0) then
# becomes the index, leaving time, lat, lon and mag as columns 1-4.
df = pd.DataFrame(USGS_QUAKES).dropna().set_index(0)
# The EMSC frame is kept as returned, with the id still present as column 0.
edf = pd.DataFrame(EMSC_QUAKES)
edf.head()


Unnamed: 0,0,1,2,3,4
0,1,1586237049900,37.8,16.91,2.9
1,2,1586236321100,19.2,-155.48,2.2
2,3,1586236127700,33.51,-116.52,2.1
3,4,1586234922400,38.42,25.88,2.1
4,5,1586234581600,38.27,38.82,2.1


In [5]:
# Instantiate the nearest-neighbour model; n_neighbors=1 requests only the single closest match per query row
classifier = NearestNeighbors(n_neighbors=1)

In [6]:
# Fit the model on the USGS frame (df); later queries return row positions within this frame
classifier.fit(df)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                 radius=1.0)

In [7]:
# Find the nearest USGS quake for every EMSC quake.
# Query with the same feature columns the model was fit on: df has its id moved
# into the index, whereas edf still carries the id as column 0, so passing edf
# whole would add an extra, meaningless feature.
distance, index = classifier.kneighbors(edf[df.columns])

In [8]:
# Smallest nearest-neighbour distance over all queried EMSC quakes, i.e. the best match found
distance.min()

6.348473394446884

In [None]:
# Hand-check the nearest-neighbour pairs: print every pair whose feature-space
# distance is small and whose timestamps are less than one second apart.
# (No convincing matches were found this way; alternate algorithms are tried later.)
MAX_DISTANCE = 20000    # largest kneighbors distance worth inspecting
MAX_TIME_GAP_MS = 1000  # timestamps are in milliseconds, so this is one second

# kneighbors() reports row positions within the frame the model was fit on (df).
# df has had its NA rows dropped, so those positions do not line up with the raw
# USGS_QUAKES list. Rebuild (id, time, lat, lon, mag) tuples from df instead.
usgs_rows = list(df.reset_index().itertuples(index=False, name=None))

for i, dist in enumerate(distance):
    usgs_quake = usgs_rows[index[i][0]]
    emsc_quake = EMSC_QUAKES[i]
    # dist is a length-1 array because n_neighbors=1
    if dist[0] < MAX_DISTANCE and np.absolute(usgs_quake[1] - emsc_quake[1]) < MAX_TIME_GAP_MS:
        print(usgs_quake)
        print(emsc_quake)
        print()

In [10]:
# Line things up by eye: the 30 smallest USGS magnitudes above 0.4 (column 4 holds magnitude)
above_threshold = df[4] > .4
df.loc[above_threshold].sort_values(by=4).head(30)

Unnamed: 0_level_0,1,2,3,4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3323,1582881115290,38.837666,-122.804169,0.41
15680,1584854077700,33.495167,-116.788667,0.41
2823,1582996500100,33.596333,-116.799833,0.41
3627,1582829274500,35.857333,-117.6755,0.41
23875,1586415189480,33.496333,-116.506,0.41
8031,1581863391890,44.531,-111.1025,0.41
788,1583502664250,33.223667,-116.717167,0.41
1105,1583420748580,35.954333,-117.703833,0.41
5856,1582343147980,33.503,-116.770167,0.41
3308,1582886617310,38.827835,-122.793335,0.41


In [11]:
# EMSC frame for the by-hand comparison; built from the same EMSC_QUAKES rows as edf
EMSC_df = pd.DataFrame(EMSC_QUAKES)

In [12]:
# The 30 EMSC quakes with the smallest magnitude (column 4), to compare by eye with the USGS listing
EMSC_df.sort_values(by=4).head(30)

Unnamed: 0,0,1,2,3,4
946,947,1585911218000,45.61,15.21,0.4
883,884,1585928095000,45.75,14.05,0.4
839,840,1585940360000,46.09,13.68,0.4
433,434,1586076338500,45.87,15.98,0.5
231,232,1586142907400,45.9,15.98,0.5
780,781,1585966547000,45.95,14.1,0.6
836,837,1585941571000,45.7,14.42,0.6
1286,1287,1585805621100,45.89,15.96,0.6
59,60,1586213070700,47.53,9.27,0.7
599,600,1586019353900,46.64,9.6,0.7


## thought 2, use the history

Well, nothing so far has produced usable results. I think I'm going to try to line up quakes using the history function (the API's `/history` endpoint).

In [30]:
import requests

In [31]:
# Ask the staging API for USGS history with the arguments 0,0,3000
# (presumably lat, lon, distance - TODO confirm against the API).
# The timeout stops the cell from hanging forever if the server never answers.
requests.get('http://quake-ds-staging.herokuapp.com/history/USGS/0,0,3000', timeout=30).json()

{'message': [{'Oceanic': False,
   'id': 3691,
   'lat': -13.794,
   'lon': -14.6731,
   'mag': 5.3,
   'place': 'Southern Mid-Atlantic Ridge',
   'time': 1582816698668},
  {'Oceanic': False,
   'id': 3697,
   'lat': -13.0278,
   'lon': -15.1094,
   'mag': 4.7,
   'place': 'Southern Mid-Atlantic Ridge',
   'time': 1582816227635},
  {'Oceanic': False,
   'id': 3912,
   'lat': -12.2781,
   'lon': -14.7808,
   'mag': 5.1,
   'place': 'Southern Mid-Atlantic Ridge',
   'time': 1582776671106}],
 'status_code': 200}

In [32]:
# Same history request against the EMSC source, with the arguments 0,0,3000
# (presumably lat, lon, distance - TODO confirm against the API).
# The timeout stops the cell from hanging forever if the server never answers.
requests.get('http://quake-ds-staging.herokuapp.com/history/EMSC/0,0,3000', timeout=30).json()

{'message': [{'id': 8944,
   'lat': -13.79,
   'lon': -14.6,
   'mag': 5.3,
   'place': 'SOUTHERN MID-ATLANTIC RIDGE',
   'time': 1582845498700},
  {'id': 8950,
   'lat': -13.03,
   'lon': -15.11,
   'mag': 4.7,
   'place': 'SOUTHERN MID-ATLANTIC RIDGE',
   'time': 1582845027600},
  {'id': 9057,
   'lat': -12.29,
   'lon': -14.8,
   'mag': 5.1,
   'place': 'SOUTHERN MID-ATLANTIC RIDGE',
   'time': 1582805470900},
  {'id': 19970,
   'lat': -11.75,
   'lon': -13.78,
   'mag': 5.9,
   'place': 'ASCENSION ISLAND REGION',
   'time': 1579271872600},
  {'id': 20219,
   'lat': 0.11,
   'lon': -17.1,
   'mag': 5.5,
   'place': 'NORTH OF ASCENSION ISLAND',
   'time': 1579197498600},
  {'id': 22852,
   'lat': 0.18,
   'lon': -16.93,
   'mag': 5.6,
   'place': 'NORTH OF ASCENSION ISLAND',
   'time': 1578302960400},
  {'id': 23063,
   'lat': -1.2,
   'lon': -12.86,
   'mag': 4.8,
   'place': 'NORTH OF ASCENSION ISLAND',
   'time': 1578160115700},
  {'id': 26206,
   'lat': 1.71,
   'lon': 8.18,
   '

The first three quakes around the area are clearly the same quakes, so there are some matches in the database; the question is just how to find them.
I'm going to try to algorithmically go through all of the EMSC quakes and search for proximity in time.

This proved to be way too slow, probably because it's calling the API 20000 times.

I'm going to go back and try alternate algorithms now.

In [15]:
# Instantiate the nearest-neighbour model using the kd_tree algorithm
classifier = NearestNeighbors(n_neighbors=1, algorithm='kd_tree')
# Fit on the USGS frame
classifier.fit(df)
# Query with the same feature columns the model was fit on; edf still carries
# the id as column 0, which must not be fed in as an extra feature.
distance, index = classifier.kneighbors(edf[df.columns])
# Smallest nearest-neighbour distance, i.e. the best match found
distance.min()

6.348473394446884

In [16]:
# Instantiate the nearest-neighbour model, letting scikit-learn pick the algorithm
classifier = NearestNeighbors(n_neighbors=1, algorithm='auto')
# Fit on the USGS frame
classifier.fit(df)
# Query with the same feature columns the model was fit on; edf still carries
# the id as column 0, which must not be fed in as an extra feature.
distance, index = classifier.kneighbors(edf[df.columns])
# Smallest nearest-neighbour distance, i.e. the best match found
distance.min()

6.348473394446884

In [17]:
# Instantiate the nearest-neighbour model using the ball_tree algorithm
classifier = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
# Fit on the USGS frame
classifier.fit(df)
# Query with the same feature columns the model was fit on; edf still carries
# the id as column 0, which must not be fed in as an extra feature.
distance, index = classifier.kneighbors(edf[df.columns])
# Smallest nearest-neighbour distance, i.e. the best match found
distance.min()

6.348473394446884

In [18]:
# Instantiate the nearest-neighbour model using the brute-force algorithm
classifier = NearestNeighbors(n_neighbors=1, algorithm='brute')
# Fit on the USGS frame
classifier.fit(df)
# Query with the same feature columns the model was fit on; edf still carries
# the id as column 0, which must not be fed in as an extra feature.
distance, index = classifier.kneighbors(edf[df.columns])
# Smallest nearest-neighbour distance, i.e. the best match found.
# NOTE(review): brute force may compute Euclidean distance with an expanded
# dot-product formula that loses precision on values as large as these ~1e12
# timestamps, so a result of exactly 0.0 could be round-off - confirm before trusting it.
distance.min()

0.0

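# Brute force seems to be the best algorithm 

Brute force is the only algorithm that reports a distance of exactly 0. That may be a rounding artifact rather than a true match (the timestamps are around 10^12, so their squared differences can exceed float64 precision), so the zero-distance pairs need checking. I'm going to line up the results and see what happens.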

In [None]:
# Merge distance and index into one array, then a DataFrame.
# np.concatenate upcasts the integer row positions to float alongside the distances.
dist_df = pd.DataFrame(np.concatenate((distance, index), axis=1), columns=['distance', 'USGS_index'])
# Label each result row with the id of the EMSC quake it was computed for.
# The previous hard-coded 1..20000 range only fits a query of exactly 20000 rows.
# The index is left unnamed so that reset_index() still yields an 'index' column.
dist_df = dist_df.set_index(pd.Index(edf[0].values))

In [None]:
# Merge the predictions back with the quake data.
# dist_df's index is meant to carry the EMSC id, so its reset 'index' column is
# matched against the EMSC id in column 0 of edf.
# edf itself is not reset: doing so added a second 'index' column, pandas then
# suffixed the pair to 'index_x'/'index_y', and the later rename of 'index'
# silently did nothing.
df2 = edf.merge(dist_df.reset_index(), left_on=0, right_on='index')

In [None]:
# Separate everything with 0 distance; theoretically these are the same quake
zero_dist = df2[df2['distance'] == 0]
# low_dist was used here (and is renamed in the following cell) without ever being
# defined, which raises NameError on a fresh run.
# NOTE(review): the cutoff is an assumption borrowed from the 20000 limit used in
# the earlier hand-check loop - confirm the intended value.
LOW_DIST_CUTOFF = 20000
low_dist = df2[(df2['distance'] > 0) & (df2['distance'] < LOW_DIST_CUTOFF)]
zero_dist.shape, low_dist.shape

In [None]:
# Give the numbered columns readable names; the same mapping is applied to all three frames
readable_names = {0: 'E_id', 1:'E_time', 2:'E_lat', 3:'E_lon', 4:'E_mag', 'index':'id(also)', 'distance':'distance_from_USGS', 'USGS_index':'USGS_id'}
zero_dist = zero_dist.rename(columns=readable_names)
low_dist = low_dist.rename(columns=readable_names)
df2 = df2.rename(columns=readable_names)

In [None]:
# Check whether any of the zero-distance pairs really match up.
# 'USGS_id' holds what kneighbors() returned: the row *position* inside df, not
# the USGS database id. Joining it against the id column pairs each EMSC quake
# with an unrelated USGS quake, so join on position instead.
usgs_rows = df.reset_index()
usgs_rows['USGS_position'] = np.arange(len(usgs_rows))
merged = (
    zero_dist
    .assign(USGS_id=zero_dist['USGS_id'].astype(int))  # positions were upcast to float earlier
    .merge(usgs_rows, left_on='USGS_id', right_on='USGS_position', how='left')
)
# NOTE(review): the column is called 'time_matches' but compares magnitudes
# (column 4 of the USGS rows) - confirm which comparison was intended.
merged['time_matches'] = merged['E_mag'] == merged[4]
merged[merged['time_matches']]

That's weird. I would have really expected that to work. I'm going to see what I can do with the kneighbors algorithm to get better results.

In [None]:
# radius_neighbors returns, for every query row, the distances and fitted-row
# positions of all neighbours within the estimator's radius (radius=1.0 here,
# as shown in the estimator repr earlier in the notebook).
# Query with the same feature columns the model was fit on; edf still carries
# the id as column 0, which must not be fed in as an extra feature.
classifier.radius_neighbors(edf[df.columns])

## Manual matching 
Well, this seems ridiculous, but I'm not having a lot of luck with matching up quakes as is. I'm going to try writing an algorithm that takes a quake from EMSC, gets all quakes within 10 seconds in either direction, and then matches up the magnitudes and locations.

OK, so that was a little slow. I think I'm going to try to use the dataframe that I already have and replicate the search using pandas.

In [28]:
# There are more EMSC quakes than USGS quakes, so walk the shorter USGS list and
# look each quake up in the EMSC frame.
for quake in USGS_QUAKES:
    # EMSC quakes strictly within 10 seconds (10000 ms) either side of the USGS time
    in_window = (edf[1] > quake[1]-10000) & (edf[1] < quake[1]+10000)
    for reply in edf[in_window].values:
        # Reject candidates whose magnitude, latitude or longitude is too far off
        if not np.absolute(reply[4]-quake[4]) < .3:
            continue
        if not np.absolute(reply[2]-quake[2]) < 1:
            continue
        if not np.absolute(reply[3]-quake[3]) < 1:
            continue
        print(f'USGS Time: {quake[1]}, EMSC Time: {reply[1]}')
        print(f'USGS Mag: {quake[4]}, EMSC Mag: {reply[4]}')
        print(f'USGS Lat: {quake[2]}, EMSC Lat: {reply[2]}')
        print(f'USGS Lon: {quake[3]}, EMSC Lon:{reply[3]}')
        print('-------------')

USGS Time: 1583662453170, EMSC Time: 1583662447700.0
USGS Mag: 2.48, EMSC Mag: 2.5
USGS Lat: 17.9203, EMSC Lat: 17.77
USGS Lon: -66.8825, EMSC Lon:-66.85
-------------
USGS Time: 1583493229990, EMSC Time: 1583493220400.0
USGS Mag: 1.84, EMSC Mag: 2.1
USGS Lat: 38.7891667, EMSC Lat: 38.81
USGS Lon: -122.735, EMSC Lon:-122.82
-------------
USGS Time: 1583232207210, EMSC Time: 1583232201200.0
USGS Mag: 3.1, EMSC Mag: 3.3
USGS Lat: 17.9443, EMSC Lat: 17.76
USGS Lon: -66.9303, EMSC Lon:-66.87
-------------
USGS Time: 1582653390250, EMSC Time: 1582653395700.0
USGS Mag: 1.91, EMSC Mag: 2.1
USGS Lat: 19.2108326, EMSC Lat: 19.21
USGS Lon: -155.4658356, EMSC Lon:-155.4
-------------
USGS Time: 1581323793470, EMSC Time: 1581323789500.0
USGS Mag: 2.55, EMSC Mag: 2.3
USGS Lat: 19.2266674, EMSC Lat: 18.85
USGS Lon: -155.4075012, EMSC Lon:-155.23
-------------
USGS Time: 1584565854320, EMSC Time: 1584565852100.0
USGS Mag: 2.89, EMSC Mag: 2.6
USGS Lat: 40.7270012, EMSC Lat: 40.76
USGS Lon: -112.042167

On the plus side, this is dramatically faster. On the down side, it is still only kind of working.

It's possible that these are the only quakes that actually line up in the data. That would be cool, as it would mean that we have even more data than we thought, but it also means that we should … (**TODO:** this thought was left unfinished.)

In [22]:
# Number of USGS rows remaining in df after the NA rows were dropped
len(df)

23456

In [20]:
# Peek at one raw EMSC row; the tuple follows the SELECT order (id, time, latitude, longitude, magnitude)
EMSC_QUAKES[0]

(1, 1586237049900, 37.8, 16.91, 2.9)

## Some Thoughts for when I get back to this

I checked out the alternate metrics on kneighbors and I think there are a couple that would be worth trying.

I can try to mess with the p value of the current metric and see what happens.

Maybe I should try to write a function which normalizes the data? Perhaps it's having trouble because the time is so many orders of magnitude larger than anything else.