In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
warnings.filterwarnings("ignore")

2023-10-29 02:44:03.367215: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Rows are kept only when their "YYYY-MM" month string compares strictly greater than this value.
TIME_CUT = "1973-01"
# Magnitude threshold handed to add_features (as mag_th) when the label is built.
MAG_TH = 5
# Minimum per-cell event count handed to filter_regions (as threshold).
TH = 150
# Grid step used to bucket latitude/longitude into cells (presumably degrees — TODO confirm).
GEO_SPLIT = 1

In [3]:
# Read the raw catalogue and discard every row that has a missing value.
df = pd.read_csv("../data/usgs_data_small.csv").dropna()
df.head()

Unnamed: 0,time,longitude,latitude,depth,mag
31,1728-02-08 12:54:00,-70.979,42.842,0.0,4.0
146,1843-02-08 16:14:00,-61.17,16.73,40.0,8.5
179,1853-09-02 01:24:00,-120.8,36.25,10.0,6.3
222,1860-04-17 04:54:00,-120.95,36.35,10.0,6.0
225,1860-10-17 12:39:00,-70.1,47.5,10.0,6.1


In [4]:
# The first seven characters of the raw timestamp string are its "YYYY-MM" month.
month_prefix = df["time"].str[:7]
# Lexicographic comparison works because the month strings are zero-padded.
recent = month_prefix > TIME_CUT
df = df.loc[recent].assign(
    time=lambda kept: pd.to_datetime(kept["time"], format="mixed"),
    time_new=pd.to_datetime(month_prefix.loc[recent], format="%Y-%m"),
)
df

Unnamed: 0,time,longitude,latitude,depth,mag,time_new
71255,1973-02-01 01:33:03.700,167.175000,-15.565000,45.000,5.00,1973-02-01
71256,1973-02-01 03:18:46.460,-155.422667,19.437500,5.750,2.95,1973-02-01
71257,1973-02-01 03:34:27.530,-155.550167,19.394833,14.180,2.48,1973-02-01
71258,1973-02-01 03:58:40.100,-91.480000,-0.628000,33.000,4.70,1973-02-01
71259,1973-02-01 05:10:23.000,-80.534000,-4.033000,75.000,4.90,1973-02-01
...,...,...,...,...,...,...
4451672,2023-10-01 01:27:41.780,-179.015000,51.730000,10.980,1.22,2023-10-01
4451673,2023-10-01 01:30:30.046,152.699600,-4.342900,65.434,4.90,2023-10-01
4451674,2023-10-01 01:33:05.281,-152.662000,60.099900,109.400,1.90,2023-10-01
4451675,2023-10-01 01:41:39.645,-175.317600,51.902800,75.100,2.10,2023-10-01


In [5]:
# Timestamps below which 80% and 90% of the events fall.
tuple(df["time"].quantile(q) for q in (0.8, 0.9))

(Timestamp('2018-10-18 07:15:48.347200'),
 Timestamp('2021-01-06 23:41:01.441600'))

In [6]:
# Snap each coordinate down to the lower edge of its GEO_SPLIT-sized grid cell.
for coord in ("latitude", "longitude"):
    df[f"{coord}_disc"] = (df[coord] // GEO_SPLIT * GEO_SPLIT).astype(int)
# Cell identifier of the form "<lat>_<lon>".
df["pos"] = df["latitude_disc"].astype(str) + "_" + df["longitude_disc"].astype(str)

In [7]:
def filter_regions(
    df: pd.DataFrame, threshold: int, train_quantile: float = 0.8
) -> pd.DataFrame:
    """Keep only the grid cells that have enough events in the early part of the data.

    Events are counted per ``pos`` using only rows whose ``time`` is at or
    before the ``train_quantile`` quantile of ``time``. Cells with at least
    ``threshold`` such events are kept, and *all* of their rows (including
    those after the cutoff) are returned.

    Args:
        df: Events with at least the columns ``time``, ``pos`` and ``mag``.
        threshold: Minimum number of non-null ``mag`` values a cell needs
            within the counting window to be kept.
        train_quantile: Quantile of ``time`` that ends the counting window.
            Defaults to 0.8, the value that was previously hard-coded.

    Returns:
        The rows of ``df`` whose ``pos`` belongs to a kept cell. The input
        frame is not modified.
    """
    cutoff = df["time"].quantile(train_quantile)
    counting_window = df[df["time"] <= cutoff]
    # count() ignores missing magnitudes, matching the original behaviour.
    events_per_cell = counting_window.groupby("pos")["mag"].count()
    dense_cells = events_per_cell.index[events_per_cell >= threshold]
    return df[df["pos"].isin(dense_cells)]

In [8]:
# Print the frame shape before and after dropping grid cells with fewer than TH counted events.
print(df.shape)
df = filter_regions(df, TH)
print(df.shape)

(4224878, 9)
(3972931, 9)


In [9]:
# Inspect the frame after region filtering (the notebook display elides the middle rows).
df

Unnamed: 0,time,longitude,latitude,depth,mag,time_new,latitude_disc,longitude_disc,pos
71255,1973-02-01 01:33:03.700,167.175000,-15.565000,45.000,5.00,1973-02-01,-16,167,-16_167
71256,1973-02-01 03:18:46.460,-155.422667,19.437500,5.750,2.95,1973-02-01,19,-156,19_-156
71257,1973-02-01 03:34:27.530,-155.550167,19.394833,14.180,2.48,1973-02-01,19,-156,19_-156
71260,1973-02-01 05:43:44.790,-120.036000,47.629500,7.927,1.90,1973-02-01,47,-121,47_-121
71261,1973-02-01 06:04:46.390,-119.311000,46.847667,-0.314,0.60,1973-02-01,46,-120,46_-120
...,...,...,...,...,...,...,...,...,...
4451672,2023-10-01 01:27:41.780,-179.015000,51.730000,10.980,1.22,2023-10-01,51,-180,51_-180
4451673,2023-10-01 01:30:30.046,152.699600,-4.342900,65.434,4.90,2023-10-01,-5,152,-5_152
4451674,2023-10-01 01:33:05.281,-152.662000,60.099900,109.400,1.90,2023-10-01,60,-153,60_-153
4451675,2023-10-01 01:41:39.645,-175.317600,51.902800,75.100,2.10,2023-10-01,51,-176,51_-176


In [10]:
# USGS FAQ on negative magnitudes:
# https://www.usgs.gov/faqs/how-can-earthquake-have-negative-magnitude
# NOTE(review): a minimum of exactly -9.99 looks like it could be a placeholder
# rather than a measured magnitude — confirm before using it as a feature.
df["mag"].min()

-9.99

In [11]:
# USGS note on negative depths:
# https://www.usgs.gov/observatories/hvo/news/volcano-watch-why-do-some-earthquakes-have-negative-depths
# NOTE(review): a minimum of exactly -10.0 may be a default/placeholder depth — confirm.
df["depth"].min()

-10.0

In [12]:
def add_features(df, mag_th):
    """Add a per-region ``diff_days`` feature and a next-month ``label`` to every event.

    For each region (``pos``) the events are ordered by ``time`` and
    ``diff_days`` is set to the whole number of days since the previous event
    in that region (NaN for the region's first event). Every event in month
    ``time_new`` then receives ``label`` 1 when the same region has an event
    with ``mag >= mag_th`` in the following calendar month, and 0 otherwise.

    A following month with no events at all yields label 0. The original
    comparison ``max_mag < mag_th`` was False for the NaN maximum of an empty
    month, so such months were labelled 1 by mistake. Note that the last
    month present in the data also gets 0, although its real outcome is
    unknown.

    Args:
        df: Events with the columns ``time``, ``time_new`` (month of the
            event), ``pos`` and ``mag``. The frame is not modified.
        mag_th: Magnitude at or above which a next-month event makes the
            label positive.

    Returns:
        A new DataFrame with the input columns plus ``diff_days`` and
        ``label``. Regions appear in order of first appearance in ``df`` and
        rows are time-ordered within each region. An input without any
        usable rows yields an empty frame carrying both new columns.
    """
    # Build a new frame so the caller's DataFrame is left untouched.
    events = df.assign(
        time=pd.to_datetime(df["time"], format="mixed"),
        time_new=pd.to_datetime(df["time_new"]),
    )
    labelled = []
    # sort=False keeps regions in order of first appearance, like df["pos"].unique().
    for _, region in events.groupby("pos", sort=False):
        region = region.sort_values("time")
        region = region.assign(diff_days=region["time"].diff().dt.days)
        # Strongest magnitude of each month, computed once per region.
        monthly_max = region.groupby("time_new")["mag"].max()
        for month, month_rows in region.groupby("time_new", sort=False):
            next_max = monthly_max.get(month + pd.DateOffset(months=1))
            # No following month (None) or an all-NaN month compares False -> label 0.
            has_strong_event = next_max is not None and next_max >= mag_th
            labelled.append(month_rows.assign(label=int(has_strong_event)))
    if not labelled:
        # pd.concat([]) raises; return an empty frame with the expected columns instead.
        return events.iloc[0:0].assign(
            diff_days=pd.Series(dtype="float64"),
            label=pd.Series(dtype="int64"),
        )
    return pd.concat(labelled)

In [14]:
# Attach diff_days and the next-month label (threshold MAG_TH) to every event, then display the result.
df = add_features(df, MAG_TH)
df

Unnamed: 0,time,longitude,latitude,depth,mag,time_new,latitude_disc,longitude_disc,pos,diff_days,label
71255,1973-02-01 01:33:03.700,167.1750,-15.5650,45.000,5.0,1973-02-01,-16,167,-16_167,,0
71618,1973-02-15 16:40:55.500,167.1410,-15.1390,64.000,4.8,1973-02-01,-16,167,-16_167,14.0,0
72568,1973-03-27 20:55:27.900,167.2960,-15.0080,135.000,4.9,1973-03-01,-16,167,-16_167,40.0,1
72894,1973-04-08 13:41:02.000,167.2180,-15.7790,35.000,6.4,1973-04-01,-16,167,-16_167,11.0,1
73214,1973-04-21 21:30:35.700,167.2830,-15.8820,33.000,5.3,1973-04-01,-16,167,-16_167,13.0,1
...,...,...,...,...,...,...,...,...,...,...,...
4348175,2023-01-24 06:52:23.260,-97.8194,37.2064,5.000,2.6,2023-01-01,37,-98,37_-98,37.0,0
4355466,2023-02-11 03:29:01.909,-97.8680,37.5394,5.000,3.2,2023-02-01,37,-98,37_-98,17.0,0
4361554,2023-02-26 06:48:59.171,-97.2213,37.7068,2.198,2.3,2023-02-01,37,-98,37_-98,15.0,0
4364525,2023-03-05 20:35:51.407,-97.8587,37.0150,5.000,2.2,2023-03-01,37,-98,37_-98,7.0,0


In [15]:
# Persist the feature table; index=False leaves the DataFrame index out of the CSV.
df.to_csv("../data/with_features.csv", index=False)