## Part 1. Data modifications
### 1.a Get data and filter earthquakes after 1993

Get data

In [10]:
import pandas as pd
df = pd.read_csv("../../data/processed/processed_data_turkey_1900_2024_datetime.csv")
df['Datetime'] = pd.to_datetime(df['Datetime'], format="%d/%m/%Y %H:%M:%S")
raw_df = df
raw_df

Unnamed: 0,Event ID,Datetime,Latitude,Longitude,Depth(km),xM,MD,ML,Mw,Ms,Mb,Type,Location
0,19000920000001,1900-09-20 00:00:01,37.8000,29.1000,5.0,5.0,5.0,-,,-,-,Ke,DENIZLI (DENIZLI) [North East 2.3 km]
1,19010223000000,1901-02-23 00:00:00,37.9000,27.9000,15.0,4.8,4.7,4.6,4.8,4.6,4.7,Ke,KENGER- (AYDIN) [North East 1.1 km]
2,19010301000001,1901-03-01 00:00:01,38.2000,27.7000,5.0,5.0,5.0,-,,-,-,Ke,YAKACIK-BAYINDIR (IZMIR) [South West 0.8 km]
3,19010401000001,1901-04-01 00:00:01,38.4000,31.4000,5.0,5.0,5.0,-,,-,-,Ke,ATAKENT-AKSEHIR (KONYA) [North East 2.4 km]
4,19010501000001,1901-05-01 00:00:01,37.8000,27.8000,15.0,5.0,5.0,-,,-,-,Ke,KADIKOY- (AYDIN) [South West 2.6 km]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64134,20240428211453,2024-04-28 21:14:53,38.5330,38.0423,5.4,3.9,-,3.8,3.9,-,-,Ke,CIVRIL-YAZIHAN (MALATYA) [South 3.0 km]
64135,20240428213328,2024-04-28 21:33:28,37.0177,29.7712,6.2,4.0,-,4.0,3.9,-,-,Ke,YAZIR-CAVDIR (BURDUR) [South East 2.5 km]
64136,20240429220835,2024-04-29 22:08:35,38.3103,38.2865,5.0,3.2,-,3.2,3.0,-,-,Ke,BOSTANBASI-YESILYURT (MALATYA) [East 3.1 km]
64137,20240430080325,2024-04-30 08:03:25,38.2458,31.5638,5.0,3.6,-,3.5,3.6,-,-,Ke,YAYLABELEN-AKSEHIR (KONYA) [North East 2.6 km]


Get eqs after 1993

In [11]:
df_93s = df.copy()
df_93s = (df_93s[df_93s['Datetime'] >= pd.Timestamp('1993-01-01')]).sort_values('Datetime', ascending=True)
df_93s

Unnamed: 0,Event ID,Datetime,Latitude,Longitude,Depth(km),xM,MD,ML,Mw,Ms,Mb,Type,Location
12289,19930101142459,1993-01-01 14:24:59,38.2500,26.9900,0.0,3.1,3.1,-,,-,-,Ke,EFEMCUKURU-MENDERES (IZMIR) [South East 3.7 km]
12290,19930101204941,1993-01-01 20:49:41,38.2700,26.8500,8.0,3.1,3.1,-,,-,-,Ke,BADEMLER-URLA (IZMIR) [South East 1.9 km]
12291,19930101214621,1993-01-01 21:46:21,35.0200,33.2100,13.0,3.2,-,-,,-,3.2,Ke,KIBRIS-LEFKOSA
12292,19930102014840,1993-01-02 01:48:40,39.2900,28.7000,14.0,3.1,3.1,-,,-,-,Ke,KIZILCIK-SIMAV (KÜTAHYA) [North East 4.3 km]
12294,19930105081346,1993-01-05 08:13:46,39.2800,28.6800,8.0,3.0,3.0,-,,-,-,Ke,KIZILCIK-SIMAV (KÜTAHYA) [East 2.4 km]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64134,20240428211453,2024-04-28 21:14:53,38.5330,38.0423,5.4,3.9,-,3.8,3.9,-,-,Ke,CIVRIL-YAZIHAN (MALATYA) [South 3.0 km]
64135,20240428213328,2024-04-28 21:33:28,37.0177,29.7712,6.2,4.0,-,4.0,3.9,-,-,Ke,YAZIR-CAVDIR (BURDUR) [South East 2.5 km]
64136,20240429220835,2024-04-29 22:08:35,38.3103,38.2865,5.0,3.2,-,3.2,3.0,-,-,Ke,BOSTANBASI-YESILYURT (MALATYA) [East 3.1 km]
64137,20240430080325,2024-04-30 08:03:25,38.2458,31.5638,5.0,3.6,-,3.5,3.6,-,-,Ke,YAYLABELEN-AKSEHIR (KONYA) [North East 2.6 km]


### 1.b Adding cartesian system components to the all earthquakes in dataframe

First let's define the functions that we need for adding cartesian system components

In [12]:
import numpy as np

def lat_lon_to_cartesian(lat, lon, R=6371.0):
    # Convert latitude and longitude from degrees to radians
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    
    # Calculate x, y, z coordinates
    x = R * np.cos(lat_rad) * np.cos(lon_rad)
    y = R * np.cos(lat_rad) * np.sin(lon_rad)
    z = R * np.sin(lat_rad)
    
    return np.array([x, y, z])

def spherical_vector(lat1, lon1, lat2, lon2):
    # Convert points to Cartesian coordinates
    p1 = lat_lon_to_cartesian(lat1, lon1)
    p2 = lat_lon_to_cartesian(lat2, lon2)
    
    # Calculate Cartesian difference vector
    d_cartesian = p2 - p1
    
    # The resulting vector contains x, y, and z components
    return d_cartesian

# This function adds cartesian coordinate system components as new columns to the df
def lat_lon_to_cartesian_vectorized(df, R=6371.0):
    lat_rad = np.radians(df['Latitude'].values)
    lon_rad = np.radians(df['Longitude'].values)
    
    x = R * np.cos(lat_rad) * np.cos(lon_rad)
    y = R * np.cos(lat_rad) * np.sin(lon_rad)
    z = R * np.sin(lat_rad)
    
    df_copy = df.copy()
    df_copy['X'] = x
    df_copy['Y'] = y
    df_copy['Z'] = z
    
    return df_copy

Testing recently defined functions

In [13]:
df_93s = lat_lon_to_cartesian_vectorized(df_93s)
df_93s

Unnamed: 0,Event ID,Datetime,Latitude,Longitude,Depth(km),xM,MD,ML,Mw,Ms,Mb,Type,Location,X,Y,Z
12289,19930101142459,1993-01-01 14:24:59,38.2500,26.9900,0.0,3.1,3.1,-,,-,-,Ke,EFEMCUKURU-MENDERES (IZMIR) [South East 3.7 km],4458.328476,2270.651769,3944.247551
12290,19930101204941,1993-01-01 20:49:41,38.2700,26.8500,8.0,3.1,3.1,-,,-,-,Ke,BADEMLER-URLA (IZMIR) [South East 1.9 km],4462.634766,2259.129271,3945.993776
12291,19930101214621,1993-01-01 21:46:21,35.0200,33.2100,13.0,3.2,-,-,,-,3.2,Ke,KIBRIS-LEFKOSA,4365.354070,2857.695938,3656.076964
12292,19930102014840,1993-01-02 01:48:40,39.2900,28.7000,14.0,3.1,3.1,-,,-,-,Ke,KIZILCIK-SIMAV (KÜTAHYA) [North East 4.3 km],4325.067555,2367.905321,4034.409007
12294,19930105081346,1993-01-05 08:13:46,39.2800,28.6800,8.0,3.0,3.0,-,,-,-,Ke,KIZILCIK-SIMAV (KÜTAHYA) [East 2.4 km],4326.511530,2366.733335,4033.548352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64134,20240428211453,2024-04-28 21:14:53,38.5330,38.0423,5.4,3.9,-,3.8,3.9,-,-,Ke,CIVRIL-YAZIHAN (MALATYA) [South 3.0 km],3924.951926,3071.177681,3968.911820
64135,20240428213328,2024-04-28 21:33:28,37.0177,29.7712,6.2,4.0,-,4.0,3.9,-,-,Ke,YAZIR-CAVDIR (BURDUR) [South East 2.5 km],4415.525476,2525.848674,3835.735164
64136,20240429220835,2024-04-29 22:08:35,38.3103,38.2865,5.0,3.2,-,3.2,3.0,-,-,Ke,BOSTANBASI-YESILYURT (MALATYA) [East 3.1 km],3923.905739,3097.413105,3949.510958
64137,20240430080325,2024-04-30 08:03:25,38.2458,31.5638,5.0,3.6,-,3.5,3.6,-,-,Ke,YAYLABELEN-AKSEHIR (KONYA) [North East 2.6 km],4263.308197,2619.093085,3943.880783


### 1.c Filter earthquake function which takes an earthquake filters it with space-time and magnitude and returns past earthquakes, event earthquake

In [14]:
# TODO: If we are going to remove all aftershock earthquakes you should do this in before this step.

In [15]:
from src.util import earthquake_filter_utils

def space_time_filter(df, event_id, radius=10, past_years=30, num_earthquakes=30):
    filtered_df = earthquake_filter_utils.distance_filter(df, event_id, radius)
    filtered_df = earthquake_filter_utils.time_filter(filtered_df, event_id, past_years)
    filtered_df = earthquake_filter_utils.past_earthquakes_filter(filtered_df, event_id, num_earthquakes)
    return filtered_df.iloc[:-1], filtered_df.iloc[-1]  # past_s_earthquakes, target earthquake

# IMPORTANT NOTE
## This function uses space_time_filter returns the data we need for DL algorithms
### Apply general filters to df before using this function (like earthquakes after 1993) but apply specific filters (like spacetime) after that function 
### 1.d get a simple sample

In [16]:
def get_row_property(df, idx_array):
    event_id = df.iloc[idx_array]["Event ID"]
    magnitude = df.iloc[idx_array]["xM"]
    depth = df.iloc[idx_array]["Depth(km)"]
    x = df.iloc[idx_array]["X"]
    y = df.iloc[idx_array]["Y"]
    z = df.iloc[idx_array]["Z"]
    date = df.iloc[idx_array]["Datetime"]
    return np.array([event_id, magnitude, depth, x, y, z, date])

In [17]:

def get_sample_from_eid(df, event_id, radius=10, past_years=30, num_earthquakes=30, big_eq_min_magnitude=5.5):
    sample_of_eid, target_earthquake = space_time_filter(df, event_id, radius, past_years, num_earthquakes)
    
    # Extract the relevant columns to NumPy arrays (skipping Event ID column)
    properties = sample_of_eid[['xM', 'Depth(km)', 'X', 'Y', 'Z', 'Datetime']].values
    
    # Calculate the differences between consecutive rows (axis=0 for row-wise subtraction)
    differences = np.diff(properties, axis=0)
    
    # Extract the datetime column
    datetime_col = properties[:, 5]

    # Calculate the differences between consecutive datetime values
    time_differences = datetime_col[1:] - datetime_col[:-1]

    # Convert the time differences to hours
    time_diffs_hours = np.array([td.total_seconds() / 3600. for td in time_differences])
    
    # Replace the last column in 'differences' with the time differences converted to hours
    differences[:, 5] = time_diffs_hours

    # Create the output matrix
    X_eid = differences

    # Determine y_eid: 1 if target earthquake magnitude >= big_eq_min_magnitude, otherwise 0
    y_eid = 1 if target_earthquake['xM'] >= big_eq_min_magnitude else 0
    
    return X_eid, y_eid

In [18]:
example_one_sample = get_sample_from_eid(df_93s, 19990817000137)
X_one_instance, y_of_that_instance = example_one_sample
example_one_sample

(array([[0.0, 0.0, 1.0493304195033488, -0.3678815424818822,
         -0.8425638220815017, 0.32555555555555554],
        [0.0, 3.0, 0.6286533164766297, 0.36207592593245863,
         -0.8426904529842432, 0.04083333333333333],
        [0.0, -7.0, -3.782642915109136, 3.654858349093047,
         1.6852542750657449, 3.923888888888889],
        [0.10000000000000009, -1.0, 0.0, 0.0, 0.0, 1.3633333333333333],
        [-0.10000000000000009, 8.0, -1.044101673584919,
         -2.547606131098746, 2.526931424175018, 157.80666666666667],
        [0.3999999999999999, 0.0, 2.514875242063681, 1.4507936576919747,
         -3.3694952462565197, 0.034722222222222224],
        [-0.2999999999999998, -3.0, 2.9403952039774595,
         -2.1960433604576792, -1.6855075112043778, 80.70416666666667],
        [0.0, 3.0, 0.20759592667491233, 1.0923987778373885,
         -0.8429436377791717, 108.75861111111111],
        [0.0, -8.0, -3.984305489045255, -0.3513061553535408,
         4.2134521365751425, 113.0388888888888

## Part 2. Collect samples
### 2.a Removing all aftershock events

Important note: `remove_aftershocks_from_eq()` function raises ValueError if given earthquake magnitude is smaller than minimum magnitude.

In [19]:
def get_aftershocks_from_eq(df, event_id, radius=10, big_eq_min_magnitude=5.5):
    event_mag = earthquake_filter_utils.get_value_from_eid(df, event_id, "xM")
    if event_mag < big_eq_min_magnitude:
        raise ValueError(f"Earthquake magnitude should minimum {big_eq_min_magnitude}. Given event_id={event_id}, xM={event_mag}")
    else:
         # Calculate the aftershock duration in hours (using the empirical formula)
        aftershock_duration_in_hours = 10 ** (event_mag - 3.0) * 24
        
        # Get the datetime of the main event
        event_datetime = earthquake_filter_utils.get_value_from_eid(df, event_id, "Datetime")
        
        # FIXME: Distance parameter might be change with a formula. 
        
        # Filter earthquakes by distance from the event
        filtered_df = earthquake_filter_utils.distance_filter(df, event_id, radius)
    
        # Calculate the time window for aftershocks
        end_time = event_datetime + pd.Timedelta(hours=aftershock_duration_in_hours)
        
        # Return all earthquakes that occurred after the main event but before the end of the aftershock window
        aftershocks_df = filtered_df[(filtered_df['Datetime'] > event_datetime) & (filtered_df['Datetime'] <= end_time)]
        return aftershocks_df

Using `get_aftershocks_from_eq()` function now we can remove all aftershocks from dataframe.

In [20]:
def remove_all_aftershocks_from_data(df, radius=10, big_eq_min_magnitude=5.5):
     # Create a copy of the original DataFrame to avoid modifying it
    df_copy = df.copy()

    # Filter large earthquakes based on the magnitude threshold
    df_large_eqs = df_copy[df_copy["xM"] >= big_eq_min_magnitude]
    
    # Create an empty DataFrame to store all aftershocks
    df_all_aftershocks = pd.DataFrame(columns=df.columns)
    
    # Loop through each large earthquake and use the helper function to find aftershocks
    for index, large_eq in df_large_eqs.iterrows():
        event_id = large_eq["Event ID"]
        
        # Use the helper function get_aftershocks_from_eq to find aftershocks for the current large earthquake
        aftershocks_df = get_aftershocks_from_eq(df_copy, event_id, radius, big_eq_min_magnitude)
        
        # Concatenate the identified aftershocks into the df_all_aftershocks DataFrame
        df_all_aftershocks = pd.concat([df_all_aftershocks, aftershocks_df], ignore_index=True)
    
    # Remove all rows from df_copy that are in df_all_aftershocks
    df_without_aftershocks = df_copy[~df_copy["Event ID"].isin(df_all_aftershocks["Event ID"])]
    
    return df_without_aftershocks

In [21]:
df_test = df_93s.copy()
df_test # Before

Unnamed: 0,Event ID,Datetime,Latitude,Longitude,Depth(km),xM,MD,ML,Mw,Ms,Mb,Type,Location,X,Y,Z
12289,19930101142459,1993-01-01 14:24:59,38.2500,26.9900,0.0,3.1,3.1,-,,-,-,Ke,EFEMCUKURU-MENDERES (IZMIR) [South East 3.7 km],4458.328476,2270.651769,3944.247551
12290,19930101204941,1993-01-01 20:49:41,38.2700,26.8500,8.0,3.1,3.1,-,,-,-,Ke,BADEMLER-URLA (IZMIR) [South East 1.9 km],4462.634766,2259.129271,3945.993776
12291,19930101214621,1993-01-01 21:46:21,35.0200,33.2100,13.0,3.2,-,-,,-,3.2,Ke,KIBRIS-LEFKOSA,4365.354070,2857.695938,3656.076964
12292,19930102014840,1993-01-02 01:48:40,39.2900,28.7000,14.0,3.1,3.1,-,,-,-,Ke,KIZILCIK-SIMAV (KÜTAHYA) [North East 4.3 km],4325.067555,2367.905321,4034.409007
12294,19930105081346,1993-01-05 08:13:46,39.2800,28.6800,8.0,3.0,3.0,-,,-,-,Ke,KIZILCIK-SIMAV (KÜTAHYA) [East 2.4 km],4326.511530,2366.733335,4033.548352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64134,20240428211453,2024-04-28 21:14:53,38.5330,38.0423,5.4,3.9,-,3.8,3.9,-,-,Ke,CIVRIL-YAZIHAN (MALATYA) [South 3.0 km],3924.951926,3071.177681,3968.911820
64135,20240428213328,2024-04-28 21:33:28,37.0177,29.7712,6.2,4.0,-,4.0,3.9,-,-,Ke,YAZIR-CAVDIR (BURDUR) [South East 2.5 km],4415.525476,2525.848674,3835.735164
64136,20240429220835,2024-04-29 22:08:35,38.3103,38.2865,5.0,3.2,-,3.2,3.0,-,-,Ke,BOSTANBASI-YESILYURT (MALATYA) [East 3.1 km],3923.905739,3097.413105,3949.510958
64137,20240430080325,2024-04-30 08:03:25,38.2458,31.5638,5.0,3.6,-,3.5,3.6,-,-,Ke,YAYLABELEN-AKSEHIR (KONYA) [North East 2.6 km],4263.308197,2619.093085,3943.880783


In [22]:
df_test = remove_all_aftershocks_from_data(df_test)
df_test # After

  df_all_aftershocks = pd.concat([df_all_aftershocks, aftershocks_df], ignore_index=True)


Unnamed: 0,Event ID,Datetime,Latitude,Longitude,Depth(km),xM,MD,ML,Mw,Ms,Mb,Type,Location,X,Y,Z
12289,19930101142459,1993-01-01 14:24:59,38.2500,26.9900,0.0,3.1,3.1,-,,-,-,Ke,EFEMCUKURU-MENDERES (IZMIR) [South East 3.7 km],4458.328476,2270.651769,3944.247551
12290,19930101204941,1993-01-01 20:49:41,38.2700,26.8500,8.0,3.1,3.1,-,,-,-,Ke,BADEMLER-URLA (IZMIR) [South East 1.9 km],4462.634766,2259.129271,3945.993776
12291,19930101214621,1993-01-01 21:46:21,35.0200,33.2100,13.0,3.2,-,-,,-,3.2,Ke,KIBRIS-LEFKOSA,4365.354070,2857.695938,3656.076964
12292,19930102014840,1993-01-02 01:48:40,39.2900,28.7000,14.0,3.1,3.1,-,,-,-,Ke,KIZILCIK-SIMAV (KÜTAHYA) [North East 4.3 km],4325.067555,2367.905321,4034.409007
12294,19930105081346,1993-01-05 08:13:46,39.2800,28.6800,8.0,3.0,3.0,-,,-,-,Ke,KIZILCIK-SIMAV (KÜTAHYA) [East 2.4 km],4326.511530,2366.733335,4033.548352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64133,20240428163059,2024-04-28 16:30:59,38.0518,36.6597,5.0,3.0,-,3.0,2.8,-,-,Ke,GUCUKSU-GOKSUN (KAHRAMANMARAS) [South East 0....,4024.510670,2995.376693,3926.916517
64134,20240428211453,2024-04-28 21:14:53,38.5330,38.0423,5.4,3.9,-,3.8,3.9,-,-,Ke,CIVRIL-YAZIHAN (MALATYA) [South 3.0 km],3924.951926,3071.177681,3968.911820
64135,20240428213328,2024-04-28 21:33:28,37.0177,29.7712,6.2,4.0,-,4.0,3.9,-,-,Ke,YAZIR-CAVDIR (BURDUR) [South East 2.5 km],4415.525476,2525.848674,3835.735164
64137,20240430080325,2024-04-30 08:03:25,38.2458,31.5638,5.0,3.6,-,3.5,3.6,-,-,Ke,YAYLABELEN-AKSEHIR (KONYA) [North East 2.6 km],4263.308197,2619.093085,3943.880783
