In [53]:
import pandas as pd
from math import radians, sin, cos, sqrt, atan2
from datetime import datetime, timedelta

# Location of the raw earthquake catalogue CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
file_path = '/home/halil/Documents/Code/AnalysisOfEarthquakeCatalogues/data/raw/raw_data_turkey_1900_2024.csv'

# Read the whole catalogue into memory; the later cells all work on `df`.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five records to check how the columns were parsed.
print(df.head(n=5))

   No            Event ID        Date  Origin Time  Latitude  Longitude  \
0       1  19000920000001  1900-09-20  00:00:01.00      37.8       29.1   
1       2  19010223000000  1901-02-23  00:00:00.00      37.9       27.9   
2       3  19010301000001  1901-03-01  00:00:01.00      38.2       27.7   
3       4  19010401000001  1901-04-01  00:00:01.00      38.4       31.4   
4       5  19010501000001  1901-05-01  00:00:01.00      37.8       27.8   

   Depth(km)   xM   MD   ML   Mw   Ms   Mb Type  \
0        5.0  5.0  5.0    -  NaN    -    -   Ke   
1       15.0  4.8  4.7  4.6  4.8  4.6  4.7   Ke   
2        5.0  5.0  5.0    -  NaN    -    -   Ke   
3        5.0  5.0  5.0    -  NaN    -    -   Ke   
4       15.0  5.0  5.0    -  NaN    -    -   Ke   

                                        Location  
0         DENIZLI (DENIZLI) [North East  2.3 km]  
1           KENGER- (AYDIN) [North East  1.1 km]  
2  YAKACIK-BAYINDIR (IZMIR) [South West  0.8 km]  
3   ATAKENT-AKSEHIR (KONYA) [North Eas

In [69]:
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering

# Clustering configuration: how many clusters to produce, and how many
# neighbours each point contributes to the nearest-neighbour affinity graph.
n_clusters = 20
n_neighbors = 10  # must not exceed the number of available data points

# Spectral clustering on a nearest-neighbours affinity; random_state is pinned to 42.
sc = SpectralClustering(
    n_clusters=n_clusters,
    affinity='nearest_neighbors',
    n_neighbors=n_neighbors,
    random_state=42,
)

# Fit on the (Latitude, Longitude) pairs and store each event's label in a
# new 'Cluster' column of the catalogue.
df['Cluster'] = sc.fit_predict(df[['Latitude', 'Longitude']])

In [64]:
# Order the catalogue from the largest to the smallest Mw value.
df_sorted = df.sort_values('Mw', ascending=False)

# Show the five rows at the top of that ordering.
print(df_sorted.head(n=5))

       No            Event ID        Date  Origin Time  Latitude  Longitude  \
487       488  19391226235720  1939-12-26  23:57:20.90   39.8000    39.5100   
59837   59838  20230206011732  2023-02-06  01:17:32.67   37.1757    37.0850   
59954   59955  20230206102447  2023-02-06  10:24:47.88   38.0818    37.1773   
567       568  19431126222040  1943-11-26  22:20:40.80   41.0537    33.8098   
18878   18879  19990817000137  1999-08-17  00:01:37.60   40.7600    29.9700   

       Depth(km)   xM   MD   ML   Mw   Ms   Mb Type  \
487         20.0  7.9  7.2  7.2  7.7  7.9  7.1   Ke   
59837        5.5  7.7    -  7.5  7.7    -    -   Ke   
59954        5.0  7.6    -  7.4  7.6    -    -   Ke   
567         10.0  7.5  6.7  6.7  7.5  7.2  6.6   Ke   
18878       18.0  7.4  6.7    -  7.4    -    -   Ke   

                                                Location  Cluster  
487           KURUTILEK- (ERZINCAN) [North East  3.0 km]        1  
59837  YAMACOBA-SEHITKAMIL (GAZIANTEP) [North West  0...  

In [65]:
def select_earthquakes(df, t, f, magnitude_column='Mw', threshold=5, random_state=42):
    """Randomly pick `t` strong and `f` weak earthquakes from a catalogue.

    The magnitude column is converted to numbers (non-numeric entries become
    NaN and those rows are ignored). Events strictly above `threshold` form
    the "strong" pool and events strictly below it the "weak" pool; events
    whose magnitude equals `threshold` exactly belong to neither pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Earthquake catalogue. It is not modified.
    t : int
        Number of events to draw from the strong pool.
    f : int
        Number of events to draw from the weak pool.
    magnitude_column : str, default 'Mw'
        Name of the column holding the magnitude.
    threshold : float, default 5
        Magnitude separating the two pools.
    random_state : int, default 42
        Seed passed to `DataFrame.sample` for both draws.

    Returns
    -------
    pandas.DataFrame
        The `t` strong events followed by the `f` weak events, keeping their
        original index labels and with a numeric magnitude column.

    Raises
    ------
    ValueError
        If either pool holds fewer events than requested.
    """
    # Convert on a copy so the caller's frame keeps its original column.
    df_cleaned = df.copy()
    df_cleaned[magnitude_column] = pd.to_numeric(df[magnitude_column], errors='coerce')

    # Rows without a usable magnitude cannot be compared against the threshold.
    df_cleaned = df_cleaned.dropna(subset=[magnitude_column])

    magnitudes = df_cleaned[magnitude_column]
    greater_pool = df_cleaned[magnitudes > threshold]
    less_pool = df_cleaned[magnitudes < threshold]

    # Fail early with a message that says which pool is too small.
    if t > len(greater_pool):
        raise ValueError(
            f"Requested {t} earthquakes with {magnitude_column} > {threshold}, "
            f"but only {len(greater_pool)} are available"
        )
    if f > len(less_pool):
        raise ValueError(
            f"Requested {f} earthquakes with {magnitude_column} < {threshold}, "
            f"but only {len(less_pool)} are available"
        )

    sample_greater = greater_pool.sample(n=t, random_state=random_state)
    sample_less = less_pool.sample(n=f, random_state=random_state)

    # Strong events first, weak events second.
    return pd.concat([sample_greater, sample_less])

# Example usage: draw t events above magnitude 5 and f events below it.
t, f = 20, 80
selected_earthquakes = select_earthquakes(df, t, f, magnitude_column='Mw')

# Print the sampled events.
print(selected_earthquakes)


       No            Event ID        Date  Origin Time  Latitude  Longitude  \
1909     1910  19710223194123  1971-02-23  19:41:23.00   39.6200    27.3200   
290       291  19270605082455  1927-06-05  08:24:55.60   36.1900    31.0800   
119       120  19171227074200  1917-12-27  07:42:00.20   40.5000    26.0000   
274       275  19261022164405  1926-10-22  16:44:05.00   40.7000    43.7000   
488       489  19391227223413  1939-12-27  22:34:13.30   40.8300    36.8000   
...       ...             ...         ...          ...       ...        ...   
64092   64093  20240418233539  2024-04-18  23:35:39.02   40.0642    35.9605   
56143   56144  20200601105618  2020-06-01  10:56:18.62   36.6937    26.6853   
1679     1680  19700405122947  1970-04-05  12:29:47.70   39.3100    29.1800   
54957   54958  20191126132321  2019-11-26  13:23:21.66   36.4272    27.2350   
56216   56217  20200618090854  2020-06-18  09:08:54.11   35.9480    35.8838   

       Depth(km)   xM   MD   ML   Mw   Ms   Mb Type

In [71]:
def select_earthquakes_from_cluster(df, selected_earthquakes, s):
    """Sample up to `s` earlier same-cluster events for each selected earthquake.

    For every row of `selected_earthquakes`, the catalogue `df` is filtered to
    events with the same 'Cluster' value whose 'Date' is strictly earlier than
    the row's 'Date'. Only the 'Date' column is compared, so events on the
    same calendar day are never included. Up to `s` of the matching events
    are drawn with a fixed seed of 42.

    Parameters
    ----------
    df : pandas.DataFrame
        Full catalogue; must contain 'Cluster' and 'Date' columns.
    selected_earthquakes : pandas.DataFrame
        Target events; must contain 'Cluster' and 'Date' columns.
    s : int
        Maximum number of events to draw per target event.

    Returns
    -------
    pandas.DataFrame
        All drawn events stacked in target order with a fresh 0..n-1 index.
        Targets with no earlier same-cluster event contribute nothing; if no
        target contributes anything, an empty DataFrame is returned.
    """
    # Parse the catalogue dates once rather than once per selected event.
    catalogue_dates = pd.to_datetime(df['Date'])

    selected_with_cluster = []
    for _, row in selected_earthquakes.iterrows():
        in_same_cluster = df['Cluster'] == row['Cluster']
        happened_before = catalogue_dates < pd.to_datetime(row['Date'])
        cluster_earthquakes = df[in_same_cluster & happened_before]

        if cluster_earthquakes.empty:
            continue

        # Cap the draw at the number of available candidates.
        n_draw = min(s, len(cluster_earthquakes))
        selected_with_cluster.append(
            cluster_earthquakes.sample(n=n_draw, random_state=42)
        )

    if not selected_with_cluster:
        return pd.DataFrame()
    return pd.concat(selected_with_cluster, ignore_index=True)

# Example usage: draw at most s earlier same-cluster events per selected earthquake.
s = 30
resulting_earthquakes = select_earthquakes_from_cluster(
    df=df,
    selected_earthquakes=selected_earthquakes,
    s=s,
)

# Print the stacked samples.
print(resulting_earthquakes)


      No            Event ID        Date  Origin Time  Latitude  Longitude  \
0        687  19510303214124  1951-03-03  21:41:24.90   41.9600    44.4800   
1        112  19161114135403  1916-11-14  13:54:03.40   40.8000    44.4000   
2        716  19521003113018  1952-10-03  11:30:18.00   41.3000    43.6000   
3        714  19520929085647  1952-09-29  08:56:47.20   41.2400    43.9700   
4       1555  19691215155328  1969-12-15  15:53:28.00   41.6000    43.8000   
...      ...             ...         ...          ...       ...        ...   
2809   52879  20170727230447  2017-07-27  23:04:47.35   38.8033    26.4810   
2810    6214  19820704224904  1982-07-04  22:49:04.90   38.7000    26.8600   
2811   30089  20050718202905  2005-07-18  20:29:05.60   38.3742    26.2672   
2812   11678  19920415055609  1992-04-15  05:56:09.50   39.1100    26.4400   
2813   49084  20140317202505  2014-03-17  20:25:05.84   38.6423    26.1233   

      Depth(km)   xM   MD   ML   Mw   Ms   Mb Type  \
0        

In [75]:
def create_earthquake_matrix(df, selected_earthquakes, s, magnitude_column='Mw', output_path=None):
    """Build one row per selected earthquake and save the result as CSV once.

    Each row has the layout
    ``[magnitude_flag, selected_no, sampled_no_1, ..., sampled_no_k]`` where

    * ``magnitude_flag`` is 1 if the selected event's magnitude is greater
      than 5, otherwise 0;
    * ``selected_no`` is the selected event's value in the ``'No    '`` column
      (the column name includes four trailing spaces);
    * ``sampled_no_*`` are the ``'No    '`` values of up to `s` events from
      the same cluster whose 'Date' is strictly earlier, drawn with seed 42.

    Rows with fewer than `s` sampled events are padded with NaN when the list
    of rows is turned into a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Full catalogue with 'No    ', 'Cluster' and 'Date' columns.
    selected_earthquakes : pandas.DataFrame
        Target events with the same columns plus `magnitude_column`.
    s : int
        Maximum number of earlier same-cluster events per target.
    magnitude_column : str, default 'Mw'
        Column used to compute the magnitude flag.
    output_path : str or path-like, optional
        Where to write the CSV. When omitted and there is at least one row,
        the user is asked for a path once; a blank answer skips saving.

    Returns
    -------
    pandas.DataFrame
        The matrix, one row per selected earthquake.
    """
    # Parse the catalogue dates once rather than once per selected event.
    catalogue_dates = pd.to_datetime(df['Date'])

    matrix = []
    for _, row in selected_earthquakes.iterrows():
        magnitude_flag = 1 if row[magnitude_column] > 5 else 0

        # Catalogue sequence number of the selected earthquake.
        selected_no = row['No    ']

        in_same_cluster = df['Cluster'] == row['Cluster']
        happened_before = catalogue_dates < pd.to_datetime(row['Date'])
        cluster_earthquakes = df[in_same_cluster & happened_before]

        # Cap the draw at the number of available candidates (possibly zero).
        sampled_earthquakes = cluster_earthquakes.sample(
            n=min(s, len(cluster_earthquakes)), random_state=42
        )

        matrix.append([magnitude_flag, selected_no] + sampled_earthquakes['No    '].tolist())

    matrix_df = pd.DataFrame(matrix)

    # Save once, after all rows are built. Previously the prompt and the write
    # sat inside the loop, so they ran again for every selected earthquake.
    if output_path is None and matrix:
        output_path = input("Enter the full path where you want to save the CSV file (e.g., /home/user/Downloads/batches.csv): ").strip()

    if output_path:
        matrix_df.to_csv(output_path, index=False)
        print(f"Matrix saved as {output_path}")

    return matrix_df

# Example usage: at most s earlier same-cluster events per selected earthquake.
# The function asks interactively for the CSV destination.
s = 10
earthquake_matrix = create_earthquake_matrix(
    df,
    selected_earthquakes,
    s,
    magnitude_column='Mw',
)

# Print the resulting matrix.
print(earthquake_matrix)


    0      1        2        3        4        5        6        7        8   \
0    1   1910    687.0    112.0    716.0    714.0   1555.0    195.0    701.0   
1    1    291     39.0     60.0      9.0     40.0      NaN      NaN      NaN   
2    1    120      NaN      NaN      NaN      NaN      NaN      NaN      NaN   
3    1    275    252.0     14.0    186.0     11.0    198.0     66.0    255.0   
4    1    489    317.0    218.0      6.0    217.0    391.0    302.0     24.0   
..  ..    ...      ...      ...      ...      ...      ...      ...      ...   
95   0  64093  23004.0   6737.0  48396.0  19450.0  13492.0  31888.0  19570.0   
96   0  56144  12980.0  30435.0  27031.0   6655.0  31372.0  25277.0  12441.0   
97   0   1680   1206.0   1160.0    206.0    256.0   1414.0    901.0    899.0   
98   0  54958  26950.0  23294.0  39556.0  38822.0  32351.0  18266.0  33241.0   
99   0  56217  39213.0   2491.0   5031.0   4923.0  33090.0   4887.0  11613.0   

         9        10       11  
0     5