In [3]:
# Import libraries
import numpy as np
import pandas as pd
from obspy import read
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import joblib

Categorizing Script

In [4]:
results_array = []

catalog_lunar_dir = "./data/lunar/data/training/catalogs/apollo12_catalog_GradeA_final.csv"

catalog_lunar = pd.read_csv(catalog_lunar_dir)

# Initialize an array to store results DataFrames for each catalog row
results_array = []

# Loop through each row in the catalog_lunar DataFrame
for index, row in catalog_lunar.iterrows():
    try:
    # Parse arrival time and relative arrival time
        arrival_time = datetime.strptime(row["time_abs(%Y-%m-%dT%H:%M:%S.%f)"], '%Y-%m-%dT%H:%M:%S.%f')
        arrival_time_relative = row["time_rel(sec)"]
        test_filename = row.filename
        data_dir = "./data/lunar/data/training/data/S12_GradeA/"
        csv_file = f'{data_dir}{test_filename}.csv'
        
        # Load the raw data from the corresponding CSV file
        raw_data = pd.read_csv(csv_file)

        chunk_size = 10000
        total_rows = raw_data.shape[0]

        # Create a DataFrame to hold results for the current row
        results_df = pd.DataFrame(columns=["chunk", "label"])

        # Process the raw data in chunks
        for start in range(0, total_rows, chunk_size):
            end = start + chunk_size

            chunk = raw_data.iloc[start:end]
            data_df = pd.DataFrame(chunk["time_rel(sec)"])
        
            # Categorize based on arrival time
            if arrival_time_relative < data_df["time_rel(sec)"].values.max() and arrival_time_relative > data_df["time_rel(sec)"].values.min():
                new_row = pd.DataFrame({"chunk": [chunk], "label": [1]})
            else:
                new_row = pd.DataFrame({"chunk": [chunk], "label": [0]})

            # Append new row to results DataFrame
            results_df = pd.concat([results_df, new_row], ignore_index=True)

        # Append the results DataFrame to the results array
        results_array.append(results_df)
    except FileNotFoundError:
        print(f"File not found for test filename: {test_filename}. Skipping this entry.")

File not found for test filename: xa.s12.00.mhz.1971-04-13HR00_evid00029. Skipping this entry.


In [5]:
results_array[4]

Unnamed: 0,chunk,label
0,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel(...,0
1,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0
2,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0
3,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0
4,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0
5,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0
6,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0
7,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0
8,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0
9,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...,0


In [6]:
testing_df = pd.concat(results_array, ignore_index=True)

In [7]:
label_counts = testing_df["label"].value_counts()

count_label_1 = label_counts.get(1, 0)

count_label_1

np.int64(75)

Test Data

In [55]:
# test_directory = './data/lunar/data/test/data/S15_GradeA/'
# row = glob.glob(os.path.join(test_directory, '*.mseed'))

test_directory = './data/lunar/data/training/data/S12_GradeA/'
cat_directory = './data/lunar/data/training/catalogs/'
cat_file = cat_directory + 'apollo12_catalog_GradeA_final.csv'
cat = pd.read_csv(cat_file)
cat

row = cat.iloc[2]
arrival_time = datetime.strptime(row['time_abs(%Y-%m-%dT%H:%M:%S.%f)'],'%Y-%m-%dT%H:%M:%S.%f')
arrival_time

# If we want the value of relative time, we don't need to use datetime
arrival_time_rel = row['time_rel(sec)']
arrival_time_rel

# Let's also get the name of the file
test_filename = row.filename
print(test_filename)

# test_filename = row[2]
# print(test_filename)

mseed_file = f'{test_directory}{test_filename}.mseed'
st = read(mseed_file)
print(st)

tr = st.traces[0].copy()
tr_times = tr.times()
tr_data = tr.data
print(tr_data)

test_df = pd.DataFrame(columns=["chunk"])
total_rows = tr_data.shape[0]

def extract_features(chunk):
    return [
        chunk['time_rel(sec)'].mean(),
        chunk['time_rel(sec)'].std(),
        chunk['time_rel(sec)'].min(),
        chunk['time_rel(sec)'].max()
    ]



for start in range(0, total_rows, chunk_size):
    end = start + chunk_size

    chunk = raw_data.iloc[start:end]

    data_df = pd.DataFrame(chunk["time_rel(sec)"])

    new_row = pd.DataFrame({"chunk": [chunk]})

    test_df = pd.concat([test_df, new_row], ignore_index=True)


test_df

xa.s12.00.mhz.1970-03-26HR00_evid00004
1 Trace(s) in Stream:
XA.S12.00.MHZ | 1970-03-26T00:00:00.565000Z - 1970-03-27T00:00:02.074434Z | 6.6 Hz, 572411 samples
[-2.82124634e-14 -3.52331707e-14 -3.85933435e-14 ... -8.55618968e-15
 -1.20576801e-14 -1.86280424e-14]


Unnamed: 0,chunk
0,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel(...
1,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...
2,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...
3,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...
4,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...
5,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...
6,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...
7,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...
8,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...
9,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_rel...


In [56]:
# Hold chunks and labels
X = []
y = []

for index, row in testing_df.iterrows():
    # Convert chunk to DataFrame and select the "time_rel(sec)" column
    chunk_data = row["chunk"]["time_rel(sec)"].to_numpy()

    # Ensure chunk_data is numeric, ignore non-numeric values
    chunk_data = pd.to_numeric(chunk_data, errors='coerce')

    # Extract features from chunk_data
    features = [
        np.nanmean(chunk_data),  # Mean of time_rel(sec), ignoring NaNs
        np.nanstd(chunk_data),   # Standard deviation, ignoring NaNs
        np.nanmin(chunk_data),   # Minimum value, ignoring NaNs
        np.nanmax(chunk_data),   # Maximum value, ignoring NaNs
    ]

    X.append(features)  # Append features
    y.append(row["label"])  # Append label

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

X

array([[  754.64150943,   435.73605004,     0.        ,  1509.28301887],
       [ 2264.0754717 ,   435.73605004,  1509.43396226,  3018.71698113],
       [ 3773.50943396,   435.73605004,  3018.86792453,  4528.1509434 ],
       ...,
       [83773.50943396,   435.73605004, 83018.86792453, 84528.1509434 ],
       [85282.94339623,   435.73605004, 84528.30188679, 86037.58490566],
       [86219.32075472,   104.88165872, 86037.73584906, 86400.90566038]])

With RandomForest

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a RandomForest classifier
clf = RandomForestClassifier(class_weight='balanced')

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2f}")

Z_test = np.array([extract_features(chunk) for chunk in test_df['chunk']])
y_pred = clf.predict(Z_test)
print(y_pred)

y_pred_proba = clf.predict_proba(X_test)
print(y_pred_proba)
# len(X_train)

Z_test


Model accuracy: 0.73
[0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0
 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0]
[[0.45549607 0.54450393]
 [0.64332516 0.35667484]
 [0.68673466 0.31326534]
 ...
 [0.67510525 0.32489475]
 [0.64237826 0.35762174]
 [1.         0.        ]]


array([[  754.64150943,   435.75783848,     0.        ,  1509.28301887],
       [ 2264.0754717 ,   435.75783848,  1509.43396226,  3018.71698113],
       [ 3773.50943396,   435.75783848,  3018.86792453,  4528.1509434 ],
       [ 5282.94339623,   435.75783848,  4528.30188679,  6037.58490566],
       [ 6792.37735849,   435.75783848,  6037.73584906,  7547.01886792],
       [ 8301.81132075,   435.75783848,  7547.16981132,  9056.45283019],
       [ 9811.24528302,   435.75783848,  9056.60377358, 10565.88679245],
       [11320.67924528,   435.75783848, 10566.03773585, 12075.32075472],
       [12830.11320755,   435.75783848, 12075.47169811, 13584.75471698],
       [14339.54716981,   435.75783848, 13584.90566038, 15094.18867925],
       [15848.98113208,   435.75783848, 15094.33962264, 16603.62264151],
       [17358.41509434,   435.75783848, 16603.77358491, 18113.05660377],
       [18867.8490566 ,   435.75783848, 18113.20754717, 19622.49056604],
       [20377.28301887,   435.75783848, 19622.64150

With LogisticRegression

In [58]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
log_reg = LogisticRegression(class_weight='balanced')

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2f}")

Z_test = np.array([extract_features(chunk) for chunk in test_df['chunk']])
y_pred = log_reg.predict(Z_test)
y_pred

Model accuracy: 0.45


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Testing the Model

In [59]:
joblib.dump(log_reg, "LR-QuakeModel.pkl")

data = pd.read_csv("")


FileNotFoundError: [Errno 2] No such file or directory: ''