In [1]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

In [2]:
# Step 1: Load the moonquake type mapping CSV file
# The catalog has one row per recorded event; its `filename` column names the
# per-event recording and `mq_type` is the label used for training below.
mapping_file = 'archive/data/lunar/training/catalogs/apollo12_catalog_GradeA_final.csv'  # Replace with your path
mapping_df = pd.read_csv(mapping_file)

In [3]:
# Preview the first catalog rows to confirm the expected columns were loaded
mapping_df.head()

Unnamed: 0,filename,time_abs(%Y-%m-%dT%H:%M:%S.%f),time_rel(sec),evid,mq_type
0,xa.s12.00.mhz.1970-01-19HR00_evid00002,1970-01-19T20:25:00.000000,73500.0,evid00002,impact_mq
1,xa.s12.00.mhz.1970-03-25HR00_evid00003,1970-03-25T03:32:00.000000,12720.0,evid00003,impact_mq
2,xa.s12.00.mhz.1970-03-26HR00_evid00004,1970-03-26T20:17:00.000000,73020.0,evid00004,impact_mq
3,xa.s12.00.mhz.1970-04-25HR00_evid00006,1970-04-25T01:14:00.000000,4440.0,evid00006,impact_mq
4,xa.s12.00.mhz.1970-04-26HR00_evid00007,1970-04-26T14:29:00.000000,52140.0,evid00007,deep_mq


In [4]:
# Check column dtypes and non-null counts of the catalog
mapping_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   filename                        76 non-null     object 
 1   time_abs(%Y-%m-%dT%H:%M:%S.%f)  76 non-null     object 
 2   time_rel(sec)                   76 non-null     float64
 3   evid                            76 non-null     object 
 4   mq_type                         76 non-null     object 
dtypes: float64(1), object(4)
memory usage: 3.1+ KB


In [5]:
# Step 2: Load individual moonquake CSV files and process them
def process_moonquake_file(filepath):
    """Summarise one moonquake recording CSV as a flat feature dictionary.

    Parameters
    ----------
    filepath : str or path-like
        CSV file containing at least the absolute-time column
        ``time_abs(%Y-%m-%dT%H:%M:%S.%f)`` and the ``velocity(m/s)`` column.

    Returns
    -------
    dict or None
        Keys, in order: ``hour``, ``day``, ``month`` (taken from the first
        row's timestamp) followed by ``velocity_mean``, ``velocity_max``,
        ``velocity_min`` and ``velocity_std`` (computed over the whole file).
        ``None`` is returned, after printing a warning, when a required column
        is missing or the file has no data rows.
    """
    time_column = 'time_abs(%Y-%m-%dT%H:%M:%S.%f)'
    velocity_column = 'velocity(m/s)'

    # Load moonquake data
    df = pd.read_csv(filepath)

    # Both columns are required; report the first one that is absent
    for required_column in (time_column, velocity_column):
        if required_column not in df.columns:
            print(f"Warning: '{required_column}' column missing in file {filepath}")
            return None

    # A header-only file has no first row to take the timestamp from
    if df.empty:
        print(f"Warning: no data rows in file {filepath}")
        return None

    # Only the first timestamp feeds the features, so parse just that value
    # instead of converting the entire column.
    first_timestamp = pd.to_datetime(
        df[time_column].iloc[0], format='%Y-%m-%dT%H:%M:%S.%f'
    )
    velocity = df[velocity_column]

    # Key order matters: it becomes the feature-column order seen by the model
    return {
        'hour': first_timestamp.hour,
        'day': first_timestamp.day,
        'month': first_timestamp.month,
        'velocity_mean': velocity.mean(),
        'velocity_max': velocity.max(),
        'velocity_min': velocity.min(),
        'velocity_std': velocity.std(),
    }

In [6]:
# Directory where moonquake CSV files are stored
# Each catalog `filename` is resolved inside this directory with a '.csv' suffix.
moonquake_data_dir = 'archive/data/lunar/training/data/S12_GradeA'  # Replace with your directory

In [7]:
# Step 3: Merge moonquake data with the mapping
# Build one labelled feature record per catalog entry. Entries whose recording
# is missing or unusable are skipped rather than aborting the whole run.
data = []

for filename, mq_type in zip(mapping_df['filename'], mapping_df['mq_type']):
    # Load and process the corresponding moonquake file
    moonquake_filepath = os.path.join(moonquake_data_dir, filename + '.csv')
    if not os.path.exists(moonquake_filepath):
        print(f"Warning: recording file not found, skipping: {moonquake_filepath}")
        continue

    features = process_moonquake_file(moonquake_filepath)
    if features is None:
        # process_moonquake_file returns None (after printing its own warning)
        # when it cannot extract features; there is nothing to label.
        continue

    features['mq_type'] = mq_type
    data.append(features)

In [8]:
# Convert the list of dictionaries into a DataFrame
# (one row per processed recording; feature columns plus the `mq_type` label)
moonquake_df = pd.DataFrame(data)

In [9]:
# Preview the assembled feature table
moonquake_df.head()

Unnamed: 0,hour,day,month,velocity_mean,velocity_max,velocity_min,velocity_std,mq_type
0,0,19,1,-8.443134e-13,7.874026e-09,-8.185283e-09,3.530059e-10,impact_mq
1,0,25,3,-1.939339e-12,4.707866e-09,-4.603228e-09,3.86514e-10,impact_mq
2,0,26,3,-2.980386e-13,5.969005e-09,-6.144452e-09,3.219585e-10,impact_mq
3,0,25,4,-1.547089e-13,6.853803e-09,-6.155705e-09,3.383785e-10,impact_mq
4,0,26,4,-6.921802e-13,5.491012e-09,-4.475551e-09,3.009882e-10,deep_mq


In [10]:


# Step 4: Prepare data for classification
# X keeps every engineered feature column; y is the moonquake-type label.
X = moonquake_df.drop(columns=['mq_type'])
y = moonquake_df['mq_type']

In [11]:
# Split the data into training and testing sets
# 20% of the records are held out; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified, and the recorded report shows an
# imbalanced test set (2 deep_mq vs 13 impact_mq) — consider `stratify=y` if
# every class has at least two samples; confirm before changing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Step 5: Train a Random Forest Classifier
# 100 trees, each limited to depth 10; random_state fixes the forest's randomness
# so that re-running the cell yields the same model.
clf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
clf.fit(X_train, y_train)

In [13]:
# Step 6: Evaluate the model
# Per-class precision/recall/F1 on the held-out split.
# NOTE(review): in the recorded output deep_mq scores 0.00 on every metric, i.e.
# the model never predicted deep_mq on the test split; the 0.87 accuracy comes
# entirely from the majority impact_mq class.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     deep_mq       0.00      0.00      0.00         2
   impact_mq       0.87      1.00      0.93        13

    accuracy                           0.87        15
   macro avg       0.43      0.50      0.46        15
weighted avg       0.75      0.87      0.80        15



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Preview the held-out feature rows (the index refers back to moonquake_df rows)
X_test.head()

Unnamed: 0,hour,day,month,velocity_mean,velocity_max,velocity_min,velocity_std
4,0,26,4,-6.921802e-13,5.491012e-09,-4.475551e-09,3.009882e-10
63,0,26,4,-3.612109e-13,3.2952e-09,-3.220845e-09,2.676057e-10
10,0,24,10,-1.523585e-12,1.507133e-08,-1.707915e-08,4.220413e-10
0,0,19,1,-8.443134e-13,7.874026e-09,-8.185283e-09,3.530059e-10
35,0,17,7,9.046443e-13,3.395373e-08,-3.406026e-08,1.616808e-09


In [15]:
# Persist the trained classifier to disk so it can be reloaded without retraining
joblib.dump(clf, 'moonquake_classifier.joblib')

['moonquake_classifier.joblib']

In [16]:
# Reload the persisted classifier; the prediction helpers below use this copy.
# NOTE(review): joblib files are pickle-based — only load files you trust.
clf_loaded = joblib.load('moonquake_classifier.joblib')

In [17]:
# Extract features from a single recording under the S16 test directory.
# NOTE(review): the classifier was fitted on files from the S12 training
# directory — confirm that applying it to S16 recordings is intended.
data_test = process_moonquake_file('archive/data/lunar/test/data/S16_GradeA/xa.s16.00.mhz.1974-12-12HR02_evid00168.csv')

In [18]:
# Wrap the single feature dict in a one-row DataFrame, the tabular form passed to predict()
data_test_df = pd.DataFrame([data_test])
data_test_df.head()

Unnamed: 0,hour,day,month,velocity_mean,velocity_max,velocity_min,velocity_std
0,2,12,12,-6.411936e-13,2.390841e-09,-2.018262e-09,2.044125e-10


In [19]:
# Predict the moonquake type for the single recording.
# Note: this rebinds `y_pred`, which previously held the test-split predictions.
y_pred = clf_loaded.predict(data_test_df)

In [20]:
# Show the predicted label array for the single recording
y_pred

array(['impact_mq'], dtype=object)

In [21]:
def predict_moonquake_type(filepath):
    """Predict the moonquake type of a single recording CSV.

    Parameters
    ----------
    filepath : str or path-like
        Recording file passed to ``process_moonquake_file``.

    Returns
    -------
    The label predicted by ``clf_loaded`` for that recording.

    Raises
    ------
    ValueError
        If ``process_moonquake_file`` returns ``None``, i.e. no features could
        be extracted from the file.
    """
    features = process_moonquake_file(filepath)
    if features is None:
        # Fail with a message naming the file instead of letting a one-cell
        # DataFrame of None reach the classifier.
        raise ValueError(f"Could not extract features from {filepath}")

    # predict() returns an array with one entry per input row; unwrap the single label
    return clf_loaded.predict(pd.DataFrame([features]))[0]

In [22]:
predict_moonquake_type('archive/data/lunar/test/data/S16_GradeA/xa.s16.00.mhz.1977-04-17HR00_evid00249.csv')

'impact_mq'

In [23]:
def predict_a_folder(folderpath):
    """Predict the moonquake type of every ``.csv`` file directly inside a folder.

    Parameters
    ----------
    folderpath : str or path-like
        Directory to scan; entries not ending in ``.csv`` are ignored.

    Returns
    -------
    dict
        Maps each CSV file name (not the full path) to the label returned by
        ``predict_moonquake_type``, in the order ``os.listdir`` yields them.
    """
    return {
        entry: predict_moonquake_type(os.path.join(folderpath, entry))
        for entry in os.listdir(folderpath)
        if entry.endswith('.csv')
    }

In [24]:
# Classify every recording in the S12 directory.
# NOTE(review): this is the same directory the training data was read from, so
# these predictions are not an independent evaluation of the model.
predict_a_folder('archive/data/lunar/training/data/S12_GradeA')

{'xa.s12.00.mhz.1973-01-18HR00_evid00088.csv': 'impact_mq',
 'xa.s12.00.mhz.1971-10-20HR00_evid00044.csv': 'impact_mq',
 'xa.s12.00.mhz.1974-02-07HR00_evid00137.csv': 'impact_mq',
 'xa.s12.00.mhz.1970-01-19HR00_evid00002.csv': 'impact_mq',
 'xa.s12.00.mhz.1975-05-04HR00_evid00192.csv': 'impact_mq',
 'xa.s12.00.mhz.1973-07-28HR00_evid00120.csv': 'impact_mq',
 'xa.s12.00.mhz.1970-07-20HR00_evid00010.csv': 'impact_mq',
 'xa.s12.00.mhz.1972-07-17HR00_evid00068.csv': 'impact_mq',
 'xa.s12.00.mhz.1975-06-24HR00_evid00196.csv': 'impact_mq',
 'xa.s12.00.mhz.1973-07-20HR00_evid00117.csv': 'impact_mq',
 'xa.s12.00.mhz.1974-02-12HR00_evid00138.csv': 'impact_mq',
 'xa.s12.00.mhz.1970-12-27HR00_evid00019.csv': 'impact_mq',
 'xa.s12.00.mhz.1970-07-20HR00_evid00011.csv': 'impact_mq',
 'xa.s12.00.mhz.1970-10-24HR00_evid00014.csv': 'impact_mq',
 'xa.s12.00.mhz.1973-06-27HR00_evid00112.csv': 'impact_mq',
 'xa.s12.00.mhz.1970-09-26HR00_evid00013.csv': 'impact_mq',
 'xa.s12.00.mhz.1974-04-08HR00_evid00141