In [1]:
# Import libraries
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import os


In [2]:
catalog_lunar_dir = "../data/lunar/data/training/catalogs/apollo12_catalog_GradeA_final.csv"
data_dir = "../data/lunar/data/training/data/Processed"
chunk_size = 20000

def process_lunar_data(catalog_path, data_dir, row_num, chunk_size):
    # Read the catalog file
    catalog_lunar = pd.read_csv(catalog_path)

    # Extract row and relevant data
    row = catalog_lunar.iloc[row_num]
    arrival_time = datetime.strptime(row["time_abs(%Y-%m-%dT%H:%M:%S.%f)"], '%Y-%m-%dT%H:%M:%S.%f')
    arrival_time_relative = row["time_rel(sec)"]
    test_filename = row.filename

    # Read the associated CSV file containing raw data
    csv_file = f'{data_dir}/{test_filename}.csv'
    raw_data = pd.read_csv(csv_file)

    # Initialize the results dataframe
    total_rows = raw_data.shape[0]
    results_df = pd.DataFrame(columns=["chunk", "label"])

    # Iterate over chunks of data
    start = 0
    while start < total_rows:
        end = min(start + chunk_size, total_rows)  # Handle case where we don't have a full chunk at the end
        chunk = raw_data.iloc[start:end]
        data_df = pd.DataFrame(chunk["time_rel(sec)"])

        # Check if arrival time is within the current chunk
        if arrival_time_relative >= data_df["time_rel(sec)"].values.min() and arrival_time_relative <= data_df["time_rel(sec)"].values.max():
            # Split chunk at the arrival_time_relative
            before_arrival = chunk[data_df["time_rel(sec)"] < arrival_time_relative]
            after_arrival = chunk[data_df["time_rel(sec)"] >= arrival_time_relative]

            # Add the part before arrival_time_relative with label 0
            if not before_arrival.empty:
                results_df = pd.concat([results_df, pd.DataFrame({"chunk": [before_arrival], "label": [0]})], ignore_index=True)

            # Add a chunk starting from the arrival_time_relative, ensure it has chunk_size rows
            after_start = after_arrival.index[0]  # Start from the first row after the arrival time
            after_end = min(after_start + chunk_size, total_rows)  # Ensure the chunk has exactly chunk_size rows
            after_chunk = raw_data.iloc[after_start:after_end]

            results_df = pd.concat([results_df, pd.DataFrame({"chunk": [after_chunk], "label": [1]})], ignore_index=True)

            # Move the start index beyond this chunk (chunk_size after the arrival time)
            start = after_end
        else:
            # If no arrival time in this chunk, label the entire chunk as 0
            results_df = pd.concat([results_df, pd.DataFrame({"chunk": [chunk], "label": [0]})], ignore_index=True)
            start += chunk_size  # Move to the next chunk

    # If there are remaining rows less than the chunk size, add them with the appropriate label
    if start < total_rows:
        remaining_chunk = raw_data.iloc[start:total_rows]
        remaining_label = 1 if arrival_time_relative >= remaining_chunk["time_rel(sec)"].values.min() else 0
        results_df = pd.concat([results_df, pd.DataFrame({"chunk": [remaining_chunk], "label": [remaining_label]})], ignore_index=True)

    return results_df

Process Entire Catalog

In [3]:
def process_entire_catalog(catalog_dir, data_dir, chunk_size):
    try:
        catalog_lunar = pd.read_csv(catalog_dir)
        testing_df = pd.DataFrame(columns=["chunk", "label"])  # Initialize an empty dataframe to store results
        
        # Iterate over every row in the catalog
        for row_num in range(len(catalog_lunar)):
            # Process each row using process_lunar_data and append the result to testing_df
            results_df = process_lunar_data(catalog_dir, data_dir, row_num, chunk_size)
            testing_df = pd.concat([testing_df, results_df], ignore_index=True)
    
    except FileNotFoundError:
        print("File not found")
    
    return testing_df

In [4]:
testing_df = process_entire_catalog(catalog_lunar_dir, data_dir, chunk_size)

File not found


In [5]:
label_counts = testing_df["label"].value_counts()
print(label_counts)

count_label_1 = label_counts.get(1, 0)

count_label_1

label
0    571
1     20
Name: count, dtype: int64


20

In [6]:
testing_df

Unnamed: 0,chunk,label
0,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_re...,0
1,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_re...,0
2,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_re...,0
3,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_re...,0
4,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_re...,0
...,...,...
586,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_r...,0
587,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_r...,0
588,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_r...,0
589,time_abs(%Y-%m-%dT%H:%M:%S.%f) time_r...,0


In [7]:
testing_df["chunk"].values[590]

Unnamed: 0,time_abs(%Y-%m-%dT%H:%M:%S.%f),time_rel(sec),velocity(m/s)
564905,1971,85268.679245,0.0
564906,1971,85268.830189,0.0
564907,1971,85268.981132,0.0
564908,1971,85269.132075,0.0
564909,1971,85269.283019,0.0
...,...,...,...
572422,1971,86403.320755,0.0
572423,1971,86403.471698,0.0
572424,1971,86403.622642,0.0
572425,1971,86403.773585,0.0


Extract Features from chunk

extracting features from chunk

In [8]:
def extract_features_from_chunk(chunk):
    features = {}
    
    
    
    # Velocity features
    features['mean_velocity'] = chunk["velocity(m/s)"].mean()
    features['median_velocity'] = chunk["velocity(m/s)"].median()
    features['max_velocity'] = chunk["velocity(m/s)"].max()
    features['min_velocity'] = chunk["velocity(m/s)"].min()
    features['std_velocity'] = chunk["velocity(m/s)"].std()
    features['range_velocity'] = features['max_velocity'] - features['min_velocity']
    
    # Energy features
    features['total_energy'] = (chunk["velocity(m/s)"] ** 2).sum()
    features['rms_velocity'] = np.sqrt((chunk["velocity(m/s)"] ** 2).mean())
    
    # Frequency/Oscillation features (zero crossings)
    zero_crossings = np.where(np.diff(np.sign(chunk["velocity(m/s)"])))[0]
    features['zero_crossing_rate'] = len(zero_crossings)
    
    
    
    return features

converting to suitable df

In [9]:
def convert_to_polished_df(testing_df):
    polished_data = []

    for idx, row in testing_df.iterrows():
        chunk = row["chunk"]
        label = row["label"]
        
        # Extract statistical features from chunk
        features = extract_features_from_chunk(chunk)
        features['label'] = label  # Add the label for quake/no-quake
        
        # Append to polished data
        polished_data.append(features)

    # Convert to a new DataFrame
    polished_df = pd.DataFrame(polished_data)
    return polished_df

In [10]:
polished_df = convert_to_polished_df(testing_df)
polished_df

Unnamed: 0,mean_velocity,median_velocity,max_velocity,min_velocity,std_velocity,range_velocity,total_energy,rms_velocity,zero_crossing_rate,label
0,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0
1,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0
2,7.500383e-13,0.0,1.509708e-09,-1.370997e-09,7.662188e-11,2.880705e-09,1.174236e-16,7.662363e-11,12,0
3,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0
4,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0
...,...,...,...,...,...,...,...,...,...,...
586,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0
587,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0
588,-3.189185e-12,0.0,1.869227e-09,-1.827074e-09,1.562159e-10,3.696301e-09,4.882469e-16,1.562445e-10,22,0
589,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0,0


In [11]:
polished_df.columns

Index(['mean_velocity', 'median_velocity', 'max_velocity', 'min_velocity',
       'std_velocity', 'range_velocity', 'total_energy', 'rms_velocity',
       'zero_crossing_rate', 'label'],
      dtype='object')

In [12]:
label_counts = polished_df["label"].value_counts()
print(label_counts)

count_label_1 = label_counts.get(1, 0)

count_label_1

label
0    571
1     20
Name: count, dtype: int64


20

In [13]:
type(polished_df)

pandas.core.frame.DataFrame