Model

In [318]:
# Import libraries
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import os
import glob

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

Settings

In [319]:
catalog_lunar_dir = "./data/lunar/data/training/catalogs/apollo12_catalog_GradeA_final.csv"
data_dir = "./data/lunar/data/training/data/S12_GradeA/Filtered"
chunk_size = 10000

Enter Raw Test File Inof:


In [320]:
raw_data_dir = "./data/lunar/data/test/data/Filt_S16_GradeA"
file_name = "xa.s16.00.mhz.1977-06-02HR00_evid00255.csv"

Functions

In [321]:
#Process single lunar data

def process_lunar_data(catalog_path, data_dir, row_num, chunk_size):
    # Read the catalog file
    catalog_lunar = pd.read_csv(catalog_path)

    # Extract row and relevant data
    row = catalog_lunar.iloc[row_num]
    arrival_time = datetime.strptime(row["time_abs(%Y-%m-%dT%H:%M:%S.%f)"], '%Y-%m-%dT%H:%M:%S.%f')
    arrival_time_relative = row["time_rel(sec)"]
    test_filename = row.filename

    # Read the associated CSV file containing raw data
    csv_file = f'{data_dir}/{test_filename}.csv'
    raw_data = pd.read_csv(csv_file)

    raw_data = raw_data[raw_data['cluster_ids'] != 0]

    results_df = pd.DataFrame(columns=["chunk", "label"])
    
    for cluster_id in raw_data['cluster_ids'].unique():
        # Get the data corresponding to the current cluster
        cluster_data = raw_data[raw_data['cluster_ids'] == cluster_id]
        
        # Determine the start and end time of the cluster (in relative time)
        cluster_start = cluster_data['time_rel(sec)'].min()
        cluster_end = cluster_data['time_rel(sec)'].max()

        if arrival_time_relative >= cluster_start and arrival_time_relative <= cluster_end:
            results_df = pd.concat([results_df, pd.DataFrame({"chunk": [cluster_data], "label": [1]})], ignore_index=True)
        else:
            results_df = pd.concat([results_df, pd.DataFrame({"chunk": [cluster_data], "label": [0]})], ignore_index=True)
        # # Initialize the cluster label to 0 (non-quake)
        # raw_data.loc[raw_data['cluster_id'] == cluster_id, 'quake_cluster'] = 0

        # # Loop through matching events to check if they fall within the cluster's time range
        # for _, event_row in matching_events.iterrows():
        #     event_time = event_row['time_rel']
            
        #     if is_time_in_cluster(cluster_start, cluster_end, event_time):
        #         # If the event time falls within the cluster range, mark the cluster as a quake
        #         raw_data.loc[raw_data['cluster_id'] == cluster_id, 'quake_cluster'] = 1

    # Initialize the results dataframe
    # total_rows = raw_data.shape[0]
    # results_df = pd.DataFrame(columns=["chunk", "label"])



    # Iterate over chunks of data
    # start = 0
    # while start < total_rows:
    #     end = min(start + chunk_size, total_rows)  # Handle case where we don't have a full chunk at the end
    #     chunk = raw_data.iloc[start:end]
    #     data_df = pd.DataFrame(chunk["time_rel(sec)"])

    #     # Check if arrival time is within the current chunk
    #     if arrival_time_relative >= data_df["time_rel(sec)"].values.min() and arrival_time_relative <= data_df["time_rel(sec)"].values.max():
    #         # Split chunk at the arrival_time_relative
    #         before_arrival = chunk[data_df["time_rel(sec)"] < arrival_time_relative]
    #         after_arrival = chunk[data_df["time_rel(sec)"] >= arrival_time_relative]

    #         # Add the part before arrival_time_relative with label 0
    #         if not before_arrival.empty:
    #             results_df = pd.concat([results_df, pd.DataFrame({"chunk": [before_arrival], "label": [0]})], ignore_index=True)

    #         # Add a chunk starting from the arrival_time_relative, ensure it has chunk_size rows
    #         after_start = after_arrival.index[0]  # Start from the first row after the arrival time
    #         after_end = min(after_start + chunk_size, total_rows)  # Ensure the chunk has exactly chunk_size rows
    #         after_chunk = raw_data.iloc[after_start:after_end]

    #         results_df = pd.concat([results_df, pd.DataFrame({"chunk": [after_chunk], "label": [1]})], ignore_index=True)

    #         # Move the start index beyond this chunk (chunk_size after the arrival time)
    #         start = after_end
    #     else:
    #         # If no arrival time in this chunk, label the entire chunk as 0
    #         results_df = pd.concat([results_df, pd.DataFrame({"chunk": [chunk], "label": [0]})], ignore_index=True)
    #         start += chunk_size  # Move to the next chunk

    # # If there are remaining rows less than the chunk size, add them with the appropriate label
    # if start < total_rows:
    #     remaining_chunk = raw_data.iloc[start:total_rows]
    #     remaining_label = 1 if arrival_time_relative >= remaining_chunk["time_rel(sec)"].values.min() else 0
    #     results_df = pd.concat([results_df, pd.DataFrame({"chunk": [remaining_chunk], "label": [remaining_label]})], ignore_index=True)

    return results_df







# Process entire catalog

def process_entire_catalog(catalog_dir, data_dir, chunk_size):
    try:
        catalog_lunar = pd.read_csv(catalog_dir)
        testing_df = pd.DataFrame(columns=["chunk", "label"])  # Initialize an empty dataframe to store results
        
        # Iterate over every row in the catalog
        for row_num in range(len(catalog_lunar)):
            # Process each row using process_lunar_data and append the result to testing_df
            results_df = process_lunar_data(catalog_dir, data_dir, row_num, chunk_size)
            testing_df = pd.concat([testing_df, results_df], ignore_index=True)
    
    except FileNotFoundError:
        print("File not found")
    
    return testing_df






# extracting the features

def extract_features_from_chunk(chunk):
    features = {}
    
    
    
    # Velocity features
    # features['mean_velocity'] = chunk["mov_avg_clamped"].mean()
    
    # features['max_velocity'] = chunk["mov_avg_clamped"].max()
    # features['min_velocity'] = chunk["mov_avg_clamped"].min()
    # features['std_velocity'] = chunk["mov_avg_clamped"].std()
    # features['range_velocity'] = features['max_velocity'] - features['min_velocity']
    
    # # Energy features
    # features['total_energy'] = (chunk["mov_avg_clamped"] ** 2).sum()
     # # Frequency/Oscillation features (zero crossings)
    signs = np.sign(chunk["mov_avg_clamped"])
    signs[signs == 0] = -1  # Treat zeros as negative to prevent false crossings

    # Find zero crossings
    zero_crossings = np.where(np.diff(signs))[0]

    # Calculate zero-crossing rate
    features['zero_crossing_rate'] = len(zero_crossings) / len(chunk)

    # rms_velocity = np.sqrt(np.mean(chunk["mov_avg_clamped"]**2))
    # features["rms_velocity"] = rms_velocity
    


    # mean_abs_velocity = np.mean(np.abs(chunk["mov_avg_clamped"]))
    # features["mean_abs_velocity"] = mean_abs_velocity
    
    # total_energy = (chunk['mov_avg_clamped'] ** 2).sum()
    # features["total_energy"] = total_energy



   
    
    
    
    return features





# converting to suitable

def convert_to_polished_df(testing_df):
    polished_data = []

    for idx, row in testing_df.iterrows():
        chunk = row["chunk"]
        label = row["label"]
        
        # Extract statistical features from chunk
        features = extract_features_from_chunk(chunk)
        features['label'] = label  # Add the label for quake/no-quake
        
        # Append to polished data
        polished_data.append(features)

    # Convert to a new DataFrame
    polished_df = pd.DataFrame(polished_data)
    return polished_df



def process_test_file_for_model(file_name, chunk_size):
    # Path to the CSV file
    csv_file = f"{file_name}"
    
    # Read the raw data from the CSV file
    raw_data = pd.read_csv(csv_file)
    total_rows = raw_data.shape[0]

    # Lists to store statistics for each chunk and the chunks themselves
    stats_list = []
    chunks_list = []  # List to store the chunks

    # Iterate over chunks of data in the file
    start = 0
    while start < total_rows:
        end = min(start + chunk_size, total_rows)
        chunk = raw_data.iloc[start:end]

        # Store the current chunk in the chunks list
        chunks_list.append(chunk)

        # Calculate statistical values from the chunk
        # mean_velocity = chunk['velocity(m/s)'].mean()
        
        # max_velocity = chunk['velocity(m/s)'].max()
        # min_velocity = chunk['velocity(m/s)'].min()
        # std_velocity = chunk['velocity(m/s)'].std()
        # range_velocity = max_velocity - min_velocity
        # total_energy = (chunk['velocity(m/s)'] ** 2).sum()
        # rms_velocity = (chunk['velocity(m/s)'] ** 2).mean() ** 0.5

        # Calculate zero crossing rate
        zero_crossings = np.where(np.diff(np.sign(chunk["velocity(m/s)"])))[0]
        zero_crossing_rate = len(zero_crossings)  # Count of zero crossings


        # Add the calculated statistics to the stats list
        stats_list.append({
            # 'mean_velocity': mean_velocity,
            
            # 'max_velocity': max_velocity,
            # 'min_velocity': min_velocity,
            # 'std_velocity': std_velocity,
            # 'range_velocity': range_velocity,
            # 'total_energy': total_energy,
            # 'rms_velocity': rms_velocity,
            'zero_crossing_rate': zero_crossing_rate
        })

        # Move to the next chunk
        start += chunk_size

    # Create DataFrames from the lists
    stats_df = pd.DataFrame(stats_list)  # DataFrame for statistics
    chunks_df = pd.DataFrame({'chunks': chunks_list})  # DataFrame for chunks

    return stats_df, chunks_df

# Process raw input into ready for model to read
def process_raw_file_for_model(raw_data_dir, file_name, chunk_size):
    # Path to the CSV file
    csv_file = f"{raw_data_dir}/{file_name}"
    
    # Read the raw data from the CSV file
    raw_data = pd.read_csv(csv_file)
    total_rows = raw_data.shape[0]

    # Lists to store statistics for each chunk and the chunks themselves
    stats_list = []
    chunks_list = []  # List to store the chunks

    # Iterate over chunks of data in the file
    start = 0
    while start < total_rows:
        end = min(start + chunk_size, total_rows)
        chunk = raw_data.iloc[start:end]

        # Store the current chunk in the chunks list
        chunks_list.append(chunk)

        # Calculate statistical values from the chunk
        # mean_velocity = chunk['velocity(m/s)'].mean()
        
        # max_velocity = chunk['velocity(m/s)'].max()
        # min_velocity = chunk['velocity(m/s)'].min()
        # std_velocity = chunk['velocity(m/s)'].std()
        # range_velocity = max_velocity - min_velocity
        # total_energy = (chunk['velocity(m/s)'] ** 2).sum()
        # rms_velocity = (chunk['velocity(m/s)'] ** 2).mean() ** 0.5
        
        mean_abs_velocity = np.mean(np.abs(chunk['velocity(m/s)']))
        rms_velocity = np.sqrt(np.mean(chunk['velocity(m/s)'] ** 2))
        # Calculate zero crossing rate
        signs = np.sign(chunk["velocity(m/s)"])

        # Treat zeros as negative to avoid false zero crossings
        signs[signs == 0] = -1

        # Find zero crossings (where the sign changes)
        zero_crossings = np.where(np.diff(signs))[0]

        # Calculate the zero-crossing rate (normalized by the chunk length)
        zero_crossing_rate = len(zero_crossings) / len(chunk["velocity(m/s)"])  # Count of zero crossings

        total_energy = (chunk['velocity(m/s)'] ** 2).sum()


        # Add the calculated statistics to the stats list
        stats_list.append({
            # 'mean_velocity': mean_velocity,
            
            # 'max_velocity': max_velocity,
            # 'min_velocity': min_velocity,
            # 'std_velocity': std_velocity,
            # 'range_velocity': range_velocity,
            # 'total_energy': total_energy,
            # 'rms_velocity': rms_velocity,
            'zero_crossing_rate': zero_crossing_rate,
            "rms_velocity": rms_velocity,
            "mean_abs_velocity": mean_abs_velocity,
            "total_energy": total_energy
        })

        # Move to the next chunk
        start += chunk_size

    # Create DataFrames from the lists
    stats_df = pd.DataFrame(stats_list)  # DataFrame for statistics
    chunks_df = pd.DataFrame({'chunks': chunks_list})  # DataFrame for chunks

    return stats_df, chunks_df


# Plot Input Data

def plot_data(data_dir, file_name):
    
    test_filename = file_name
    csv_file = f'{data_dir}/{test_filename}'
    data = pd.read_csv(csv_file)

    csv_times = np.array(data["time_rel(sec)"].tolist())
    csv_velocity = np.array(data["velocity(m/s)"].tolist())

    # Plot the trace!
    fig,ax = plt.subplots(1,1,figsize=(10,3))
    ax.plot(csv_times,csv_velocity)
    # Make the plot pretty
    ax.set_xlim([min(csv_times),max(csv_times)])
    ax.set_ylabel('Velocity (m/s)')
    ax.set_xlabel('Time (s)')
    ax.set_title(f'{test_filename}', fontweight='bold')
    

Training Data

In [322]:
testing_chunks = process_entire_catalog(catalog_lunar_dir, data_dir, chunk_size)
# polished_df = convert_to_polished_df(testing_chunks)

label_counts = testing_chunks["label"].value_counts()
print(label_counts)

count_label_1 = label_counts.get(1, 0)

count_label_1

File not found
label
0    20
Name: count, dtype: int64


0

In [323]:
testing_chunks

Unnamed: 0,chunk,label
0,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
1,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
2,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
3,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
4,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
5,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
6,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
7,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
8,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0
9,Empty DataFrame Columns: [time_abs(%Y-%m-%dT%H...,0


In [324]:
# testing_chunks[testing_chunks["label"] == 1]

Model Implementation

In [325]:
#Option 1 - Using LR

# from sklearn.linear_model import LogisticRegression

# # Split the dataset into features (X) and target labels (y)
# X = testing_chunks.drop(columns=['label'])  # Features
# y = testing_chunks['label']  # Target labels (1 for quake, 0 for non-quake)


# # Split data into training and testing sets (80% training, 20% testing)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)


# # Initialize the Random Forest Classifier
# clf = LogisticRegression( max_iter=1000, random_state=42)

# # Train the model
# clf.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = clf.predict(X_test)

# # Evaluate the model
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))





In [326]:
# Option 2 - Balancing the data out more

# df = testing_chunks

# # Split the DataFrame into two separate DataFrames for each class
# df_majority = df[df['label'] == 0]
# df_minority = df[df['label'] == 1]

# # Specify the number of majority class instances you want to keep
# # Here we keep the same number as minority class instances
# n_minority = len(df_minority)
# df_majority_undersampled = df_majority.sample(n=n_minority, random_state=42)  # Randomly sample from majority class

# # Combine the undersampled majority class with the minority class
# df_balanced = pd.concat([df_majority_undersampled, df_minority])

# # Shuffle the DataFrame to mix class labels
# df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# # Now `df_balanced` is your new training dataset with a balanced class distribution

# # Split the dataset into features (X) and target labels (y)
# X = df_balanced.drop(columns=['label'])  # Features
# y = df_balanced['label']  # Target labels (1 for quake, 0 for non-quake)


# # Split data into training and testing sets (80% training, 20% testing)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Initialize the Random Forest Classifier
# clf2 = RandomForestClassifier(n_estimators=100, random_state=42)

# # Train the model
# clf2.fit(X_train, y_train)

# # Make predictions on the test set
# y_pred = clf2.predict(X_test)

# # Evaluate the model
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))




In [327]:
# Option 3 - USED
print(testing_chunks)
# Split the dataset into features (X) and target labels (y)
X = testing_chunks.drop(columns=['label'])  # Features
y = testing_chunks['label']  # Target labels (1 for quake, 0 for non-quake)


X_test = pd.DataFrame(columns=["chunk"]) 


# Split data into training and testing sets (80% training, 20% testing)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
row = glob.glob(os.path.join(raw_data_dir, '*.csv'))
for i in row:
    print(i)
    X_sample, nope = process_test_file_for_model(i, chunk_size)
    X_test = pd.concat([X_test, X_sample], ignore_index=True)
    
print(X_test)

# Initialize the Random Forest Classifier
clf3 = RandomForestClassifier(class_weight="balanced", random_state=42)

# Train the model
clf3.fit(X, y)

# Make predictions on the test set
y_pred = clf3.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


                                                chunk label
0   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
1   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
2   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
3   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
4   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
5   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
6   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
7   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
8   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
9   Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
10  Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
11  Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
12  Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
13  Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
14  Empty DataFrame
Columns: [time_abs(%Y-%m-%dT%H...     0
15  Empty DataFrame
Columns: [time_abs(%

ValueError: setting an array element with a sequence.

In [None]:
f"{raw_data_dir}/{file_name}"

In [275]:
ready_testing_df, chuncks_testing_df = process_raw_file_for_model(raw_data_dir, file_name, chunk_size)

In [None]:
chuncks_testing_df

In [None]:
ready_testing_df

In [None]:
predicted_labels = clf2.predict(ready_testing_df)
predicted_labels

In [None]:
indices_of_ones = np.where(predicted_labels == 1)[0]
indices_of_ones

Important Chunks

In [None]:
def plot_chunk(chunk):
    csv_times = np.array(chunk["time_rel(sec)"].tolist())
    csv_velocities = np.array(chunk["velocity(m/s)"].tolist())

    # Plot the trace!
    fig,ax = plt.subplots(1,1,figsize=(10,3))
    ax.plot(csv_times,csv_velocities)
    # Make the plot pretty
    ax.set_xlim([min(csv_times),max(csv_times)])
    ax.set_ylabel('Velocity (m/s)')
    ax.set_xlabel('Time (s)')

for index in indices_of_ones:
    chunk_plot = chuncks_testing_df["chunks"].values[index]
    plot_chunk(chunk_plot)

Whole Data Set

In [None]:
plot_data(raw_data_dir, file_name)