In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'database1.csv'
data = pd.read_csv(file_path)

# Data Preprocessing

# Combine Date and Time into a single datetime column
data['Datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])

# Drop the original Date and Time columns
data = data.drop(columns=['Date', 'Time'])

# Fill missing values
data.fillna({
    'Depth Error': data['Depth Error'].mean(),
    'Magnitude Error': data['Magnitude Error'].mean(),
    'Azimuthal Gap': data['Azimuthal Gap'].mean(),
    'Horizontal Distance': data['Horizontal Distance'].mean(),
    'Horizontal Error': data['Horizontal Error'].mean(),
    'Root Mean Square': data['Root Mean Square'].mean(),
}, inplace=True)

# Drop columns with too many missing values or non-relevant features
data = data.drop(columns=['Depth Seismic Stations', 'Magnitude Seismic Stations', 'ID', 'Source', 'Location Source', 'Magnitude Source', 'Status'])

# Encode categorical features
le_type = LabelEncoder()
data['Type'] = le_type.fit_transform(data['Type'])

le_magnitude_type = LabelEncoder()
data['Magnitude Type'] = le_magnitude_type.fit_transform(data['Magnitude Type'].astype(str))

# Select features and target
X = data.drop(columns=['Type', 'Datetime'])
y = data['Type']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Model Training (RandomForestClassifier)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le_type.classes_)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


ValueError: time data "1975-02-23T02:58:41.000Z 1975-02-23T02:58:41.000Z" doesn't match format "%m/%d/%Y %H:%M:%S", at position 3378. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'database1.csv'
data = pd.read_csv(file_path)

# Data Preprocessing

# Combine Date and Time into a single datetime column
data['Datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'], errors='coerce')

# Drop the original Date and Time columns
data = data.drop(columns=['Date', 'Time'])

# Fill missing values in numeric columns with the column's mean
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    data[column].fillna(data[column].mean(), inplace=True)

# Drop columns with too many missing values or irrelevant features
data = data.drop(columns=[
    'Depth Seismic Stations', 'Magnitude Seismic Stations', 'ID', 
    'Source', 'Location Source', 'Magnitude Source', 'Status'
])

# Encode categorical features safely
# If there's any missing categorical data, fill it with a placeholder
data['Magnitude Type'] = data['Magnitude Type'].fillna('Unknown')

le_type = LabelEncoder()
data['Type'] = le_type.fit_transform(data['Type'])

le_magnitude_type = LabelEncoder()
data['Magnitude Type'] = le_magnitude_type.fit_transform(data['Magnitude Type'])

# Select features and target
X = data.drop(columns=['Type', 'Datetime'])  # Remove 'Type' (target) and 'Datetime' (not needed for training)
y = data['Type']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Model Training (RandomForestClassifier)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le_type.classes_)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


ValueError: Number of classes, 3, does not match size of target_names, 4. Try specifying the labels parameter

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'database1.csv'
data = pd.read_csv(file_path)

# Data Preprocessing

# Combine Date and Time into a single datetime column
data['Datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'], errors='coerce')

# Drop the original Date and Time columns
data = data.drop(columns=['Date', 'Time'])

# Fill missing values in numeric columns with the column's mean
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    data[column].fillna(data[column].mean(), inplace=True)

# Drop columns with too many missing values or irrelevant features
data = data.drop(columns=[
    'Depth Seismic Stations', 'Magnitude Seismic Stations', 'ID', 
    'Source', 'Location Source', 'Magnitude Source', 'Status'
])

# Encode categorical features safely
# If there's any missing categorical data, fill it with a placeholder
data['Magnitude Type'] = data['Magnitude Type'].fillna('Unknown')

le_type = LabelEncoder()
data['Type'] = le_type.fit_transform(data['Type'])

le_magnitude_type = LabelEncoder()
data['Magnitude Type'] = le_magnitude_type.fit_transform(data['Magnitude Type'])

# Select features and target
X = data.drop(columns=['Type', 'Datetime'])  # Remove 'Type' (target) and 'Datetime' (not needed for training)
y = data['Type']

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Model Training (RandomForestClassifier)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Get the unique classes from the test set to avoid mismatches
unique_classes = np.unique(y_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le_type.inverse_transform(unique_classes))

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)


Accuracy: 0.9997152619589977
Classification Report:
                   precision    recall  f1-score   support

       Earthquake       1.00      1.00      1.00      6976
        Explosion       0.00      0.00      0.00         1
Nuclear Explosion       0.96      1.00      0.98        47

         accuracy                           1.00      7024
        macro avg       0.65      0.67      0.66      7024
     weighted avg       1.00      1.00      1.00      7024



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
