In [100]:
''''!pip install scikit-learn
!pip install matplotlib
!pip install pandas 
!pip install numpy 
!pip install seaborn
!pip install imbalanced-learn
%matplotlib inline'''

"'!pip install scikit-learn\n!pip install matplotlib\n!pip install pandas \n!pip install numpy \n!pip install seaborn\n!pip install imbalanced-learn\n%matplotlib inline"

In [101]:
import pandas as pd
import numpy as np
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import GridSearchCV


<b>Preparing data<b>

In [102]:
df = pd.read_csv("./data/earthquakes.csv")
df = df.fillna(0)

print(df.shape)
df.dtypes

(7714, 23)


time                object
latitude           float64
longitude          float64
depth              float64
mag                float64
magType             object
nst                float64
gap                float64
dmin               float64
rms                float64
net                 object
id                  object
updated             object
place               object
type                object
horizontalError    float64
depthError         float64
magError           float64
magNst             float64
status              object
locationSource      object
magSource           object
Alert               object
dtype: object

In [103]:
typescat = {
    "earthquake":1,
    "volcanic eruption":2,
    "nuclear explosion":3
}
df['type_value'] = df['type'].map(typescat)
print(f"Columns name: {df.columns}")
df['Alert'].value_counts()

Columns name: Index(['time', 'latitude', 'longitude', 'depth', 'mag', 'magType', 'nst',
       'gap', 'dmin', 'rms', 'net', 'id', 'updated', 'place', 'type',
       'horizontalError', 'depthError', 'magError', 'magNst', 'status',
       'locationSource', 'magSource', 'Alert', 'type_value'],
      dtype='object')


Alert
green     7386
yellow     245
orange      51
red         32
Name: count, dtype: int64

depth: The depth at which the earthquake occurred, typically measured in kilometers below the Earth's surface.

mag: The magnitude of the earthquake, representing the energy released by the seismic event. In this case, a value of 8.6 indicates a very large earthquake.

dmin: The minimum distance between the earthquake's epicenter and the nearest seismic station, measured in degrees.

rms: The root mean square of the amplitude of the seismic waves, representing the strength of the seismic signal.

type: The type of event, such as "volcanic eruption" or "earthquake."


Convert Pandas data fram to a Numpy Array

In [104]:
X = df[['depth', 'mag', 'dmin', 'rms', 'type_value']].values

In [105]:
le = LabelEncoder()
y = le.fit_transform(df['Alert'])
for index, label in enumerate(le.classes_):
    print(f"{label} -> {index}")

green -> 0
orange -> 1
red -> 2
yellow -> 3


Normalize Data

In [106]:
X= preprocessing.StandardScaler().fit(X).transform(X.astype(float))

Train Test Split

In [107]:
#30%
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (5399, 5) (5399,)
Test set: (2315, 5) (2315,)


Sobremuestreo (para contrarrestar que hay muchos green)

In [108]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Verifica la nueva distribución de clases
print("SMOTE Distribution:", Counter(y_train_resampled))

SMOTE Distribution: Counter({np.int64(0): 5171, np.int64(3): 5171, np.int64(1): 5171, np.int64(2): 5171})


<b>K Nearest Neighbor Classification<b>

GridSearchCV to find the perfects hyperparameters 

In [109]:
param_grid = {
    'n_neighbors': range(1, 50),
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'manhattan', 'euclidean']
}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)

print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best hyperparameters: {'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'uniform'}
Best score: 0.9601140273486098


In [110]:
k = grid_search.best_params_.get('n_neighbors')
#train model 
neigh = KNeighborsClassifier(n_neighbors=k, weights=grid_search.best_params_.get('weights'), metric=grid_search.best_params_.get('metric')). fit(X_train_resampled,  y_train_resampled)


In [111]:
#Predict
yhat = neigh.predict(X_test)
print(f"Predicción:{yhat[211:221]}")
print(f"El y_test: {y_test[211:221]}")

Predicción:[0 0 0 2 0 0 0 0 0 0]
El y_test: [0 0 0 2 0 0 0 0 0 0]


In [112]:
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

Train set Accuracy:  0.9998147805149101
Test set Accuracy:  0.8665226781857451


XGBoost Classificaction

In [121]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [122]:
dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dtest = xgb.DMatrix(X_test, label=y_test)

In [198]:
params = {
    'objective':'multi:softmax',
    'max_depth':4,
    'num_class':4,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 1.0,
    'min_child_weight': 3
}

In [199]:
bst = xgb.train(params, dtrain, num_boost_round=50)

In [200]:
#predict
y_pred = bst.predict(dtest)
print(f"Predicción:{y_pred[211:221]}")
print(f"El y_test: {y_test[211:221]}")

Predicción:[0. 0. 0. 2. 0. 0. 1. 0. 0. 0.]
El y_test: [0 0 0 2 0 0 0 0 0 0]


In [201]:
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy:.4f}")

XGBoost Accuracy: 0.8078
