In [24]:
from pathlib import Path
import os
import sys

sys.path.append(str(Path(os.getcwd()).parent))

In [25]:
import pandas as pd
import numpy as np
import tensorflow as tf
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

In [26]:
from sklearn.metrics import classification_report

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
data = pd.read_csv("../../data/neo_task.csv")
data.drop(columns=['id', 'name'], axis=1, inplace=True)
data.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous
0,0.016016,0.035813,56014.078517,1024333.0,26.1,False
1,0.030518,0.06824,7864.34806,32681860.0,24.7,False
2,0.055533,0.124177,55257.544508,65386360.0,23.4,False
3,0.019256,0.043057,41531.404722,12607960.0,25.7,False
4,0.139494,0.311918,67639.394481,71305900.0,21.4,False


In [29]:
null_columns = []
for i in data.columns:
    if len(data[data[i].isnull()]) > 0:
        null_columns.append(i)
print(null_columns)

['est_diameter_max', 'relative_velocity', 'absolute_magnitude']


In [30]:
def fill_empty_cell(column_name, df):
    if df.dtypes[column_name] == "float64":
        df[column_name] = df[column_name].fillna(df[column_name].mean())
    elif df.dtypes[column_name] == "int64":
        df[column_name] = df[column_name].fillna(df[column_name].median())
    else:
        df[column_name] = df[column_name].fillna(df[column_name].mode())


for name in null_columns:
    fill_empty_cell(name, data)

In [31]:
X_classification = np.array(data.drop(columns=['hazardous'], axis=1))
y_classification = np.array(data['hazardous'])

In [32]:
rus = RandomUnderSampler()
X, y = rus.fit_resample(X_classification, y_classification)
print(X.shape, y.shape)

(17680, 5) (17680,)


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, shuffle=True)

In [34]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [35]:
data2 = data.head()
data2["h"] = ["1", "1", "2", "1", "1"]
data2['h'], data2['est_diameter_max'] = data2['est_diameter_max'], data2['h']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2["h"] = ["1", "1", "2", "1", "1"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['h'], data2['est_diameter_max'] = data2['est_diameter_max'], data2['h']


In [36]:
data2

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,absolute_magnitude,hazardous,h
0,0.016016,1,56014.078517,1024333.0,26.1,False,0.035813
1,0.030518,1,7864.34806,32681860.0,24.7,False,0.06824
2,0.055533,2,55257.544508,65386360.0,23.4,False,0.124177
3,0.019256,1,41531.404722,12607960.0,25.7,False,0.043057
4,0.139494,1,67639.394481,71305900.0,21.4,False,0.311918


In [37]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

X = data2.drop(columns=['hazardous'], axis=1)
y = data2['hazardous']
encoder = OneHotEncoder()

scaler2 = StandardScaler()

t = ColumnTransformer([
    ('scaler', scaler2, ['est_diameter_min', 'h', 'relative_velocity', 'miss_distance', 'absolute_magnitude']),
    ('onehot', encoder, ['est_diameter_max']),
], remainder='passthrough')

a = t.fit_transform(X)
t.transform(X)

array([[-0.78894771, -0.78894771,  0.50183031, -1.2755361 ,  1.07781257,
         1.        ,  0.        ],
       [-0.47243097, -0.47243097, -1.83214412, -0.14052255,  0.25773779,
         1.        ,  0.        ],
       [ 0.07355623,  0.07355623,  0.46515864,  1.03202802, -0.50376022,
         0.        ,  1.        ],
       [-0.71824328, -0.71824328, -0.20019211, -0.86022978,  0.84350549,
         1.        ,  0.        ],
       [ 1.90606573,  1.90606573,  1.06534728,  1.24426041, -1.67529562,
         1.        ,  0.        ]])

# ------------------------------------------------------------------------------

In [38]:
knn_params = {
    'n_neighbors': np.arange(5, 11, 2),
    'p': np.array([1, 2])
}

knn = GridSearchCV(KNeighborsClassifier(metric='minkowski'), knn_params).fit(X_train, y_train)

In [39]:
knn.best_params_

{'n_neighbors': 9, 'p': 1}

In [40]:
# KNN pipeline

pipe_knn = Pipeline([('scaler', scaler), ('knn', knn.best_estimator_)])

In [41]:
print(classification_report(y_test, pipe_knn.predict(X_test)))
# pipe_knn.transform(X_test)

              precision    recall  f1-score   support

       False       0.95      0.76      0.85      2652
        True       0.80      0.96      0.88      2652

    accuracy                           0.86      5304
   macro avg       0.88      0.86      0.86      5304
weighted avg       0.88      0.86      0.86      5304



In [42]:
print(classification_report(y_test, knn.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

       False       0.25      0.00      0.00      2652
        True       0.50      1.00      0.67      2652

    accuracy                           0.50      5304
   macro avg       0.37      0.50      0.33      5304
weighted avg       0.37      0.50      0.33      5304



# ------------------------------------------------------------------------------

In [43]:
log_reg_grid = {
    'C': np.array([0.01, 0.05, 0.1, 0.5, 1, 5, 10])
}

grid_bagging = {
    'estimator': [knn.best_estimator_, 
                  GridSearchCV(LogisticRegression(), log_reg_grid).fit(X_train, y_train).best_estimator_],
    'n_estimators': np.arange(9, 15, 2),
}

bagging = GridSearchCV(BaggingClassifier(), grid_bagging).fit(X_train, y_train)

In [44]:
bagging.best_params_

{'estimator': KNeighborsClassifier(n_neighbors=9, p=1), 'n_estimators': 13}

In [45]:
# Bagging pipeline

pipe_bagging = Pipeline([('scaler', scaler), ('bagging', bagging.best_estimator_)])

In [46]:
print(classification_report(y_test, pipe_bagging.predict(X_test)))

              precision    recall  f1-score   support

       False       0.96      0.77      0.85      2652
        True       0.81      0.97      0.88      2652

    accuracy                           0.87      5304
   macro avg       0.88      0.87      0.87      5304
weighted avg       0.88      0.87      0.87      5304



# ------------------------------------------------------------------------------

In [47]:
classification_model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation="relu", input_shape=(5,)),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(2, activation="softmax"),
    ]
)

classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.05), loss="sparse_categorical_crossentropy")
classification_model.fit(X_train, y_train, epochs=25, verbose=None)

<keras.callbacks.History at 0x2a7754b8e20>

In [48]:
def decision_function(X: np.ndarray) -> np.ndarray:
    return np.array([np.argmax(pred) for pred in X])

In [49]:
# Dence pipeline
pipe_dence = Pipeline([('scaler', scaler), ('dence', classification_model)])

# ------------------------------------------------------------------------------

In [50]:
from joblib import dump, load

In [51]:
dump(pipe_knn, 'pipe_knn.pkl') 

['pipe_knn.pkl']

In [52]:
dump(pipe_bagging, 'pipe_bagging.pkl') 

['pipe_bagging.pkl']

In [53]:
dump(pipe_dence, 'pipe_dence.h5') 

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\dense_2
......vars
.........0
.........1
...layers\dense_3
......vars
.........0
.........1
...layers\dropout
......vars
...layers\dropout_1
......vars
...metrics\mean
......vars
.........0
.........1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-06-27 09:51:39         2955
metadata.json                                  2023-06-27 09:51:39           64
variables.h5                                   2023-06-27 09:51:39        64760


['pipe_dence.h5']