In [23]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.metrics import classification_report, f1_score
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [2]:

X_train = 'D:/Python/Hydraulic Rig Dataset/Data/X_train_pickled.pkl'
X_test = 'D:/Python/Hydraulic Rig Dataset/Data/X_test_pickled.pkl'
y_train = 'D:/Python/Hydraulic Rig Dataset/Data/y_train_pickled.pkl'
y_test = 'D:/Python/Hydraulic Rig Dataset/Data/y_test_pickled.pkl'

In [3]:
X_train = pd.read_pickle(X_train)
X_test = pd.read_pickle(X_test)
y_train= pd.read_pickle(y_train)
y_test = pd.read_pickle(y_test)


In [20]:
# Model under search: SMOTE oversampling followed by a liblinear logistic
# regression. SMOTE is a step of the pipeline rather than a one-off
# preprocessing call, so it is re-fitted as part of every fit the search runs.
# NOTE(review): no scaling step is present — presumably the pickled features
# are already scaled; confirm, since SMOTE's neighbour search is distance-based.
oversampler = SMOTE(random_state=1)
classifier = LogisticRegression(solver='liblinear', random_state=1)
pipeline = Pipeline(steps=[('smote', oversampler), ('clf', classifier)])

# Search space, addressed as <step name>__<parameter name>.
param_grid = dict(
    smote__k_neighbors=[3, 5, 7],   # neighbours used when synthesising samples
    clf__C=[0.01, 0.1, 1, 10],      # regularization strength
    clf__penalty=['l1', 'l2'],      # L1 vs L2 regularization
)

# Five shuffled, stratified folds with a fixed seed so the splits are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored with F1 and run on all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

In [21]:
# Run the hyper-parameter search on the training split.
grid.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated F1 score.
best_params, best_cv_f1 = grid.best_params_, grid.best_score_
print("Best parameters:", best_params)
print("Best CV F1:", best_cv_f1)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters: {'clf__C': 10, 'clf__penalty': 'l1', 'smote__k_neighbors': 5}
Best CV F1: 0.9477293180064219


In [24]:
def _report_split(split_name, model, X, y_true):
    """Print class-1 F1 and the full classification report for one data split.

    Parameters
    ----------
    split_name : str
        Label used as the prefix of the printed lines (e.g. "Train", "Test").
    model : fitted estimator exposing ``predict``
    X : features of the split
    y_true : ground-truth labels of the split

    Returns
    -------
    tuple
        ``(y_pred, f1_class1)`` — the predictions and the F1 score computed
        with ``pos_label=1``.
    """
    y_pred = model.predict(X)
    f1_class1 = f1_score(y_true, y_pred, pos_label=1)
    print(f"{split_name} F1 (class 1):", f1_class1)
    print(f"{split_name} classification report:")
    print(classification_report(y_true, y_pred))
    return y_pred, f1_class1


# 1) Training-set metrics
# These are computed on the same data the search was fitted on, so they are an
# optimistic reference point, not an estimate of generalisation.
y_train_pred, train_f1_class1 = _report_split("Train", grid, X_train, y_train)

# 2) Test-set metrics
y_test_pred, test_f1_class1 = _report_split("Test", grid, X_test, y_test)

Train F1 (class 1): 1.0
Train classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1159
           1       1.00      1.00      1.00       605

    accuracy                           1.00      1764
   macro avg       1.00      1.00      1.00      1764
weighted avg       1.00      1.00      1.00      1764

Test F1 (class 1): 0.9383561643835615
Test classification report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       290
           1       0.97      0.91      0.94       151

    accuracy                           0.96       441
   macro avg       0.96      0.95      0.95       441
weighted avg       0.96      0.96      0.96       441

