In [None]:
!pip install "dask[dataframe]"


Collecting dask-expr<1.2,>=1.1 (from dask[dataframe])
  Downloading dask_expr-1.1.17-py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of dask-expr to determine which version is compatible with other requirements. This could take a while.
  Downloading dask_expr-1.1.16-py3-none-any.whl.metadata (2.5 kB)
Downloading dask_expr-1.1.16-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dask-expr
Successfully installed dask-expr-1.1.16


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import warnings

# End-to-end SECOM fault-detection experiment:
#   load -> encode labels -> split -> impute/scale (train statistics only)
#   -> grid-search a LightGBM classifier -> evaluate on the held-out test set.

# NOTE(review): this silences every warning for the rest of the session, including
# convergence/deprecation warnings that may matter; consider narrowing it.
warnings.filterwarnings("ignore")

# Load data
secom_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data'
secom_labels_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data'

# Both files are space-separated without a header row; missing sensor readings
# are stored as the literal string "NaN".
data = pd.read_csv(secom_data_url, sep=' ', header=None, na_values='NaN')
labels = pd.read_csv(secom_labels_url, sep=' ', header=None)

# Preprocess labels
labels.columns = ['label', 'timestamp']
# In the SECOM label file -1 marks a pass and 1 marks a fail. Encode the rare
# "fail" outcome as the positive class (1) and "pass" as 0, so that precision,
# recall and `is_unbalance` all refer to the class we actually want to detect.
labels['label'] = (labels['label'] == 1).astype(int)

# Train-test split FIRST, on the raw (un-imputed, un-scaled) features, so that no
# statistic computed below can see the held-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data,
    labels['label'],
    test_size=0.2,
    random_state=42,
    stratify=labels['label'],
)

# Handle missing values by mean imputation, using training-set means only.
# `data` itself is left unmodified so the cell is idempotent on re-run.
train_means = X_train_raw.mean()
X_train_imputed = X_train_raw.fillna(train_means)
X_test_imputed = X_test_raw.fillna(train_means)

# Feature scaling: fit on the training rows, then apply the same transform to test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imputed)
X_test = scaler.transform(X_test_imputed)

# Kept for any later cell that expects the full scaled matrix; it is built from
# training-set statistics only, so it does not reintroduce leakage.
data_scaled = scaler.transform(data.fillna(train_means))

# Define LightGBM with hyperparameter tuning
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    is_unbalance=True,
    subsample_freq=1,  # LightGBM ignores `subsample` unless bagging frequency is non-zero
    random_state=42,   # make bagging / feature sampling reproducible
    verbose=-1,        # suppress the per-fit [LightGBM] [Info] log flood
)

# Parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 7, 10],
    'num_leaves': [31, 50, 70],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Use GridSearchCV for hyperparameter tuning.
# ROC AUC is used for model selection because plain accuracy on a heavily
# imbalanced target is maximised by always predicting the majority class.
grid_search = GridSearchCV(lgb_model, param_grid, scoring='roc_auc', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model. GridSearchCV (refit=True by default) has already refitted it on the
# whole training set with the best parameters, so no extra .fit() call is needed.
best_model = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# target_names follow the sorted label order: 0 -> pass, 1 -> fail.
report = classification_report(y_test, y_pred, target_names=['pass', 'fail'])

print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Test Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Fitting 3 folds for each of 729 candidates, totalling 2187 fits
[LightGBM] [Info] Number of positive: 1170, number of negative: 83
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017911 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 101618
[LightGBM] [Info] Number of data points in the train set: 1253, number of used features: 468
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.933759 -> initscore=2.645918
[LightGBM] [Info] Start training from score 2.645918
[LightGBM] [Info] Number of positive: 1170, number of negative: 83
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009406 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 101618
[LightGBM] [Info] Number of data points in the train set: 1253, number of used features: 468
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.933759 -> initscore=2.645918
[LightGBM