In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from modAL.models import ActiveLearner
from modAL.uncertainty import entropy_sampling

from tqdm import tqdm

In [45]:
# Load the dataset from the CSV file in the working directory.
data = pd.read_csv('fresh.csv')

# Predictors: every column except 'Machine failure' and the TWF / HDF / PWF /
# OSF / RNF columns, which are dropped here and not used as targets.
features = data.drop(columns=['Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'])
# Target: the single 'Machine failure' column.
labels = data['Machine failure']

# Cast everything to float64 so the later NumPy conversion yields homogeneous
# arrays. Note that the labels become 0.0 / 1.0, which is why the
# classification-report keys further down are the strings '0.0' and '1.0'.
features = features.astype(np.float64)
labels = labels.astype(np.float64)

# Quick sanity check of the loaded data: first rows and total row count.
print(features.head())
print(labels.head())
print(len(features))

   Air temperature [K]  Process temperature [K]  Rotational speed [rpm]  \
0            -0.952342                -0.947313                0.068182   
1            -0.902348                -0.879915               -0.729435   
2            -0.952342                -1.014710               -0.227438   
3            -0.902348                -0.947313               -0.589992   
4            -0.902348                -0.879915               -0.729435   

   Torque [Nm]  Tool wear [min]  Type_H  Type_L  Type_M  
0     0.282186         0.000000     0.0     0.0     1.0  
1     0.633276         0.011858     0.0     1.0     0.0  
2     0.944242         0.019763     0.0     1.0     0.0  
3    -0.048843         0.027668     0.0     1.0     0.0  
4     0.001313         0.035573     0.0     1.0     0.0  
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Machine failure, dtype: float64
10000


In [46]:
# Convert the pandas objects to plain NumPy arrays for the steps that follow.
# np.asarray accepts pandas objects as well as arrays, so re-running this cell
# after the conversion has already happened is a no-op rather than an
# AttributeError (an ndarray has no .to_numpy() method).
features = np.asarray(features)
labels = np.asarray(labels)


In [47]:

# Hold out half of the data as a fixed test set; the other half is the pool
# the active learner draws from. Stratified on the labels, seeded for
# reproducibility.
pool_test_split = train_test_split(
    features,
    labels,
    random_state=42,
    stratify=labels,
    test_size=0.5,
)
X_pool, X_test, y_pool, y_test = pool_test_split


In [48]:
# Number of pool samples that start out labelled.
n_initial = 1000

# Carve the labelled seed set out of the pool (stratified, seeded); whatever
# remains is the unlabeled pool the learner will query from.
seed_split = train_test_split(
    X_pool,
    y_pool,
    random_state=42,
    stratify=y_pool,
    train_size=n_initial,
)
X_initial, X_unlabeled, y_initial, y_unlabeled = seed_split


In [49]:
# Logistic regression with class 1 weighted three times as heavily as class 0.
base_model = LogisticRegression(class_weight={0: 1, 1: 3}, max_iter=1000)

# Active learner wrapping the model, fitted on the initial labelled set and
# selecting new samples by prediction entropy.
learn = ActiveLearner(
    estimator=base_model,
    query_strategy=entropy_sampling,
    X_training=X_initial,
    y_training=y_initial,
)

In [53]:
n_query = 20              # maximum number of query rounds
batch_size = 50           # samples labelled per round
decision_threshold = 0.3  # probability cut-off applied to the class-1 column

# Per-round evaluation results on the held-out test set.
metrics = {"acc": [], "confusion": [], "report": []}

# NOTE: this cell is stateful. Each round passes new samples to `learn.teach`
# and rebinds X_unlabeled / y_unlabeled to a smaller pool, so running the cell
# a second time continues from where the previous run stopped. Re-run the
# split and learner cells first to repeat the experiment from scratch.
for i in range(n_query):
    # Stop if not enough samples left in the unlabeled pool
    if len(X_unlabeled) < batch_size:
        print(f'Stopping at iteration {i + 1}: Only {len(X_unlabeled)} samples left in pool (need {batch_size})')
        break

    # Let the learner's query strategy pick the next batch, then reveal the
    # labels of the selected rows and feed them to the learner.
    query_idx, query_X = learn.query(X_pool=X_unlabeled, n_instances=batch_size)
    query_idx = np.asarray(query_idx).reshape(-1)
    query_y = y_unlabeled[query_idx]
    learn.teach(X=np.asarray(query_X), y=np.asarray(query_y))

    # Remove the newly labelled rows so they cannot be queried again.
    X_unlabeled = np.delete(X_unlabeled, query_idx, axis=0)
    y_unlabeled = np.delete(y_unlabeled, query_idx, axis=0)

    # Column 1 of predict_proba is used as the positive-class probability;
    # a sample is predicted positive when it exceeds the threshold.
    y_prob = learn.predict_proba(X_test)[:, 1]
    y_pred = (y_prob > decision_threshold).astype(int)

    # Compute each metric once and reuse it for both storage and display.
    acc = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    metrics["acc"].append(acc)
    metrics["confusion"].append(confusion)
    metrics["report"].append(classification_report(y_test, y_pred, output_dict=True))
    print(f'Accuracy after query {i + 1}: {acc:.4f}')
    print(confusion)
    print(classification_report(y_test, y_pred))

Accuracy after query 1: 0.9476
[[4645  186]
 [  76   93]]
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97      4831
         1.0       0.33      0.55      0.42       169

    accuracy                           0.95      5000
   macro avg       0.66      0.76      0.69      5000
weighted avg       0.96      0.95      0.95      5000

Accuracy after query 2: 0.9476
[[4645  186]
 [  76   93]]
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97      4831
         1.0       0.33      0.55      0.42       169

    accuracy                           0.95      5000
   macro avg       0.66      0.76      0.69      5000
weighted avg       0.96      0.95      0.95      5000

Accuracy after query 3: 0.9476
[[4645  186]
 [  76   93]]
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97      4831
         1.0       0.33      0.55      0.42       169

    accu

In [55]:
# Highest test-set accuracy recorded across the query rounds.
best_accuracy = max(metrics["acc"])
print("Best accuracy:", best_accuracy)

Best accuracy: 0.9478


In [58]:
# Dump the whole metrics dict: the accuracy list plus every stored confusion
# matrix and classification-report dict (verbose; one entry per query round).
print(metrics)

{'acc': [0.9476, 0.9476, 0.9476, 0.9476, 0.9474, 0.9474, 0.9474, 0.9474, 0.9478, 0.9472, 0.9472, 0.9472, 0.9472, 0.9472, 0.9472, 0.9472, 0.9472, 0.9472, 0.9472, 0.9472], 'confusion': [array([[4645,  186],
       [  76,   93]], dtype=int64), array([[4645,  186],
       [  76,   93]], dtype=int64), array([[4645,  186],
       [  76,   93]], dtype=int64), array([[4645,  186],
       [  76,   93]], dtype=int64), array([[4645,  186],
       [  77,   92]], dtype=int64), array([[4645,  186],
       [  77,   92]], dtype=int64), array([[4645,  186],
       [  77,   92]], dtype=int64), array([[4645,  186],
       [  77,   92]], dtype=int64), array([[4646,  185],
       [  76,   93]], dtype=int64), array([[4643,  188],
       [  76,   93]], dtype=int64), array([[4643,  188],
       [  76,   93]], dtype=int64), array([[4643,  188],
       [  76,   93]], dtype=int64), array([[4643,  188],
       [  76,   93]], dtype=int64), array([[4643,  188],
       [  76,   93]], dtype=int64), array([[4643,  188

In [60]:
# Per-round report entries for the positive class (key '1.0' because the
# labels are floats), then the best recall / precision seen in any round.
positive_class_scores = [round_report['1.0'] for round_report in metrics['report']]
best_recall = max(scores['recall'] for scores in positive_class_scores)
best_precision = max(scores['precision'] for scores in positive_class_scores)
print("best recall", best_recall)
print("best precision", best_precision)

best recall 0.5502958579881657
best precision 0.3345323741007194
