In [8]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Label mapping
label_map = {
    "data low rpm": "safe",
    "GAOCHENG DATA USING WORN OUT DRILL AT LOW RPM WITH WOOD": "unsafe",
    "high rpm data": "unsafe",
    "mal data at high rpm using bricks": "unsafe",
    "mal data at high rpm using job": "unsafe",
    "mal data at high rpm using newer drill bit using brick": "unsafe",
    "mal data at high rpm using newer drill bit using wood": "unsafe",
    "MAL DATA AT HIGH RPM": "unsafe",
    "mal data at low rpm using bricks": "unsafe",
    "mal data at low rpm using wood in new drill bit": "safe",
    "MAL DATA AT LOW RPM WITHOUT JOB": "unsafe",
    "mal data using new drill bit with brick at low rpm": "safe",
    "MAL DATA USING WOODEN JOB 125": "safe"
}

def get_label(name):
    name = name.strip().lower()
    return 1 if label_map.get(name, "unsafe") == "unsafe" else 0

data_folder = "./final_data"

# Storage
train_parts = []
per_file_test_data = {}

# Step 1: Split each file's data into 80% train / 20% test
for file in os.listdir(data_folder):
    if file.endswith(".csv"):
        path = os.path.join(data_folder, file)
        df = pd.read_csv(path, header=None, names=["col1", "col2", "col3"])
        file_key = file.replace(".csv", "").strip()
        df["label"] = get_label(file_key)
        df["source_file"] = file_key

        train_split, test_split = train_test_split(
            df, test_size=0.2, random_state=42, stratify=df["label"]
        )

        train_parts.append(train_split)
        per_file_test_data[file_key] = test_split


# Step 2: Combine all 80% training data from all files
train_df = pd.concat(train_parts, ignore_index=True)

# Step 3: Train global model
model = RandomForestClassifier(random_state=42)
model.fit(train_df[["col1", "col2", "col3"]], train_df["label"])

# Step 4: Evaluate on each file's 20% test data
from sklearn.metrics import confusion_matrix

results = []

for file_key, test_df in per_file_test_data.items():
    y_true = test_df["label"]
    y_pred = model.predict(test_df[["col1", "col2", "col3"]])

    acc = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, zero_division=0)

    print(f"\n📁 File: {file_key}")
    print(f"✅ Accuracy: {acc:.3f}")
    print("📊 Classification Report:")
    print(report)

    results.append({
        "file": file_key,
        "accuracy": acc,
        "test_samples": len(test_df)
    })

# Summary Table
summary_df = pd.DataFrame(results)
print("\n📋 Per-file evaluation summary:")
print(summary_df)

# Optional: Save to CSV
summary_df.to_csv("per_file_eval_summary.csv", index=False)



📁 File: mal data at high rpm using newer drill bit using wood
✅ Accuracy: 0.913
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.91      0.95      9455

    accuracy                           0.91      9455
   macro avg       0.50      0.46      0.48      9455
weighted avg       1.00      0.91      0.95      9455


📁 File: mal data at high rpm using job
✅ Accuracy: 0.879
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.88      0.94      9093

    accuracy                           0.88      9093
   macro avg       0.50      0.44      0.47      9093
weighted avg       1.00      0.88      0.94      9093


📁 File: MAL DATA AT LOW RPM WITHOUT JOB
✅ Accuracy: 0.937
📊 Classification Report:
              precision    recall  f1-score   support

           0      