# 🤖 Smart Cage - MQ2 Gas Sensor Model Training

**Samsung Innovation Campus - Phase 3**

Notebook ini melatih model ML untuk klasifikasi kondisi gas:
- **Aman**: Tidak ada gas terdeteksi atau durasi < 2 detik
- **Waspada**: Gas terdeteksi selama 2-4 detik
- **Bahaya**: Gas terdeteksi selama > 4 detik

## 📦 Step 1: Install Dependencies

In [None]:
!pip install pandas scikit-learn matplotlib seaborn joblib

## 📚 Step 2: Import Libraries

In [None]:
# Data handling
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Train/test splitting, the classifier, and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
# Model serialization
import joblib

# Reaching this line means every import above resolved.
print("‚úÖ Libraries imported!")

## 📁 Step 3: Upload Dataset

Upload file CSV yang dikumpulkan dari `collect_dataset_mq2.ipynb`

In [None]:
# `os` is not used in this cell; it is imported here because the model-saving
# cell later calls os.path.getsize().
import os

# Colab-only helper that provides the browser upload/download dialogs.
from google.colab import files

print("üì§ Upload your MQ2 dataset CSV file:")
# The result is keyed by the uploaded file names.
uploaded = files.upload()

# If nothing was uploaded (e.g. the dialog was dismissed) fail with a clear
# message instead of an IndexError from indexing an empty list.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select the MQ2 dataset CSV."
    )

# Use the first uploaded file as the dataset; any additional files are ignored.
csv_file = next(iter(uploaded))
print(f"üìÅ Uploaded: {csv_file}")

## 🔍 Step 4: Load & Explore Dataset

In [None]:
# Read the uploaded CSV into a DataFrame.
df = pd.read_csv(csv_file)

# Size and column overview.
row_count = df.shape[0]
column_names = df.columns.tolist()
print("üìä Dataset Info:")
print(f"  Rows: {row_count}")
print(f"  Columns: {column_names}")

# First few rows, rendered as a rich table by the notebook.
print("\nüìã Preview:")
display(df.head())

# Number of samples per label.
label_frequencies = df['label'].value_counts()
print("\nüìà Label Distribution:")
print(label_frequencies)

# Summary statistics as produced by DataFrame.describe().
print("\nüìä Statistics:")
display(df.describe())

# Per-column count of missing cells.
missing_per_column = df.isna().sum()
print("\n‚ùì Missing Values:")
print(missing_per_column)

## 🎨 Step 5: Data Visualization

In [None]:
# One figure with two panels: duration histograms (left), label share (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
hist_ax, pie_ax = axes

# Fixed colour per label; any label missing from this map is drawn in gray.
colors = {'Aman': 'green', 'Waspada': 'orange', 'Bahaya': 'red'}

# Left panel: overlay one semi-transparent histogram per label.
for label in df['label'].unique():
    durations = df.loc[df['label'] == label, 'duration_ms']
    hist_ax.hist(
        durations,
        bins=15,
        alpha=0.6,
        label=label,
        color=colors.get(label, 'gray'),
    )
hist_ax.set_xlabel('Duration (ms)')
hist_ax.set_ylabel('Count')
hist_ax.set_title('Duration Distribution by Label')
# Dashed guides at the 2000 ms and 4000 ms marks.
for threshold_ms, line_color in ((2000, 'orange'), (4000, 'red')):
    hist_ax.axvline(x=threshold_ms, color=line_color, linestyle='--', linewidth=2)
hist_ax.legend()

# Right panel: share of each label, coloured consistently with the histogram.
label_counts = df['label'].value_counts()
pie_colors = [colors.get(name, 'gray') for name in label_counts.index]
pie_ax.pie(
    label_counts,
    labels=label_counts.index,
    autopct='%1.1f%%',
    colors=pie_colors,
    startangle=90,
)
pie_ax.set_title('Label Distribution')

plt.tight_layout()
plt.show()

## ✂️ Step 6: Prepare Data & Split

In [None]:
# Single-feature setup: duration_ms is the only model input and `label` is the
# target. No thresholds are hard-coded here; the classifier is expected to
# learn them from the data.
feature_columns = ['duration_ms']
X = df[feature_columns]
y = df['label']

# Hold out 20% for testing. The split is stratified on the label and uses a
# fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"‚úÖ Data split complete!")
print(f"   Training: {len(X_train)} samples")
print(f"   Testing: {len(X_test)} samples")

# Show the per-label counts of each partition.
for heading, partition_labels in (
    ("\nüìä Training label distribution:", y_train),
    ("\nüìä Testing label distribution:", y_test),
):
    print(heading)
    print(partition_labels.value_counts())

## 🤖 Step 7: Train Model (Decision Tree)

In [None]:
# Fit a decision tree capped at depth 5 on the training split, with a fixed
# random_state.
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Report what was trained.
model_kind = model.__class__.__name__
print(f"‚úÖ Model trained!")
print(f"   Model: {model_kind}")
print(f"   Max depth: {model.max_depth}")

## 📊 Step 8: Model Evaluation

In [None]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

# Overall accuracy, plus precision / recall / F1 computed with
# average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Summary table framed by horizontal rules.
rule = "=" * 50
print(rule)
print("üìä MODEL EVALUATION METRICS")
print(rule)
print(f"‚úÖ Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"üìç Precision: {precision:.4f}")
print(f"üéØ Recall:    {recall:.4f}")
print(f"‚öñÔ∏è F1-Score:  {f1:.4f}")
print(rule)

# Per-class breakdown.
print("\nüìã Classification Report:")
print(classification_report(y_test, y_pred))

## 📈 Step 9: Confusion Matrix

In [None]:
# Fix the class order once and use it for BOTH the matrix and the tick labels.
# Without an explicit `labels=` argument, confusion_matrix() builds its axes
# from the labels that occur in y_test / y_pred, so a class missing from the
# test split would shrink the matrix and misalign it with the tick labels.
labels = sorted(df['label'].unique())
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Heatmap with raw counts in each cell: rows are actual labels, columns are
# predicted labels.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - MQ2 Gas Sensor Model')
plt.show()

# Also print the raw matrix.
print("\nüìä Confusion Matrix:")
print(cm)

## 🌳 Step 10: Visualize Decision Tree

In [None]:
from sklearn.tree import plot_tree

# Take the feature and class names from what the model was actually trained
# on, so the node annotations cannot drift from the fitted tree:
#   - X.columns is the feature frame passed to the train/test split;
#   - model.classes_ is the class order the classifier itself uses.
# Both are converted to plain lists of str before being passed to plot_tree.
tree_feature_names = [str(name) for name in X.columns]
tree_class_names = [str(name) for name in model.classes_]

plt.figure(figsize=(15, 8))
plot_tree(model, feature_names=tree_feature_names,
          class_names=tree_class_names,
          filled=True, rounded=True, fontsize=10)
plt.title('Decision Tree - MQ2 Gas Sensor Classification')
plt.tight_layout()
plt.show()

## 💾 Step 11: Save Model

In [None]:
# Serialize the fitted classifier to a file with joblib.
model_filename = "mq2_gas_model.pkl"
joblib.dump(model, model_filename)
print(f"‚úÖ Model saved: {model_filename}")

# Report the size of the written file.
size_in_bytes = os.path.getsize(model_filename)
print(f"üìä File size: {size_in_bytes} bytes")

# Hand the saved file to the Colab `files` helper for download.
files.download(model_filename)
print("üì• Download started!")

## 🧪 Step 12: Test Predictions

In [None]:
# Try the model on new, hand-picked durations (one value per row, in ms).
test_data = [
    [0],        # No gas - should be Aman
    [500],      # Short detection - should be Aman
    [1500],     # < 2s - should be Aman
    [2500],     # 2-4s - should be Waspada
    [3500],     # 2-4s - should be Waspada
    [5000],     # > 4s - should be Bahaya
    [10000],    # Long detection - should be Bahaya
]

# The model was fitted on a DataFrame, so predict on a DataFrame with the same
# columns. Passing the bare list makes scikit-learn warn that the input has no
# valid feature names.
test_frame = pd.DataFrame(test_data, columns=list(X.columns))

predictions = model.predict(test_frame)
probabilities = model.predict_proba(test_frame)

print("=" * 60)
print("üß™ TEST PREDICTIONS")
print("=" * 60)
print(f"{'Duration (ms)':<15} {'Prediction':<12} {'Confidence':<12}")
print("-" * 60)

# One row per probe; confidence is the highest class probability, in percent.
for data, pred, proba in zip(test_data, predictions, probabilities):
    confidence = max(proba) * 100
    print(f"{data[0]:<15} {pred:<12} {confidence:.1f}%")

print("=" * 60)