#  1. SETUP

In [1]:
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

#  2. LOAD DATA

In [2]:
# Load the occupancy readings and convert the 'date' column to datetimes in
# one chained expression.
# NOTE(review): the absolute "/content/..." path looks like a hosted-runtime
# location — confirm it before running this notebook elsewhere.
df = pd.read_csv("/content/Occupancy.csv").assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

#  3. BATCH PROCESSING

In [3]:
print("=== BATCH PROCESSING ===")

# Predictor columns fed to the model; 'Occupancy' is the target column.
feature_columns = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']
X = df[feature_columns]
y = df['Occupancy']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training chain.
batch_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the whole held-out set in a single predict() call.
y_pred = batch_model.predict(X_test)

# Display label -> scoring function; every scorer is called as
# scorer(y_true, y_pred).
metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}
batch_metrics = {
    label: scorer(y_test, y_pred) for label, scorer in metric_functions.items()
}

print("Batch Metrics:")
print("\n".join(f"{label}: {value:.4f}" for label, value in batch_metrics.items()))

=== BATCH PROCESSING ===
Batch Metrics:
Accuracy: 0.9917
Precision: 0.9683
Recall: 0.9957
F1-Score: 0.9818


# 4. STREAM PROCESSING SIMULATION

In [4]:
print("\n=== STREAM PROCESSING ===")

# Alerts collected by stream_processing(); reset here so re-running the cell
# does not accumulate alerts from a previous run.
stream_alerts = []


def stream_processing(df, co2_threshold=800):
    """Replay `df` one row at a time through `batch_model`.

    Each row is wrapped in a single-row DataFrame (so the feature names are
    passed along to the model) and classified on its own, imitating records
    arriving one by one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five feature columns plus a 'date' column.
    co2_threshold : float, default 800
        An alert is recorded when a row's CO2 reading is strictly above this
        value and the predicted class is 1.

    Returns
    -------
    list
        One prediction per row, in iteration order.

    Side effects
    ------------
    Appends ``(date, CO2)`` tuples to the module-level ``stream_alerts`` list.
    """
    feature_columns = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']
    predictions = []
    for _, row in df.iterrows():
        # Build the single-record frame in the same column order as training.
        input_features = pd.DataFrame(
            [{column: row[column] for column in feature_columns}]
        )
        prediction = batch_model.predict(input_features)[0]
        predictions.append(prediction)

        if row['CO2'] > co2_threshold and prediction == 1:
            stream_alerts.append((row['date'], row['CO2']))

    return predictions


# Prepare the replay input outside the timed region: attach each test row's
# timestamp so alerts can report when they fired.
stream_input = X_test.assign(date=df.loc[X_test.index, 'date'])

# Time only the row-by-row replay. perf_counter() is a monotonic,
# high-resolution clock intended for measuring elapsed intervals.
start_time = time.perf_counter()
stream_preds = stream_processing(stream_input)
end_time = time.perf_counter()

# Metrics are computed after the timer stops so they do not inflate the
# reported processing time.
stream_metrics = {
    "Accuracy": accuracy_score(y_test, stream_preds),
    "Precision": precision_score(y_test, stream_preds),
    "Recall": recall_score(y_test, stream_preds),
    "F1-Score": f1_score(y_test, stream_preds)
}

print("Stream Metrics (simulated, using same model):")
for k, v in stream_metrics.items():
    print(f"{k}: {v:.4f}")

print(f"\nStream Alerts Raised: {len(stream_alerts)}")
print(f"Stream Processing Time: {end_time - start_time:.2f} seconds")


=== STREAM PROCESSING ===
Stream Metrics (simulated, using same model):
Accuracy: 0.9917
Precision: 0.9683
Recall: 0.9957
F1-Score: 0.9818

Stream Alerts Raised: 645
Stream Processing Time: 6.54 seconds


#  5. COMPARISON TABLE

In [5]:
print("\n=== COMPARISON ===")

# Side-by-side table of the two evaluation runs. Stream values are looked up
# by metric name rather than paired by position, so a row can never show a
# value under the wrong label if the two dicts differ in key order; a metric
# missing from stream_metrics raises KeyError instead of silently misaligning.
# (pandas is already imported in the setup cell, so it is not re-imported.)
comparison_df = pd.DataFrame({
    "Metric": list(batch_metrics.keys()),
    "Batch Processing": list(batch_metrics.values()),
    "Stream Processing (Simulated)": [
        stream_metrics[metric_name] for metric_name in batch_metrics
    ],
})

print(comparison_df.to_string(index=False))


=== COMPARISON ===
   Metric  Batch Processing  Stream Processing (Simulated)
 Accuracy          0.991732                       0.991732
Precision          0.968288                       0.968288
   Recall          0.995652                       0.995652
 F1-Score          0.981779                       0.981779


# **NOTE: The same model is used in both modes, hence the identical metrics.**