<a href="https://colab.research.google.com/github/ManishaEY/eyaipythontraining/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# The direct URL to the raw CSV file on GitHub
url = 'https://raw.githubusercontent.com/ManishaEY/eyaipythontraining/main/equipment_data.csv'

# Read dataset from the GitHub raw URL
data = pd.read_csv(url)

# Separate features and target variable
X = data[['temperature', 'pressure', 'vibration', 'acoustic', 'operational_hours']]
y = data['maintenance_performed']

# Split the dataset into training (80%) and testing sets (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data (feature scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize machine learning models
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC()

# Train models on normalized data
rf_clf.fit(X_train_scaled, y_train)
svm_clf.fit(X_train_scaled, y_train)

# Make predictions on test data
rf_predictions = rf_clf.predict(X_test_scaled)
svm_predictions = svm_clf.predict(X_test_scaled)

# Evaluate models' performance
rf_acc = accuracy_score(y_test, rf_predictions)
svm_acc = accuracy_score(y_test, svm_predictions)

rf_report = classification_report(y_test, rf_predictions)
svm_report = classification_report(y_test, svm_predictions)

print(f"Random Forest Model Accuracy: {rf_acc:.2f}")
print(f"Support Vector Machine Model Accuracy: {svm_acc:.2f}")
print("\nRandom Forest Classification Report:")
print(rf_report)
print("\nSupport Vector Machine Classification Report:")
print(svm_report)

# Select and print the best model based on accuracy
best_model = 'Random Forest' if rf_acc > svm_acc else 'Support Vector Machine'
print(f"The best model is {best_model}")


Random Forest Model Accuracy: 0.48
Support Vector Machine Model Accuracy: 0.49

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.64      0.55        99
           1       0.49      0.34      0.40       101

    accuracy                           0.48       200
   macro avg       0.49      0.49      0.47       200
weighted avg       0.49      0.48      0.47       200


Support Vector Machine Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.72      0.58        99
           1       0.49      0.27      0.35       101

    accuracy                           0.49       200
   macro avg       0.49      0.49      0.46       200
weighted avg       0.49      0.49      0.46       200

The best model is Support Vector Machine


In [12]:
pip install --upgrade statsmodels



In [13]:
pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.4.2


In [18]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error, classification_report
import pandas as pd
import numpy as np
import joblib

# Load dataset for forecasting from the GitHub raw URL
url = 'https://raw.githubusercontent.com/ManishaEY/eyaipythontraining/main/equipment_data.csv'
time_series_data = pd.read_csv(url, parse_dates=['timestamp'], index_col='timestamp')
# time_series_data.index.freq = 'D'
sensor_column_name = 'temperature'  # change this to the actual column name

# ARIMA Model for time-series forecasting
arima_model = ARIMA(time_series_data[sensor_column_name], order=(1, 1, 1))
arima_results = arima_model.fit()

# Use get_forecast to forecast future sensor readings with stderr and conf_int
n_periods = 5  # change this to forecast more periods
forecast_results = arima_results.get_forecast(steps=n_periods)
forecast_mean = forecast_results.predicted_mean
stderr = forecast_results.se_mean
conf_int = forecast_results.conf_int()

# Hardcoded actual future readings for the temperature sensor
actual_future_readings = [21.5, 21.3, 21.7, 22.0, 22.5]  # Dummy actual readings

# Calculate Mean Absolute Error for the forecast
mae = mean_absolute_error(actual_future_readings, forecast_mean)
print(f"Mean Absolute Error for the forecast: {mae}")

# Load dataset for anomaly detection (same URL as forecasting in this case)
anomaly_data = pd.read_csv(url)
feature_cols = ['temperature', 'pressure', 'vibration', 'acoustic']  # change to your actual sensor data columns

# Isolation Forest Model for anomaly detection
iso_forest = IsolationForest(n_estimators=100, max_samples='auto', random_state=42, contamination='auto')
iso_forest.fit(anomaly_data[feature_cols])

# Anomaly detection in the dataset
anomaly_scores = iso_forest.decision_function(anomaly_data[feature_cols])
anomaly_data['anomaly'] = iso_forest.predict(anomaly_data[feature_cols])
anomaly_data['anomaly'] = anomaly_data['anomaly'].map({1: 0, -1: 1})  # Mapping 1 to normal, -1 to anomaly

# Hardcoded actual anomaly labels (0 for normal and 1 for anomaly)
actual_anomalies = [0, 0, 0, 1, 0, 0, 1, 0, 0, 1]  # Dummy anomaly labels

# Calculate classification report for anomaly detection
report = classification_report(actual_anomalies, anomaly_data['anomaly'][:len(actual_anomalies)], target_names=['normal', 'anomaly'])
print("\nAnomaly Detection Classification Report: \n", report)

# Save ARIMA model's entire results including parameters
joblib.dump(arima_results, 'arima_model.pkl')

# Save Isolation Forest model
joblib.dump(iso_forest, 'isolation_forest_model.pkl')


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Mean Absolute Error for the forecast: 65.56027218155381

Anomaly Detection Classification Report: 
               precision    recall  f1-score   support

      normal       1.00      0.43      0.60         7
     anomaly       0.43      1.00      0.60         3

    accuracy                           0.60        10
   macro avg       0.71      0.71      0.60        10
weighted avg       0.83      0.60      0.60        10



['isolation_forest_model.pkl']

In [17]:
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error, classification_report
import pandas as pd
import numpy as np
import joblib


# Load dataset for forecasting from the GitHub raw URL
url = 'https://raw.githubusercontent.com/ManishaEY/eyaipythontraining/main/equipment_data.csv'
time_series_data = pd.read_csv(url, parse_dates=['timestamp'], index_col='timestamp')

# You'll need to infer or set the frequency of the dataset here
time_series_data.index.freq = pd.infer_freq(time_series_data.index)  # or 'D' for daily, 'H' for hourly, etc.

sensor_column_name = 'temperature'  # change this to the actual column name

# ARIMA Model for time-series forecasting
arima_model = ARIMA(time_series_data[sensor_column_name], order=(1, 1, 1))
arima_results = arima_model.fit()

# Use get_forecast to forecast future sensor readings
n_periods = 5
forecast_results = arima_results.get_forecast(steps=n_periods)
forecast_mean = forecast_results.predicted_mean
stderr = forecast_results.se_mean
conf_int = forecast_results.conf_int()

# Hardcoded actual future readings for the temperature sensor
actual_future_readings = [21.5, 21.3, 21.7, 22.0, 22.5]  # Dummy actual readings

# Calculate Mean Absolute Error for the forecast
mae = mean_absolute_error(actual_future_readings, forecast_mean)
print(f"Mean Absolute Error for the forecast: {mae}")

# Load dataset for anomaly detection (same URL as forecasting in this case)
anomaly_data = pd.read_csv(url)

# Assuming the dataset also contains a 'machine' column with machine names
machine_column_name = 'machine_id'  # Replace with your actual column name for machine names
anomaly_data[machine_column_name] = np.random.choice(anomaly_data['machine_id'], size=len(anomaly_data))

feature_cols = ['temperature', 'pressure', 'vibration', 'acoustic']  # Replace with actual sensor data columns

# Isolation Forest Model for anomaly detection
iso_forest = IsolationForest(n_estimators=100, max_samples='auto', random_state=42, contamination='auto')
iso_forest.fit(anomaly_data[feature_cols])

# Anomaly detection in the dataset
anomaly_data['anomaly'] = iso_forest.predict(anomaly_data[feature_cols])
anomaly_data['anomaly'] = anomaly_data['anomaly'].map({1: 0, -1: 1})  # Mapping 1 to normal, -1 to anomaly

# Hardcoded actual anomaly labels (0 for normal and 1 for anomaly)
actual_anomalies = [0, 0, 0, 1, 0, 0, 1, 0, 0, 1]  # Dummy anomaly labels

# Calculate classification report for anomaly detection
# Truncate the data to match the length of actual anomaly labels
report = classification_report(actual_anomalies, anomaly_data['anomaly'][:len(actual_anomalies)], target_names=['normal', 'anomaly'])
print("\nAnomaly Detection Classification Report:")
print(report)

# Print machines with anomalies
normal_data_stats = anomaly_data[anomaly_data['anomaly'] == 0][feature_cols].describe()

# Print machines with anomalies including statistical context
anomalies = anomaly_data[anomaly_data['anomaly'] == 1]
print("\nDetected Anomalies:")
for index, row in anomalies.iterrows():
    machine_name = row[machine_column_name]
    anomaly_details = row[feature_cols].to_dict()
    print(f"Anomaly detected in {machine_name} with the following feature values:")
    for feature, value in anomaly_details.items():
        mean_value = normal_data_stats[feature]['mean']
        std_dev = normal_data_stats[feature]['std']
        print(f"  {feature}: {value} (mean: {mean_value:.2f}, std: {std_dev:.2f})")


# Save ARIMA model's entire results including parameters
joblib.dump(arima_results, 'arima_model.pkl')

# Save Isolation Forest model
joblib.dump(iso_forest, 'isolation_forest_model.pkl')


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Mean Absolute Error for the forecast: 65.56027218155381

Anomaly Detection Classification Report:
              precision    recall  f1-score   support

      normal       1.00      0.43      0.60         7
     anomaly       0.43      1.00      0.60         3

    accuracy                           0.60        10
   macro avg       0.71      0.71      0.60        10
weighted avg       0.83      0.60      0.60        10


Detected Anomalies:
Anomaly detected in MCH-075 with the following feature values:
  temperature: 93 (mean: 87.55, std: 2.53)
  pressure: 169 (mean: 178.80, std: 4.99)
  vibration: 0.008 (mean: 0.01, std: 0.00)
  acoustic: 22 (mean: 22.70, std: 1.99)
Anomaly detected in MCH-001 with the following feature values:
  temperature: 94 (mean: 87.55, std: 2.53)
  pressure: 188 (mean: 178.80, std: 4.99)
  vibration: 0.011 (mean: 0.01, std: 0.00)
  acoustic: 22 (mean: 22.70, std: 1.99)
Anomaly detected in MCH-149 with the following feature values:
  temperature: 81 (mean: 87.5

['isolation_forest_model.pkl']