In [1]:
import zipfile
import os


In [3]:
# Path to the zip file
zip_path = 'Vibration_Bearing_RuntoFailure.zip'

# Directory to extract to (you can change this if you like)
extract_to = 'dataset'


In [None]:
# Create the folder if it doesn't exist
os.makedirs(extract_to, exist_ok=True)

# Extract the zip
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("✅ All files extracted to:", extract_to)


In [None]:
import os
import glob
import pandas as pd

# ✅ Updated path where the CSVs were extracted
extract_to = 'dataset/Vibration_Bearing_RuntoFailure'

# ✅ Recursively find all CSV files in the folder
csv_files = glob.glob(os.path.join(extract_to, '**', '*.csv'), recursive=True)

print(f"✅ Found {len(csv_files)} CSV files.")

# ✅ Load and combine all CSV files
data_list = []
for file in csv_files:
    df = pd.read_csv(file, header=None)  # No header in the CSVs
    data_list.append(df)

# ✅ Check if any CSVs were actually read
if len(data_list) == 0:
    print("❌ No data loaded. Check if CSVs are present.")
else:
    # ✅ Combine into one big DataFrame
    full_data = pd.concat(data_list, ignore_index=True)

    # ✅ Rename the columns
    full_data.columns = ['vibration_x', 'vibration_y', 'bearing_temp', 'ambient_temp']

    print("✅ All files combined into one DataFrame.")
    print(full_data.head())
    print("✅ DataFrame shape:", full_data.shape)



In [None]:
print(full_data.head())


In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(full_data)

print("✅ Normalization complete.")


In [None]:
pip install dask


In [None]:
# Step 1: Imports
import os
import dask.dataframe as dd

# Step 2: Set paths
extract_to = 'dataset/Vibration_Bearing_RuntoFailure'  # Where original CSVs were extracted
normalized_dir = 'normalized_dataset'  # New directory for normalized files
os.makedirs(normalized_dir, exist_ok=True)  # Create if not exists

# Step 3: Read all CSV files using Dask (headers are not present, add manually)
dask_df = dd.read_csv(
    os.path.join(extract_to, '*.csv'),
    header=None,
    names=['vibration_x', 'vibration_y', 'bearing_temp', 'ambient_temp']
)

print("✅ Loaded all CSVs with Dask")

# Step 4: Normalize each column using min-max scaling
# First compute min and max for each column
min_vals = dask_df.min().compute()
max_vals = dask_df.max().compute()

# Apply min-max normalization
dask_df_scaled = dask_df.map_partitions(
    lambda df: (df - min_vals) / (max_vals - min_vals)
)

print("✅ Normalization (min-max scaling) complete")

# Step 5: Save normalized chunks to a new folder
dask_df_scaled.to_csv(os.path.join(normalized_dir, 'part_*.csv'), index=False)
print("✅ Normalized files saved to:", normalized_dir)


In [None]:
# Define the output path (change if needed)
output_path = 'researchModel'

# Save the DataFrame
full_data.to_csv(output_path, index=False)

print(f"✅ full_data saved to: {output_path}")


In [None]:
import dask.dataframe as dd

# Read large CSV with Dask
dask_df = dd.read_csv('researchModel')

# Check sample
print(dask_df.head())


In [None]:
!pip  install pyarrow

In [None]:
import dask.dataframe as dd

# Read large CSV with Dask
dask_df = dd.read_csv('researchModel')

# Check sample
print(dask_df.head())


In [None]:
pip install --upgrade pyarrow


In [None]:
import dask.dataframe as dd

# Read CSV
dask_df = dd.read_csv('researchModel')  # Make sure this is a valid CSV path or pattern like 'researchModel/*.csv'

# Preview sample
print(dask_df.head())


In [None]:
pip install dask-ml


In [None]:
import dask.dataframe as dd
from dask_ml.preprocessing import MinMaxScaler

# Load Dask DataFrame


# Initialize Dask-compatible scaler
scaler = MinMaxScaler()

# Fit and transform using Dask
scaled_df = scaler.fit_transform(dask_df)

# Write scaled result to CSV (in partitions)
scaled_df.to_csv('part_*.csv', index=False)


In [7]:
import dask.dataframe as dd

# Load all CSV files as Dask DataFrame
scaled_dask_df = dd.read_csv("part_*.csv")

# If you want to inspect
print(scaled_dask_df.head())


   vibration_x  vibration_y  bearing_temp  ambient_temp
0     0.562966     0.805120      0.146285       0.17329
1     0.561135     0.799422      0.146285       0.17329
2     0.561044     0.798160      0.146285       0.17329
3     0.564357     0.802627      0.146285       0.17329
4     0.565723     0.804576      0.146285       0.17329


In [9]:
# Get approximate number of rows (will compute)
print("Rows:", len(scaled_dask_df))
print("Columns:", len(scaled_dask_df.columns))


Rows: 258000129
Columns: 4




In [11]:
# Load a sample for training (you can increase later)
sample_df = scaled_dask_df.head(1_000_000)  # adjust size based on RAM
print("✅ Sample shape:", sample_df.shape)

# Convert to NumPy
data_np = sample_df.to_numpy()
print("✅ Converted to NumPy:", data_np.shape)


✅ Sample shape: (893365, 4)
✅ Converted to NumPy: (893365, 4)


In [13]:
import numpy as np

def create_sequences(data, time_steps=100):
    sequences = []
    for i in range(len(data) - time_steps):
        sequences.append(data[i:i+time_steps])
    return np.array(sequences)

# Create sequences
TIME_STEPS = 100
sequences = create_sequences(data_np, TIME_STEPS)

print("✅ LSTM input shape:", sequences.shape)  # (samples, 100, 4)


✅ LSTM input shape: (893265, 100, 4)


In [15]:
train_size = int(0.8 * len(sequences))
X_train = sequences[:train_size]
X_test = sequences[train_size:]

print("✅ Train:", X_train.shape)
print("✅ Test:", X_test.shape)


✅ Train: (714612, 100, 4)
✅ Test: (178653, 100, 4)


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, RepeatVector, TimeDistributed, Dense

model = Sequential([
    LSTM(64, activation='relu', input_shape=(TIME_STEPS, X_train.shape[2]), return_sequences=False),
    RepeatVector(TIME_STEPS),
    LSTM(64, activation='relu', return_sequences=True),
    TimeDistributed(Dense(X_train.shape[2]))
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# Train model
history = model.fit(X_train, X_train, epochs=10, batch_size=64, validation_split=0.1, shuffle=True)

print("✅ Model training complete.")


  super().__init__(**kwargs)


Epoch 1/10
[1m10050/10050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1069s[0m 106ms/step - loss: nan - val_loss: nan
Epoch 2/10
[1m10050/10050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1032s[0m 103ms/step - loss: nan - val_loss: nan
Epoch 3/10
[1m10050/10050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1040s[0m 104ms/step - loss: nan - val_loss: nan
Epoch 4/10
[1m10050/10050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m976s[0m 97ms/step - loss: nan - val_loss: nan
Epoch 5/10
[1m10050/10050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m954s[0m 95ms/step - loss: nan - val_loss: nan
Epoch 6/10
[1m10050/10050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m986s[0m 98ms/step - loss: nan - val_loss: nan
Epoch 7/10
[1m10050/10050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m992s[0m 99ms/step - loss: nan - val_loss: nan
Epoch 8/10
[1m10050/10050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m989s[0m 98ms/step - loss: nan - val_loss: nan
Epoch 9/10
[1m10050/10050

In [27]:
# 📈 STEP 6: Calculate Reconstruction Error with NaN Fix

# Predict reconstruction of the test set
reconstructions = model.predict(X_test)

# ✅ Fix: Check and replace any NaN values
import numpy as np

print("✅ Any NaNs in X_test:", np.isnan(X_test).any())
print("✅ Any NaNs in reconstructions:", np.isnan(reconstructions).any())

# Replace NaNs with zeros (or you can use np.nanmean if needed)
X_test = np.nan_to_num(X_test)
reconstructions = np.nan_to_num(reconstructions)

# Now safely calculate MSE (reconstruction error)
mse = np.mean(np.power(X_test - reconstructions, 2), axis=(1, 2))

print("✅ Reconstruction error calculated.")
print("📊 MSE stats -> min:", np.min(mse), "max:", np.max(mse), "mean:", np.mean(mse))



[1m5583/5583[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 15ms/step
✅ Any NaNs in X_test: False
✅ Any NaNs in reconstructions: True
✅ Reconstruction error calculated.
📊 MSE stats -> min: 0.2579157377624866 max: 0.2645105030795334 mean: 0.26113250546200667


In [29]:
# 🔍 STEP 7: Anomaly Detection

# Use 95th percentile as threshold
threshold = np.percentile(mse, 95)

# Identify which points are anomalies
anomalies = mse > threshold

print(f"✅ Anomaly threshold (95th percentile): {threshold}")
print(f"✅ Number of anomalies detected: {np.sum(anomalies)} out of {len(mse)} samples")


✅ Anomaly threshold (95th percentile): 0.2627829735664438
✅ Number of anomalies detected: 8933 out of 178653 samples


In [None]:
# 🕐 STEP 9: Real-Time Simulation Using Sliding Window

WINDOW_SIZE = TIME_STEPS  # 100
step = 1  # slide one row at a time (can be >1 for faster simulation)
stream_data = data_np  # use full normalized dataset here

real_time_mse = []
real_time_anomalies = []
real_time_indices = []

for i in range(0, len(stream_data) - WINDOW_SIZE, step):
    window = stream_data[i:i+WINDOW_SIZE]
    window_input = np.expand_dims(window, axis=0)  # reshape to (1, 100, 4)
    
    # Predict and calculate error
    reconstructed = model.predict(window_input)
    error = np.mean(np.power(window_input - reconstructed, 2))

    real_time_mse.append(error)
    real_time_anomalies.append(error > threshold)
    real_time_indices.append(i + WINDOW_SIZE)  # point just after the window

print("✅ Real-time simulation complete.")
print("🚨 Total anomalies detected in stream:", sum(real_time_anomalies))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.plot(real_time_indices, real_time_mse, label='Real-Time MSE', color='blue')
plt.axhline(y=threshold, color='red', linestyle='--', label='Threshold')

# Highlight anomalies
anomaly_times = [idx for idx, is_anom in zip(real_time_indices, real_time_anomalies) if is_anom]
plt.scatter(anomaly_times, [threshold]*len(anomaly_times), color='orange', marker='x', label='Anomalies')

plt.title('🕐 Real-Time Sliding Window Anomaly Detection')
plt.xlabel('Time Index')
plt.ylabel('Reconstruction Error (MSE)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
plt.plot([1, 2, 3], [4, 5, 6])
plt.show()


In [None]:
anomaly_indices = np.where(anomalies)[0]

# Adjust for the time window offset (test set starts after 80%)
original_indices = anomaly_indices + train_size + TIME_STEPS

print("📍 First 10 Anomaly Points in Original DataFrame:", original_indices[:10])
sample_df.iloc[original_indices[:5]]
