# FactoryGuard AI - Exploratory Data Analysis
**Week 1, Days 1-2 - All Team Members**

This notebook performs initial exploratory data analysis on the sensor data.

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Data

In [None]:
# Read the raw sensor log and convert the 'timestamp' column to datetimes
# in a single chained expression.
df = pd.read_csv('../data/raw/sensor_logs.csv').assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# Quick orientation: size, column names, and the first few rows.
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
df.head()

## 2. Basic Statistics

In [None]:
# Headline numbers for the dataset: size, machine count, time span, and how
# often the 'failure' flag is set.
n_records = len(df)
n_failures = df['failure'].sum()
first_seen = df['timestamp'].min()
last_seen = df['timestamp'].max()

summary_lines = [
    "=== Dataset Summary ===",
    f"Total records: {n_records:,}",
    f"Unique machines: {df['machine_id'].nunique()}",
    f"Time range: {first_seen} to {last_seen}",
    f"\nFailure events: {n_failures}",
    f"Failure rate: {(n_failures / n_records) * 100:.3f}%",
]
print("\n".join(summary_lines))

In [None]:
# Descriptive statistics for the three sensor columns
df.loc[:, ['vibration', 'temperature', 'pressure']].describe()

## 3. Missing Values Analysis

In [None]:
# Per-column missing-value report.
#
# Count and percentage are shown together in one table. Interpolating a whole
# Series into an f-string (as this cell previously did) prints the Series'
# multi-line repr, including its trailing "dtype:" line, directly after the
# label and leaves a single stray "%" at the very end.
missing_count = df.isnull().sum()
missing_pct = (missing_count / len(df) * 100).round(2)

missing_summary = pd.DataFrame({
    'missing_count': missing_count,
    'missing_pct': missing_pct,
})

print("Missing values (count and % of rows per column):")
missing_summary

## 4. Sensor Distributions

In [None]:
# Histogram of each sensor channel, one panel per channel.
#
# Each spec is (column, panel title, x-axis label, bar colour). 'C0' is the
# first colour of matplotlib's active colour cycle, i.e. the colour a
# histogram gets when no colour is passed.
histogram_specs = [
    ('vibration', 'Vibration Distribution', 'Vibration', 'C0'),
    ('temperature', 'Temperature Distribution', 'Temperature (°C)', 'orange'),
    ('pressure', 'Pressure Distribution', 'Pressure', 'green'),
]

fig, axes = plt.subplots(1, len(histogram_specs), figsize=(15, 4))

for ax, (column, title, xlabel, color) in zip(axes, histogram_specs):
    # Drop missing readings explicitly so the binning never sees NaN.
    ax.hist(df[column].dropna(), bins=50, edgecolor='black', alpha=0.7, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

## 5. Sensor Correlations

In [None]:
# Pairwise correlation between the sensor readings and the failure flag,
# drawn as an annotated heatmap centred on zero.
feature_columns = ['vibration', 'temperature', 'pressure', 'failure']
corr = df[feature_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, ax=ax, annot=True, fmt='.3f', cmap='coolwarm', center=0)
ax.set_title('Feature Correlation Matrix')
plt.show()

## 6. Time Series Plot (Sample Machine)

In [None]:
# Sensor timeline for a single machine, with its failure events marked.
# Change `machine_id` to inspect a different machine; the filter and the
# figure title both follow it.
machine_id = 1
machine_data = df[df['machine_id'] == machine_id].sort_values('timestamp')

if machine_data.empty:
    # An empty selection would otherwise render three blank panels silently.
    print(f"No rows found for machine_id == {machine_id!r}; the plots below will be empty.")

# Each spec is (column, y-axis label, line colour). 'C0' is the first colour
# of matplotlib's active colour cycle, i.e. the colour a line gets when no
# colour is passed.
timeline_specs = [
    ('vibration', 'Vibration', 'C0'),
    ('temperature', 'Temperature (°C)', 'orange'),
    ('pressure', 'Pressure', 'green'),
]

fig, axes = plt.subplots(len(timeline_specs), 1, figsize=(15, 10), sharex=True)

for ax, (column, ylabel, color) in zip(axes, timeline_specs):
    ax.plot(machine_data['timestamp'], machine_data[column], color=color, linewidth=1)
    ax.set_ylabel(ylabel)
    ax.grid(True, alpha=0.3)

# The panels share the x-axis, so only the top one carries the title and
# only the bottom one carries the x-axis label.
axes[0].set_title(f'Machine {machine_id} - Sensor Timeline')
axes[-1].set_xlabel('Timestamp')

# Mark failures: one dashed red vertical line per failure row, on every panel.
failures = machine_data[machine_data['failure'] == 1]
for ax in axes:
    for fail_time in failures['timestamp']:
        ax.axvline(fail_time, color='red', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

## 7. Next Steps

- ✅ Data loaded successfully
- ✅ Understood data structure and distributions
- 🔄 Proceed to data cleaning (`data_cleaning.py`)
- 🔄 Apply temporal feature engineering (`feature_engineering.py`)