# Global Data Processing for All Sensors

This notebook provides a unified preprocessing pipeline for all sensor datasets. It can process any of the sensor datasets (Fridge, Garage Door, GPS Tracker, Modbus, Motion Light, Thermostat, and Weather) using a consistent approach.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
import plotly.express as px
import os
# Apply seaborn's default plot theme globally before any figure is drawn.
sns.set()

## Define Processing Functions

In [None]:
def load_dataset(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file; handed unchanged to
            ``pd.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    frame = pd.read_csv(file_path)
    return frame

def handle_missing_values(df):
    """Fill missing values in ``df`` and return it.

    Numeric columns are filled with their column mean. Every other column
    is filled with its most frequent value (the first row of ``mode()``).

    Note:
        ``df`` is modified in place; the returned object is the same
        DataFrame that was passed in.

    Args:
        df: DataFrame to clean.

    Returns:
        The same DataFrame with missing values filled where a fill value
        could be computed. A non-numeric column that is entirely missing
        has no mode and is left untouched.
    """
    # Fill numeric columns with their mean.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if not numeric_columns.empty:
        df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

    # Fill the remaining (non-numeric) columns with their mode.
    categorical_columns = df.select_dtypes(exclude=[np.number]).columns
    if not categorical_columns.empty:
        modes = df[categorical_columns].mode()
        # mode() has no rows when there is nothing to count (e.g. every
        # non-numeric column is all-NaN); indexing row 0 would then raise.
        if not modes.empty:
            df[categorical_columns] = df[categorical_columns].fillna(modes.iloc[0])

    return df

def encode_categorical_features(df):
    """Label-encode every object-dtype column of ``df``.

    Each object column is overwritten in place with the output of its own
    ``LabelEncoder``; the fitted encoders are kept so the mapping can be
    reused later.

    Args:
        df: DataFrame whose object-dtype columns should be encoded.

    Returns:
        A ``(df, encoders)`` tuple where ``encoders`` maps each encoded
        column name to the ``LabelEncoder`` fitted on it.
    """
    fitted_encoders = {}

    for name in df.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        encoded_values = encoder.fit_transform(df[name])
        df[name] = encoded_values
        fitted_encoders[name] = encoder

    return df, fitted_encoders

def scale_features(df):
    """Min-max scale every numeric column of ``df``.

    A new ``MinMaxScaler`` is fitted on the numeric columns and the scaled
    values are written back into ``df`` in place.

    Args:
        df: DataFrame whose numeric columns should be rescaled.

    Returns:
        A ``(df, scaler)`` tuple containing the updated DataFrame and the
        fitted scaler.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    min_max = MinMaxScaler()
    scaled_values = min_max.fit_transform(df[numeric_cols])
    df[numeric_cols] = scaled_values
    return df, min_max

def handle_imbalanced_data(X, y):
    """Rebalance the classes in ``y`` with ``RandomOverSampler``.

    The sampler is seeded with ``random_state=42`` so repeated runs on the
    same input produce the same resampled data.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.

    Returns:
        A ``(X_resampled, y_resampled)`` tuple.
    """
    sampler = RandomOverSampler(random_state=42)
    balanced_X, balanced_y = sampler.fit_resample(X, y)
    return balanced_X, balanced_y

def generate_visualizations(df, sensor_name):
    """Save a correlation heatmap and a pairplot of ``df`` as PNG files.

    Both images are written with relative paths, named
    ``Correlation_heatmap_<sensor>.png`` and ``Pairplot_<sensor>.png``,
    where ``<sensor>`` is ``sensor_name`` in lower case.

    Args:
        df: DataFrame to visualize.
        sensor_name: Label used in the heatmap title and the file names.
    """
    # Heatmap of the pairwise column correlations, annotated per cell.
    plt.figure(figsize=(12, 8))
    correlations = df.corr()
    sns.heatmap(correlations, annot=True, cmap='coolwarm')
    plt.title(f'Correlation Heatmap - {sensor_name}')
    heatmap_file = f'Correlation_heatmap_{sensor_name.lower()}.png'
    plt.savefig(heatmap_file, dpi=600, bbox_inches='tight')
    plt.close()

    # Pairwise relationships between all columns.
    sns.pairplot(df)
    pairplot_file = f'Pairplot_{sensor_name.lower()}.png'
    plt.savefig(pairplot_file, dpi=600)
    plt.close()

def process_dataset(file_path, sensor_name, target_column=None):
    """Run the complete preprocessing pipeline on one sensor dataset.

    Steps: load the CSV, fill missing values, label-encode object columns,
    min-max scale the features, optionally oversample minority classes,
    save the visualizations, and write the processed data to
    ``Processed_<sensor>_sensor_dataset.csv``.

    When ``target_column`` is given and present, it is separated from the
    features *before* scaling. Scaling the target would turn class labels
    into fractional floats (e.g. 0, 1, 2 -> 0.0, 0.5, 1.0), which corrupts
    the labels and is not a valid classification target for the
    oversampler. The target is still label-encoded if it was an object
    column, and its encoder is included in the returned encoders.

    Args:
        file_path: Path to the raw CSV file.
        sensor_name: Sensor label used in log messages and output file names.
        target_column: Optional name of the label column used for
            oversampling. If it is falsy or not a column of the dataset,
            no resampling is performed and every numeric column is scaled.

    Returns:
        A ``(df, label_encoders, scaler)`` tuple: the processed DataFrame
        (target column last when resampling was applied), the fitted label
        encoders keyed by column name, and the fitted scaler. When a target
        column is used, the scaler is fitted on the feature columns only.
    """
    # Load the dataset
    print(f"Processing {sensor_name} dataset...")
    df = load_dataset(file_path)

    # Handle missing values
    df = handle_missing_values(df)

    # Encode categorical features
    df, label_encoders = encode_categorical_features(df)

    has_target = bool(target_column) and target_column in df.columns
    if target_column and not has_target:
        # Report the problem instead of silently skipping the resampling step.
        print(f"Target column '{target_column}' not found in {sensor_name} dataset; "
              "skipping resampling")

    if has_target:
        # Keep the labels out of the scaler so class values stay intact.
        y = df[target_column]
        X = df.drop(columns=[target_column])
        X, scaler = scale_features(X)

        # Handle imbalanced data
        X_resampled, y_resampled = handle_imbalanced_data(X, y)
        df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns),
                        pd.Series(y_resampled, name=target_column)], axis=1)
    else:
        # Scale features
        df, scaler = scale_features(df)

    # Generate visualizations
    generate_visualizations(df, sensor_name)

    # Save processed dataset
    output_file = f'Processed_{sensor_name.lower()}_sensor_dataset.csv'
    df.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")

    return df, label_encoders, scaler

## Process All Datasets

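To process a single dataset, call the `process_dataset` function with the file path, the sensor name, and (optionally) the target column. The cell below defines `process_all_sensors`, which applies `process_dataset` to every sensor dataset found in a directory: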

In [None]:
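# Example usage:
# Replace 'path/to/data/directory' in the call at the bottom of this cell with the
# directory that holds the sensor CSV files (named '<sensor>_sensor_dataset.csv', lower case).
# Supply the actual target column name to `process_dataset` if class rebalancing is wanted.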

def process_all_sensors(base_path, target_column=None):
    """Process every known sensor dataset found under ``base_path``.

    For each sensor the file ``<sensor>_sensor_dataset.csv`` (sensor name in
    lower case) is looked up in ``base_path`` and passed to
    ``process_dataset``. Missing files and processing failures are reported
    with ``print`` and skipped so one bad dataset does not stop the rest.

    Args:
        base_path: Directory containing the raw sensor CSV files.
        target_column: Optional label column name forwarded to
            ``process_dataset`` for every sensor. Defaults to ``None``,
            which disables resampling.

    Returns:
        A dict keyed by sensor name. Each value is a dict with the keys
        ``'data'``, ``'label_encoders'`` and ``'scaler'``. Sensors that
        were missing or failed are absent.
    """
    sensors = (
        'Fridge',
        'Garage_Door',
        'GPS_Tracker',
        'Modbus',
        'Motion_Light',
        'Thermostat',
        'Weather',
    )

    processed_data = {}

    for sensor in sensors:
        file_path = os.path.join(base_path, f'{sensor.lower()}_sensor_dataset.csv')
        if not os.path.exists(file_path):
            print(f"Dataset not found for {sensor}")
            continue

        # Deliberately broad: this is the batch boundary, and a failure in
        # one dataset should be reported without aborting the others.
        try:
            df, label_encoders, scaler = process_dataset(
                file_path=file_path,
                sensor_name=sensor,
                target_column=target_column,
            )
        except Exception as e:
            print(f"Error processing {sensor} dataset: {e}")
            continue

        processed_data[sensor] = {
            'data': df,
            'label_encoders': label_encoders,
            'scaler': scaler,
        }
        print(f"Successfully processed {sensor} dataset")

    return processed_data

# Run the pipeline over every sensor CSV found in the data directory.
processed_data = process_all_sensors(base_path='path/to/data/directory')