# Combine Processed Sensor Data

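This notebook loads the processed per-sensor CSV datasets, keeps only the columns that every dataset shares, tags each row with its source sensor, and concatenates the result into a single unified dataset for federated learning.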

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
sns.set()

## Load All Processed Datasets

In [None]:
def load_processed_datasets(base_path):
    """Read every available processed sensor CSV found under ``base_path``.

    For each known sensor the file
    ``Processed_<sensor name in lowercase>_sensor_dataset.csv`` is looked up
    inside ``base_path``. Missing files and files that fail to parse are
    reported on stdout and skipped.

    Args:
        base_path: Directory that holds the processed CSV files.

    Returns:
        dict mapping sensor name (original casing) to the DataFrame that was
        read for it. Sensors that could not be loaded are absent.
    """
    sensor_names = (
        'Fridge',
        'Garage_Door',
        'GPS_Tracker',
        'Modbus',
        'Motion_Light',
        'Thermostat',
        'Weather',
    )

    loaded = {}
    for name in sensor_names:
        csv_name = f'Processed_{name.lower()}_sensor_dataset.csv'
        csv_path = os.path.join(base_path, csv_name)

        # Guard clause: nothing to read for this sensor.
        if not os.path.exists(csv_path):
            print(f"Processed dataset not found for {name}")
            continue

        try:
            frame = pd.read_csv(csv_path)
            loaded[name] = frame
            print(f"Loaded {name} dataset with shape {frame.shape}")
        except Exception as err:
            # Best-effort load: report the failure and move on to the next sensor.
            print(f"Error loading {name} dataset: {str(err)}")

    return loaded

## Analyze Dataset Features

In [None]:
def analyze_features(datasets):
    """Summarise the schema and size of each dataset.

    Args:
        datasets: dict mapping sensor name to its DataFrame.

    Returns:
        dict mapping sensor name to a summary dict with the keys
        ``'columns'`` (list of column labels), ``'dtypes'`` (column -> dtype),
        ``'num_features'`` (column count) and ``'num_samples'`` (row count).
    """
    def _summarize(frame):
        # One summary record per DataFrame.
        column_names = list(frame.columns)
        return {
            'columns': column_names,
            'dtypes': frame.dtypes.to_dict(),
            'num_features': len(column_names),
            'num_samples': len(frame),
        }

    return {name: _summarize(frame) for name, frame in datasets.items()}

def print_feature_analysis(feature_analysis):
    """Write a human-readable report of the per-sensor summaries to stdout.

    Args:
        feature_analysis: dict mapping sensor name to a summary dict that
            provides ``'num_samples'``, ``'num_features'`` and ``'dtypes'``
            (a column -> dtype mapping).
    """
    for name in feature_analysis:
        report = feature_analysis[name]
        print(f"\n{name} Dataset:")
        print(f"Number of samples: {report['num_samples']}")
        print(f"Number of features: {report['num_features']}")
        print("Features:")
        dtype_by_column = report['dtypes']
        for column in dtype_by_column:
            print(f"  - {column}: {dtype_by_column[column]}")

## Combine Datasets

In [None]:
def combine_datasets(datasets, feature_analysis):
    """Combine all datasets on the columns they have in common.

    The common columns are the intersection of the ``'columns'`` lists in
    ``feature_analysis``. They are kept in the order in which they appear in
    the first analysed dataset, so the layout of the combined frame (and of
    any CSV written from it) is the same on every run instead of depending
    on set iteration order.

    Args:
        datasets: dict mapping sensor name to its DataFrame.
        feature_analysis: dict mapping sensor name to a summary dict that
            contains a ``'columns'`` list for that sensor.

    Returns:
        A single DataFrame holding the common columns of every dataset
        stacked row-wise with a fresh index, plus a ``'source_sensor'``
        column naming the dataset each row came from.

    Raises:
        ValueError: If ``datasets`` or ``feature_analysis`` is empty, i.e.
            there is nothing to combine.
    """
    if not datasets or not feature_analysis:
        # Fail early with a clear message instead of an opaque error later.
        raise ValueError("No datasets to combine")

    # Find common features across all datasets
    column_lists = [analysis['columns'] for analysis in feature_analysis.values()]
    shared = set(column_lists[0]).intersection(*column_lists[1:])
    # dict.fromkeys de-duplicates while preserving the first dataset's order.
    common_features = [col for col in dict.fromkeys(column_lists[0]) if col in shared]

    print(f"Found {len(common_features)} common features across all datasets:")
    print(common_features)

    # Combine datasets using common features
    combined_data = []
    for sensor, df in datasets.items():
        # Copy so that adding the identifier never touches the caller's frame.
        df_subset = df[common_features].copy()
        # Add source identifier
        df_subset['source_sensor'] = sensor
        combined_data.append(df_subset)

    return pd.concat(combined_data, axis=0, ignore_index=True)

## Generate Combined Dataset Analysis

In [None]:
def analyze_combined_dataset(combined_df):
    """Print summary statistics and save a correlation heatmap.

    Prints the total number of rows and columns and the number of rows per
    ``'source_sensor'`` value. If the frame has at least one numeric column,
    a correlation heatmap of the numeric columns is written to
    ``Correlation_heatmap_combined.png`` in the current working directory.
    If there are no numeric columns the heatmap is skipped, because a
    heatmap of an empty correlation matrix cannot be drawn.

    Args:
        combined_df: DataFrame that contains a ``'source_sensor'`` column.

    Returns:
        The same ``combined_df`` object that was passed in, unmodified.
    """
    print("\nCombined Dataset Analysis:")
    print(f"Total samples: {len(combined_df)}")
    print(f"Total features: {len(combined_df.columns)}")
    print("\nSamples per sensor:")
    print(combined_df['source_sensor'].value_counts())

    numeric_df = combined_df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] == 0:
        # Nothing to correlate; plotting an empty matrix would raise.
        print("\nNo numeric features found; skipping correlation heatmap.")
        return combined_df

    # Generate correlation heatmap
    fig = plt.figure(figsize=(12, 8))
    try:
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
        plt.title('Correlation Heatmap - Combined Dataset')
        plt.savefig('Correlation_heatmap_combined.png', dpi=600, bbox_inches='tight')
    finally:
        # Always release this figure, even if plotting or saving fails.
        plt.close(fig)

    return combined_df

## Run the Complete Pipeline

In [None]:
# End-to-end pipeline: load -> inspect -> combine -> analyse -> save.
# Placeholder path: set this to the directory that contains the
# Processed_<sensor>_sensor_dataset.csv files before running the cell.
base_path = 'path/to/processed/data'

# Load all processed datasets (sensors whose file is missing or unreadable
# are reported and left out of the returned dict)
datasets = load_processed_datasets(base_path)

# Analyze features: collect per-sensor columns, dtypes and sizes, then print them
feature_analysis = analyze_features(datasets)
print_feature_analysis(feature_analysis)

# Combine datasets on their common columns; each row is tagged with 'source_sensor'
combined_df = combine_datasets(datasets, feature_analysis)

# Analyze combined dataset (prints summary statistics and saves the
# correlation heatmap; the DataFrame itself is returned as-is)
combined_df = analyze_combined_dataset(combined_df)

# Save combined dataset to the current working directory, without the index column
combined_df.to_csv('combined_sensor_dataset.csv', index=False)
print("\nCombined dataset saved to 'combined_sensor_dataset.csv'")