# Data Preprocessing for Predictive Maintenance

This notebook handles data preprocessing for predictive maintenance tasks, including:
- Loading CSV/zip files
- Data resampling
- Outlier handling
- Feature engineering
- Windowing for time series
- Normalization

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
from scipy import stats
import warnings
# Suppress every warning category for the rest of the session. This keeps
# notebook output short but also hides deprecation notices; comment it out
# when debugging unexpected results.
warnings.filterwarnings('ignore')

# Set the global plotting style and colour palette used by later figures
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Data Loading Functions

In [None]:
def load_csv_file(file_path):
    """Read one CSV file into a DataFrame.

    A one-line summary (row and column counts) is printed on success. Any
    exception raised while reading is caught and reported with ``print``
    instead of being propagated.

    Args:
        file_path: Location of the CSV file, handed directly to
            ``pd.read_csv``.

    Returns:
        The loaded ``DataFrame``, or ``None`` when the file could not be read.
    """
    try:
        frame = pd.read_csv(file_path)
        n_rows, n_cols = frame.shape
        print(f"Loaded {file_path}: {n_rows} rows, {n_cols} columns")
    except Exception as err:
        print(f"Error loading {file_path}: {err}")
        return None
    return frame

def load_zip_file(zip_path, extract_to='temp_extracted'):
    """Extract a zip archive and load the CSV files it contains.

    The whole archive is extracted into ``extract_to``. Only members of
    *this* archive are then loaded, so CSV files left in ``extract_to`` by
    an earlier run are never mixed into the result. CSV members inside
    sub-folders are loaded as well, and the ``.csv`` suffix is matched
    case-insensitively.

    Args:
        zip_path: Path of the zip archive.
        extract_to: Directory the archive is extracted into.

    Returns:
        Dict mapping each CSV member name (as stored in the archive, e.g.
        ``'a.csv'`` or ``'folder/a.csv'``) to its ``DataFrame``. Members that
        fail to load are skipped. Returns ``{}`` if the archive itself cannot
        be processed.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
            # Use the archive's own listing rather than the directory
            # contents: the directory may hold files from previous runs.
            csv_members = [
                name for name in zip_ref.namelist()
                if not name.endswith('/') and name.lower().endswith('.csv')
            ]

        dataframes = {}

        for member in csv_members:
            file_path = os.path.join(extract_to, member)
            df = load_csv_file(file_path)
            if df is not None:
                dataframes[member] = df

        return dataframes
    except Exception as e:
        # Best-effort loader: report the problem and hand back an empty result.
        print(f"Error processing zip file: {e}")
        return {}

def load_data(file_path):
    """Load a CSV file or a zip archive of CSV files, chosen by extension.

    Args:
        file_path: Path ending in ``.csv`` or ``.zip`` (case-sensitive).

    Returns:
        For a CSV path: ``{'data': <result of load_csv_file>}``; the value is
        ``None`` when the file could not be read. For a zip path: whatever
        ``load_zip_file`` returns. For any other extension a message is
        printed and ``{}`` is returned.
    """
    if file_path.endswith('.zip'):
        return load_zip_file(file_path)

    if file_path.endswith('.csv'):
        single = load_csv_file(file_path)
        return {'data': single}

    print("Unsupported file format. Please use CSV or ZIP files.")
    return {}

## 2. Data Resampling

In [None]:
def resample_data(df, time_col='time', freq='1H', method='mean'):
    """Resample a time series to a new frequency.

    The input frame is left untouched: the timestamp conversion is done on a
    copy, so the caller's ``time_col`` keeps its original dtype.

    Args:
        df: Input ``DataFrame`` containing a timestamp column.
        time_col: Name of the timestamp column; parsed with
            ``pd.to_datetime`` and used as the resampling index.
        freq: Target frequency string passed to ``DataFrame.resample``.
        method: Aggregation applied to each bin: ``'mean'``, ``'sum'``,
            ``'max'`` or ``'min'``. Any other value falls back to ``'mean'``.

    Returns:
        The resampled ``DataFrame`` with ``time_col`` restored as a regular
        column. If ``time_col`` is missing, a message is printed and ``df``
        itself is returned unchanged.
    """
    if time_col not in df.columns:
        print(f"Time column '{time_col}' not found. Available columns: {list(df.columns)}")
        return df

    # Copy first so the datetime conversion does not overwrite the caller's column.
    indexed = df.copy()
    indexed[time_col] = pd.to_datetime(indexed[time_col])
    indexed = indexed.set_index(time_col)

    # Unknown aggregation names deliberately fall back to the mean.
    aggregation = method if method in ('mean', 'sum', 'max', 'min') else 'mean'
    resampled = getattr(indexed.resample(freq), aggregation)()

    print(f"Resampled data from {len(indexed)} to {len(resampled)} rows")
    return resampled.reset_index()

## 3. Outlier Handling

In [None]:
def detect_outliers_zscore(df, threshold=3, columns=None):
    """Flag rows that contain a z-score outlier in any selected column.

    Z-scores are computed per column with missing values ignored, and the
    result stays aligned with the rows of ``df``. A missing value is never
    flagged as an outlier.

    Args:
        df: Input ``DataFrame``.
        threshold: A value is an outlier when ``|z| > threshold``.
        columns: Columns to inspect. Defaults to all numeric columns; names
            not present in ``df`` are skipped.

    Returns:
        Boolean ``numpy`` array of length ``len(df)``; ``True`` marks a row
        with at least one outlying value.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    outlier_mask = np.zeros(len(df), dtype=bool)

    for col in columns:
        if col not in df.columns:
            continue

        values = df[col].to_numpy(dtype=float, na_value=np.nan)
        if np.isnan(values).all():
            # Empty or all-missing column: nothing to score.
            continue

        # nan_policy='omit' keeps the output the same length as the input
        # (NaN stays NaN), unlike dropping NaNs first, which misaligns rows.
        with np.errstate(invalid='ignore', divide='ignore'):
            z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
            # NaN > threshold is False, so missing values are not flagged.
            outlier_mask |= np.asarray(z_scores > threshold, dtype=bool)

    return outlier_mask

def detect_outliers_iqr(df, columns=None):
    """Flag rows holding a value outside the 1.5 * IQR fences of any column.

    Args:
        df: Input ``DataFrame``.
        columns: Columns to inspect. Defaults to all numeric columns; names
            not present in ``df`` are skipped.

    Returns:
        Boolean ``numpy`` array of length ``len(df)``; ``True`` marks a row
        with at least one value below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR``.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    flagged = np.zeros(len(df), dtype=bool)

    for name in columns:
        if name not in df.columns:
            continue

        series = df[name]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        spread = q3 - q1
        low_fence = q1 - 1.5 * spread
        high_fence = q3 + 1.5 * spread

        below = series < low_fence
        above = series > high_fence
        flagged |= (below | above)

    return flagged

def handle_outliers(df, method='clip', outlier_method='iqr', columns=None):
    """Remove, clip, or median-impute outliers.

    Args:
        df: Input ``DataFrame``; it is never modified.
        method: How outliers are treated:
            ``'remove'`` drops every row flagged in any selected column;
            ``'clip'`` limits each column to its own bounds;
            ``'median'`` replaces each outlying value with the median of its
            own column (other columns of the same row are left alone).
        outlier_method: ``'zscore'`` or anything else for IQR. For
            ``'clip'`` the bounds are mean +/- 3 standard deviations
            (population std, matching the default threshold of
            ``detect_outliers_zscore``) or the 1.5 * IQR fences.
        columns: Columns to process. Defaults to all numeric columns; names
            not present in ``df`` are skipped.

    Returns:
        A new, cleaned ``DataFrame``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    valid_methods = ('remove', 'clip', 'median')
    if method not in valid_methods:
        raise ValueError(
            f"Unknown outlier handling method '{method}'. "
            f"Expected one of {valid_methods}."
        )

    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    def detect(cols):
        # Looked up lazily so 'clip', which needs no mask, never runs detection.
        if outlier_method == 'zscore':
            return detect_outliers_zscore(df, columns=cols)
        return detect_outliers_iqr(df, columns=cols)

    if method == 'remove':
        outlier_mask = detect(columns)
        df_clean = df[~outlier_mask].copy()
        print(f"Removed {outlier_mask.sum()} outliers")
        return df_clean

    df_clean = df.copy()
    present = [col for col in columns if col in df.columns]

    if method == 'clip':
        for col in present:
            series = df[col]
            if outlier_method == 'zscore':
                center = series.mean()
                spread = 3 * series.std(ddof=0)
                lower_bound = center - spread
                upper_bound = center + spread
            else:
                Q1 = series.quantile(0.25)
                Q3 = series.quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
            df_clean[col] = series.clip(lower=lower_bound, upper=upper_bound)
        print(f"Clipped outliers in {len(columns)} columns")
    else:  # method == 'median'
        for col in present:
            # Per-column mask: only the offending cells are replaced, not
            # every column of a row that is an outlier somewhere else.
            col_mask = detect([col])
            df_clean[col] = df[col].mask(col_mask, df[col].median())
        print(f"Imputed outliers with median in {len(columns)} columns")

    return df_clean

## 4. Feature Engineering

In [None]:
def create_rolling_features(df, columns=None, windows=None):
    """Add rolling mean/std/min/max columns for each selected column.

    For every column ``c`` and window ``w`` four columns are appended:
    ``c_rolling_mean_w``, ``c_rolling_std_w``, ``c_rolling_min_w`` and
    ``c_rolling_max_w``. The first ``w - 1`` rows of each new column are NaN
    because the window is not yet full.

    Args:
        df: Input ``DataFrame``; it is never modified.
        columns: Columns to derive features from. Defaults to all numeric
            columns; names not present in ``df`` are skipped.
        windows: Window sizes in rows. Defaults to ``[5, 10, 20]``.

    Returns:
        A copy of ``df`` with the rolling feature columns appended.
    """
    # None sentinel instead of a mutable list default shared across calls.
    if windows is None:
        windows = [5, 10, 20]

    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    df_featured = df.copy()

    for col in columns:
        if col not in df.columns:
            continue
        for window in windows:
            # Build the rolling view once and reuse it for all four statistics.
            roller = df[col].rolling(window=window)
            df_featured[f'{col}_rolling_mean_{window}'] = roller.mean()
            df_featured[f'{col}_rolling_std_{window}'] = roller.std()
            df_featured[f'{col}_rolling_min_{window}'] = roller.min()
            df_featured[f'{col}_rolling_max_{window}'] = roller.max()

    print(f"Created rolling features for {len(columns)} columns with windows {windows}")
    return df_featured

def create_lag_features(df, columns=None, lags=None):
    """Add lagged copies of each selected column.

    For every column ``c`` and lag ``k`` a column ``c_lag_k`` is appended
    holding the value of ``c`` from ``k`` rows earlier; the first ``k`` rows
    of that column are NaN.

    Args:
        df: Input ``DataFrame``; it is never modified.
        columns: Columns to lag. Defaults to all numeric columns; names not
            present in ``df`` are skipped.
        lags: Row offsets to shift by. Defaults to ``[1, 2, 3]``.

    Returns:
        A copy of ``df`` with the lag columns appended.
    """
    # None sentinel instead of a mutable list default shared across calls.
    if lags is None:
        lags = [1, 2, 3]

    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    df_featured = df.copy()

    for col in columns:
        if col not in df.columns:
            continue
        source = df[col]
        for lag in lags:
            df_featured[f'{col}_lag_{lag}'] = source.shift(lag)

    print(f"Created lag features for {len(columns)} columns with lags {lags}")
    return df_featured

def create_difference_features(df, columns=None):
    """Add first-difference and percentage-change columns.

    For every selected column ``c`` two columns are appended: ``c_diff``
    (``Series.diff``) and ``c_pct_change`` (``Series.pct_change``). The first
    row of each new column is NaN.

    Args:
        df: Input ``DataFrame``; it is never modified.
        columns: Columns to process. Defaults to all numeric columns; names
            not present in ``df`` are skipped.

    Returns:
        A copy of ``df`` with the difference columns appended.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    result = df.copy()

    for name in columns:
        if name not in df.columns:
            continue
        result[f'{name}_diff'] = df[name].diff()
        result[f'{name}_pct_change'] = df[name].pct_change()

    print(f"Created difference features for {len(columns)} columns")
    return result

## 5. Windowing for Time Series

In [None]:
def create_sliding_windows(df, window_size=30, step_size=1, target_col=None):
    """Cut a frame into fixed-length sliding windows.

    Windows start at rows ``0, step_size, 2 * step_size, ...`` and are only
    emitted while at least one row follows the window, so the number of
    windows is the same with or without a target. When ``target_col`` is
    given, the target of a window is that column's value in the row
    immediately after it.

    Args:
        df: Input ``DataFrame``; every column (including ``target_col``)
            is part of each window.
        window_size: Number of rows per window.
        step_size: Row offset between consecutive window starts.
        target_col: Optional column whose next-step value is the target.

    Returns:
        ``windows`` with shape ``(n_windows, window_size, n_columns)`` — also
        when ``n_windows`` is 0 — or the tuple ``(windows, targets)`` when
        ``target_col`` is given, with ``targets`` of length ``n_windows``.
    """
    # Convert once instead of materialising a new array per window.
    values = df.values
    starts = range(0, len(df) - window_size, step_size)

    if len(starts) > 0:
        windows = np.array([values[s:s + window_size] for s in starts])
    else:
        # Keep the result 3-D so downstream shape handling still works.
        windows = np.empty((0, window_size, values.shape[1]), dtype=values.dtype)

    if target_col:
        if len(starts) > 0:
            target_idx = df.columns.get_loc(target_col)
            targets = np.array([values[s + window_size, target_idx] for s in starts])
        else:
            targets = np.array([])
        print(f"Created {len(windows)} windows with targets")
        return windows, targets
    else:
        print(f"Created {len(windows)} windows")
        return windows

def create_sequences_for_lstm(df, sequence_length=50, target_col=None):
    """Build overlapping fixed-length sequences with a stride of one row.

    A sequence is emitted for every start row that still has one row after
    the sequence, i.e. ``len(df) - sequence_length`` sequences in total. When
    ``target_col`` is given, the target of a sequence is that column's value
    in the row immediately after it.

    Args:
        df: Input ``DataFrame``; every column (including ``target_col``)
            is part of each sequence.
        sequence_length: Number of rows per sequence.
        target_col: Optional column whose next-step value is the target.

    Returns:
        ``sequences`` with shape ``(n_sequences, sequence_length, n_columns)``
        — also when ``n_sequences`` is 0 — or the tuple
        ``(sequences, targets)`` when ``target_col`` is given.
    """
    # Convert once instead of materialising a new array per sequence.
    values = df.values
    starts = range(len(df) - sequence_length)

    if len(starts) > 0:
        sequences = np.array([values[s:s + sequence_length] for s in starts])
    else:
        # Keep the result 3-D so downstream shape handling still works.
        sequences = np.empty((0, sequence_length, values.shape[1]), dtype=values.dtype)

    if target_col:
        if len(starts) > 0:
            target_idx = df.columns.get_loc(target_col)
            targets = np.array([values[s + sequence_length, target_idx] for s in starts])
        else:
            targets = np.array([])
        print(f"Created {len(sequences)} LSTM sequences with targets")
        return sequences, targets
    else:
        print(f"Created {len(sequences)} LSTM sequences")
        return sequences

## 6. Normalization

In [None]:
def normalize_data(df, method='standard', columns=None):
    """Scale the selected columns and return the fitted scaler.

    Args:
        df: Input ``DataFrame``; it is never modified.
        method: ``'standard'`` for ``StandardScaler`` or ``'minmax'`` for
            ``MinMaxScaler``. Any other value prints a notice and uses
            ``StandardScaler``.
        columns: Columns to scale. Defaults to all numeric columns.

    Returns:
        Tuple ``(df_normalized, scaler)``: a copy of ``df`` with the selected
        columns replaced by their scaled values, and the scaler fitted on
        those columns.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    if method == 'minmax':
        scaler = MinMaxScaler()
    else:
        if method != 'standard':
            print("Unknown normalization method. Using standard scaling.")
        scaler = StandardScaler()

    scaled_values = scaler.fit_transform(df[columns])

    df_normalized = df.copy()
    df_normalized[columns] = scaled_values

    print(f"Normalized {len(columns)} columns using {method} scaling")
    return df_normalized, scaler

## 7. Complete Preprocessing Pipeline

In [None]:
def preprocess_data_pipeline(file_path, config=None):
    """Run the full preprocessing pipeline on a CSV file or zip archive.

    Steps, each executed only if its key is present in ``config``:
    resampling (``'resample'``), outlier handling (``'outliers'``), feature
    engineering (``'features'``: rolling and lag features when configured,
    difference features always), and normalization (``'normalize'``). Rows
    containing NaN are dropped at the end.

    Args:
        file_path: Path of a ``.csv`` or ``.zip`` file, passed to
            ``load_data``.
        config: Dict of per-step keyword arguments. ``'resample'``,
            ``'outliers'`` and ``'normalize'`` are unpacked into
            ``resample_data``, ``handle_outliers`` and ``normalize_data``;
            ``'features'`` may hold ``'rolling_windows'`` and ``'lags'``.
            When ``None``, a default configuration enabling every step is
            used.

    Returns:
        Dict mapping each loaded dataset name to its processed
        ``DataFrame``. Datasets that failed to load are omitted. The fitted
        scalers are not returned.
    """
    if config is None:
        config = {
            'resample': {'freq': '1H', 'method': 'mean'},
            'outliers': {'method': 'clip', 'outlier_method': 'iqr'},
            'features': {'rolling_windows': [5, 10], 'lags': [1, 2]},
            'normalize': {'method': 'standard'}
        }

    # Load data
    data_dict = load_data(file_path)

    processed_data = {}

    for name, df in data_dict.items():
        if df is None:
            continue

        print(f"\nProcessing {name}...")

        # Resample
        if 'resample' in config:
            df = resample_data(df, **config['resample'])

        # Handle outliers
        if 'outliers' in config:
            df = handle_outliers(df, **config['outliers'])

        # Feature engineering
        if 'features' in config:
            feature_cfg = config['features']
            if 'rolling_windows' in feature_cfg:
                df = create_rolling_features(df, windows=feature_cfg['rolling_windows'])
            if 'lags' in feature_cfg:
                df = create_lag_features(df, lags=feature_cfg['lags'])
            df = create_difference_features(df)

        # Percentage change divides by the previous value, so a zero there
        # yields +/-inf. Infinite values are not removed by dropna() and are
        # rejected by the scikit-learn scalers, so turn them into NaN and let
        # the final dropna() discard those rows.
        df = df.replace([np.inf, -np.inf], np.nan)

        # Normalize (the fitted scaler is not part of this function's result)
        if 'normalize' in config:
            df, _ = normalize_data(df, **config['normalize'])

        # Drop NaN values created by feature engineering
        df = df.dropna()

        processed_data[name] = df
        print(f"Final shape: {df.shape}")

    return processed_data

## 8. Example Usage

In [None]:
# Example usage (uncomment and modify for your data).
# A single CSV is returned under the key 'data'; for a zip archive the keys
# are the names of the CSV files it contains.
# file_path = 'path/to/your/data.csv'  # or .zip
# processed_data = preprocess_data_pipeline(file_path)
#
# # For windowing (if needed for model training).
# # create_sliding_windows returns (windows, targets) only when target_col is
# # given; without it, just the windows array is returned.
# if 'data' in processed_data:
#     df = processed_data['data']
#     windows, targets = create_sliding_windows(df, window_size=30, target_col='target_column')
#     print(f"Windows shape: {windows.shape}, Targets shape: {targets.shape}")

print("Data preprocessing functions defined. Ready to use!")