In [None]:
import os

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from scipy.stats import skew, kurtosis


Importación de los datos

In [None]:
def load_data(data_dir='data'):
    """Load the train and test tables from ``data_dir``.

    ``train.csv`` and ``test.csv`` are required. When ``train.parquet`` /
    ``test.parquet`` also exist and are non-empty, each one is left-joined
    onto the matching CSV table on the ``Subject_ID`` column.

    Args:
        data_dir: Directory that contains the data files. Defaults to
            ``'data'`` (relative to the current working directory).

    Returns:
        tuple: ``(train, test, {})`` - the two DataFrames followed by an
        empty dict (the third element is always ``{}``).

    Raises:
        FileNotFoundError: If either CSV file is missing. The message names
            the directory that was searched and the missing path(s).
        Exception: Any other error raised while reading or merging is
            printed and then re-raised unchanged.
    """
    try:
        csv_paths = {
            split: os.path.join(data_dir, f'{split}.csv')
            for split in ('train', 'test')
        }

        # Both CSV files are mandatory; report exactly which ones are absent
        # and where we looked (data_dir is not necessarily 'data').
        missing = [path for path in csv_paths.values() if not os.path.exists(path)]
        if missing:
            raise FileNotFoundError(
                f"Archivos CSV no encontrados en la carpeta '{data_dir}': "
                f"{', '.join(missing)}"
            )

        frames = {}
        for split, csv_path in csv_paths.items():
            frame = pd.read_csv(csv_path)

            # The Parquet companion file is optional; an absent or empty one
            # leaves the CSV table untouched.
            parquet_path = os.path.join(data_dir, f'{split}.parquet')
            if os.path.exists(parquet_path):
                extra = pq.read_table(parquet_path).to_pandas()
                if not extra.empty:
                    frame = pd.merge(frame, extra, on='Subject_ID', how='left')

            frames[split] = frame

        return frames['train'], frames['test'], {}

    except Exception as e:
        # Surface the problem in the notebook output, then let it propagate.
        print(f"Error cargando datos: {str(e)}")
        raise

Hacer feature engineering

In [None]:
def summarize_actigraphy(df, subject_id_col='Subject_ID'):
    """
    Summarize every numeric column of ``df`` into one row per subject.

    Each numeric column other than ``subject_id_col`` and ``'timestamp'``
    yields the following per-subject features, named ``<column>_<feature>``:

    - ``mean``, ``std``, ``min``, ``max``, ``median``: pandas group
      reductions (``std`` is the sample standard deviation, ddof=1)
    - ``skew``, ``kurtosis``: ``scipy.stats`` with their default arguments
    - ``q1``, ``q3``, ``iqr``: 25th / 75th percentile and their difference
    - ``mad``: median absolute deviation from the median
    - ``dominant_freq``: index of the largest FFT magnitude among the
      bins strictly between DC and the midpoint of the spectrum; 0 when the
      group has fewer than 4 samples (no such bin exists)

    The NumPy/SciPy based features (skew, kurtosis, q1, q3, iqr, mad,
    dominant_freq) are computed on the raw group values; NaN values are not
    dropped first.

    Args:
        df: Long-format DataFrame with one row per observation.
        subject_id_col: Name of the column identifying the subject.

    Returns:
        DataFrame with ``subject_id_col`` as the first column followed by all
        time-domain features (grouped by source column) and then the
        ``*_dominant_freq`` features. If ``df`` has no eligible numeric
        column, only the ``subject_id_col`` column is returned.
    """
    excluded = {subject_id_col, 'timestamp'}
    num_cols = [col for col in df.columns
                if col not in excluded
                and pd.api.types.is_numeric_dtype(df[col])]

    grouped = df.groupby(subject_id_col)

    # Nothing to aggregate: still return one row per subject.
    if not num_cols:
        return grouped.size().index.to_frame(index=False)

    # Named (not lambda) helpers so every aggregation has a unique name.
    def q1(x):
        return np.percentile(x, 25)

    def q3(x):
        return np.percentile(x, 75)

    def iqr(x):
        return np.percentile(x, 75) - np.percentile(x, 25)

    def mad(x):
        return np.median(np.abs(x - np.median(x)))

    def dominant_freq(x):
        # With fewer than 4 samples the slice [1:len//2] is empty and
        # np.argmax would raise, so report "no dominant frequency".
        if len(x) < 4:
            return 0
        spectrum = np.abs(np.fft.fft(x))
        # Skip bin 0 (DC); +1 converts the slice offset back to a bin index.
        return np.argmax(spectrum[1:len(spectrum) // 2]) + 1

    # (feature suffix, aggregation). String aliases select the pandas
    # built-in reductions.
    stats = [
        ('mean', 'mean'),
        ('std', 'std'),
        ('min', 'min'),
        ('max', 'max'),
        ('median', 'median'),
        ('skew', skew),
        ('kurtosis', kurtosis),
        ('q1', q1),
        ('q3', q3),
        ('iqr', iqr),
        ('mad', mad),
    ]

    # A plain dict passed to agg() is read as {column: func}, so the
    # statistics are expressed as named aggregations instead:
    # output_name=(source_column, func).
    named = {
        f'{col}_{stat}': (col, func)
        for col in num_cols
        for stat, func in stats
    }
    # Frequency features come after all time-domain features.
    named.update({
        f'{col}_dominant_freq': (col, dominant_freq)
        for col in num_cols
    })

    return grouped.agg(**named).reset_index()

In [None]:
def extract_time_features(df, subject_col='Subject_ID'):
    """Extract five timing/activity features per subject.

    Output columns, in order:

    - ``subject_col``: the subject identifier
    - ``total_events``: number of rows for the subject
    - ``active_hours``: number of events that follow the previous event
      (in chronological order) by less than 3600 seconds
    - ``night_activity``: mean of ``value`` for events whose hour of day is
      between 0 and 6 inclusive (NaN when there are none)
    - ``max_activity``: maximum of ``value``
    - ``std_activity``: pandas standard deviation of ``value``

    Args:
        df: DataFrame with ``subject_col``, a datetime-typed ``timestamp``
            column (the ``.dt`` accessor is used on it) and a ``value`` column.
        subject_col: Name of the column identifying the subject; also used
            as the name of the identifier column in the result.

    Returns:
        DataFrame with one row per subject. When ``df`` has no
        ``timestamp`` column or no rows, an empty DataFrame carrying the
        output columns is returned.

    Raises:
        KeyError: If ``subject_col`` or ``value`` is missing from ``df``.
        AttributeError: If ``timestamp`` is not datetime-like.
    """
    columns = [
        subject_col,
        'total_events',
        'active_hours',
        'night_activity',
        'max_activity',
        'std_activity',
    ]

    # Without timestamps no feature can be computed; keep the schema so the
    # result can still be merged downstream.
    if 'timestamp' not in df.columns:
        return pd.DataFrame(columns=columns)

    features = []
    for subject_id, group in df.groupby(subject_col):
        # Gaps between consecutive events are only meaningful in time order.
        group = group.sort_values('timestamp')
        time_diff = group['timestamp'].diff().dt.total_seconds()
        is_night = group['timestamp'].dt.hour.between(0, 6)

        features.append({
            subject_col: subject_id,
            'total_events': len(group),
            'active_hours': (time_diff < 3600).sum(),
            'night_activity': group.loc[is_night, 'value'].mean(),
            'max_activity': group['value'].max(),
            'std_activity': group['value'].std(),
        })

    return pd.DataFrame(features, columns=columns)

Complete the submission

In [None]:
import os
import pandas as pd

'''
Utility functions for competition submission
'''

def save_submission(test, preds, output_dir):
    '''
    Write predictions to ``<output_dir>/submission.csv`` with columns
    ``id`` and ``sii``.

    Ids and predictions are paired by position (row i of ``test`` with
    element i of ``preds``), regardless of any pandas index either carries.

    Args:
        test: Test DataFrame; its ``id`` column supplies the identifiers.
        preds: Model predictions, one per row of ``test``.
        output_dir: Directory to save the submission file in (created if it
            does not exist).

    Raises:
        KeyError: If ``test`` has no ``id`` column.
        ValueError: If ``preds`` is array-like with a length different from
            the number of rows in ``test``.
    '''
    # Drop pandas indexes before building the frame: the DataFrame
    # constructor aligns Series on their index, which would insert NaN rows
    # when ``preds`` and ``test`` are indexed differently.
    if isinstance(preds, (pd.Series, pd.Index)):
        preds = preds.to_numpy()

    submission = pd.DataFrame({
        'id': test['id'].to_numpy(),
        'sii': preds
    })
    os.makedirs(output_dir, exist_ok=True)
    submission_path = os.path.join(output_dir, 'submission.csv')
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")