In [34]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import pyarrow.parquet as pq
from sklearn.preprocessing import OrdinalEncoder
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import os
from pathlib import Path
import sys
from dotenv import load_dotenv
from openai import OpenAI

In [55]:
# Define robust directory paths within the notebook environment.
BASE_DIR = Path().resolve()
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / "outputs"

train_path = DATA_DIR / 'train.csv'
test_path = DATA_DIR / 'test.csv'

Importacion de los datos

In [56]:
'# Load environment variables from .env file'
def load_data(data_dir='data'):
    try:
        # Define robust paths for CSV and Parquet files
        train_csv_path = os.path.join(data_dir, 'train.csv')
        test_csv_path = os.path.join(data_dir, 'test.csv')
        train_parquet_path = os.path.join(data_dir, 'train.parquet')
        test_parquet_path = os.path.join(data_dir, 'test.parquet')
        
        if not all(os.path.exists(f) for f in [train_csv_path, test_csv_path]):
            raise FileNotFoundError("CSV files not found in the ‘data’ folder.")
        
        train_csv = pd.read_csv(train_csv_path)
        test_csv = pd.read_csv(test_csv_path)
        
        train_parquet = pd.DataFrame()
        test_parquet = pd.DataFrame()
        if os.path.exists(train_parquet_path):
            train_parquet = pq.read_table(train_parquet_path).to_pandas()
        if os.path.exists(test_parquet_path):
            test_parquet = pq.read_table(test_parquet_path).to_pandas()
        
        # Merge CSV and Parquet data on 'Subject_ID'
        train = pd.merge(train_csv, train_parquet, on='Subject_ID', how='left') if not train_parquet.empty else train_csv
        test = pd.merge(test_csv, test_parquet, on='Subject_ID', how='left') if not test_parquet.empty else test_csv
        
        return train, test, {}
    
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        raise


LLM

In [63]:
load_dotenv() # Load environment variables from .env file

api_key = os.getenv("api1")

In [64]:
client = OpenAI(
    api_key=os.getenv("api1"),
    base_url="https://openrouter.ai/api/v1"
)# Initialize OpenAI client with API key

In [65]:
'# Function to query DeepSeek model with a prompt'
def query_deepseek(prompt: str, model: str = "deepseek/deepseek-r1:free") -> str:
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role":"user", "content":prompt}],
        temperature=0.6,
        max_tokens=4096,
    )
    return resp.choices[0].message.content.strip()

In [66]:
# Load the notebook loader from langchain_community
from langchain_community.document_loaders import NotebookLoader

loader = NotebookLoader("problematic_internet_use_llm.ipynb", include_outputs=False, remove_newline=True)
docs = loader.load()
print(f"Cells found: {len(docs)}")

Cells found: 1


Prompts que hay que realizar de manera general

In [61]:
original_code = """
# Load the active notebook (you can see its name in Kaggle)
from langchain_community.document_loaders import NotebookLoader

loader = NotebookLoader("prueba-llm.ipynb", include_outputs=False, remove_newline=True)
docs = loader.load()
print(f"Cells found: {len(docs)}")
"""

prompt = f"""Rewrite the following code with style and efficiency improvements, without changing its functionality:\n{original_code},
also, everything that is not related to the code should be commented, including the changes made"""

response = query_deepseek(prompt)
print(response)

```python
# Import the NotebookLoader from langchain_community.document_loaders
from langchain_community.document_loaders import NotebookLoader

# Initialize the NotebookLoader with the specified notebook file ('prueba-llm.ipynb')
# - include_outputs=False: Exclude notebook cell outputs from loading
# - remove_newline=True: Remove newline characters for cleaner text processing
loader = NotebookLoader(
    file_path="prueba-llm.ipynb",
    include_outputs=False,
    remove_newline=True
)

# Load the notebook content into documents
docs = loader.load()

# Print the number of cells found in the notebook
print(f"Cells found: {len(docs)}")

# Changes made:
# 1. Added clear comments explaining each section of the code
# 2. Formatted the NotebookLoader initialization with line breaks for better readability
# 3. Used the parameter name 'file_path' explicitly for clarity (though NotebookLoader accepts positional args)
# 4. Maintained all original functionality while improving code style
# 5. Ke

In [48]:
'# Function to analyze a file and answer a question using DeepSeek'
def analyze_file(filepath: str, question: str):
    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    prompt = f"""I have this file named {filepath} with the following content:

[START OF FILE]
{content}
[END OF FILE]

Now, please answer the following:
{question}
"""
    response = query_deepseek(prompt)
    print(response)

In [None]:
analyze_file("data/data_dictionary.csv", "Can you review this file and recommend feature engineering steps?")

FileNotFoundError: [Errno 2] No such file or directory: 'src/data/series_test.parquet/id=001f3379/part-0.parquet'

In [67]:
prompt = f"""
Como puedo ver los datos de un .parquet o decidir que datos manejar y mejorar 
"""

response = query_deepseek(prompt)
print(response)

Para trabajar con archivos **Parquet** (un formato de almacenamiento columnar eficiente) y tomar decisiones sobre los datos, sigue estos pasos:

---

### 1. **Visualizar datos de un archivo Parquet**
#### Herramientas y métodos:

**a. Usando Python (Pandas/PyArrow):**
```python
import pandas as pd

# Leer el archivo Parquet
df = pd.read_parquet("ruta/al/archivo.parquet")

# Ver las primeras filas
print(df.head())

# Ver estadísticas descriptivas
print(df.describe())

# Ver estructura del DataFrame (columnas y tipos de datos)
print(df.info())
```

**b. Con herramientas gráficas:**
- **DuckDB**: Ejecuta consultas SQL directamente sobre Parquet.
  ```sql
  SELECT * FROM 'datos.parquet' LIMIT 10;
  ```
- **Apache Arrow (PyArrow)**: Para inspeccionar el esquema:
  ```python
  import pyarrow.parquet as pq

  tabla = pq.read_table("datos.parquet")
  print(tabla.schema)
  ```

**c. Herramientas externas:**
- **Parquet Viewer** (Windows/Mac): Interfaz gráfica para abrir Parquet.
- **VS Code Ext

Hacer feature engineering

In [None]:
import numpy as np
import pandas as pd
import warnings
from sklearn.preprocessing import (
    OneHotEncoder, 
    OrdinalEncoder, 
    StandardScaler, 
    MinMaxScaler,
    KBinsDiscretizer
)
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore')

def preprocess_data(df):
    """
    Comprehensive preprocessing pipeline implementing all feature engineering recommendations
    """
    
    df_processed = df.copy()

    ## 0. Temporal Sorting (antes de eliminar columnas como 'Season')
    season_backup = None
    if 'id' in df_processed.columns and 'Season' in df_processed.columns:
        season_backup = df_processed['Season'].copy()
        df_processed = df_processed.sort_values(['id', 'Season'])

    ## 1. Categorical Variable Encoding
    binary_cols = ['Sex', 'FGC_CU_Zone']  # Add other binary columns
    for col in binary_cols:
        if col in df_processed.columns:
            df_processed[col] = df_processed[col].astype(int)

    # One-hot encoding for nominal categories
    nominal_cols = ['Season']  # Add other nominal columns
    ohe = OneHotEncoder(drop='first', sparse_output=False)
    for col in nominal_cols:
        if col in df_processed.columns:
            encoded = ohe.fit_transform(df_processed[[col]])
            encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out([col]), index=df_processed.index)
            df_processed = pd.concat([df_processed.drop(col, axis=1), encoded_df], axis=1)

    # Ordinal encoding for ordered categories
    ordinal_mappings = {
        'BIA_Activity_Level_num': [1, 2, 3, 4, 5],  # 1=Very Light to 5=Exceptional
        'computerinternet_hoursday': [0, 1, 2, 3]    # 0=Less than 1h/day to 3=More than 3hs/day
    }
    ordinal_encoder = OrdinalEncoder(categories=[ordinal_mappings[col] for col in ordinal_mappings.keys()])
    for col in ordinal_mappings:
        if col in df_processed.columns:
            df_processed[col] = ordinal_encoder.fit_transform(df_processed[[col]])

    ## 2. Feature Transformation
    if season_backup is not None:
        seasons = {'Spring': 0, 'Summer': 1, 'Fall': 2, 'Winter': 3}
        df_processed['Season_sin'] = season_backup.map(seasons).apply(lambda x: np.sin(x * (2*np.pi/4)))
        df_processed['Season_cos'] = season_backup.map(seasons).apply(lambda x: np.cos(x * (2*np.pi/4)))

    skewed_cols = ['BIA_BMR', 'BIA_DEE', 'PAQ_A_Total']
    for col in skewed_cols:
        if col in df_processed.columns:
            df_processed[f'log_{col}'] = np.log1p(df_processed[col])

    if 'Age' in df_processed.columns:
        binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
        df_processed['Age_bin'] = binner.fit_transform(df_processed[['Age']])

    ## 3. Composite Features
    if all(col in df_processed.columns for col in ['Fitness_Endurance-Time_Mins', 'Time_Sec']):
        df_processed['Total_Time_Seconds'] = df_processed['Fitness_Endurance-Time_Mins'] * 60 + df_processed['Time_Sec']

    if all(col in df_processed.columns for col in ['FGC_GSD', 'FGC_GSND']):
        df_processed['Grip_Strength_Asymmetry'] = (df_processed['FGC_GSD'] - df_processed['FGC_GSND']) / \
                                                  (df_processed['FGC_GSD'] + df_processed['FGC_GSND'] + 1e-6)

    if all(col in df_processed.columns for col in ['BMI', 'BIA_Fat']):
        df_processed['BMI_Fat_Interaction'] = df_processed['BMI'] * df_processed['BIA_Fat']

    ## 4. Domain-Specific Aggregations
    fitness_cols = ['FGC_CU', 'FGC_PU', 'FGC_TL']
    if all(col in df_processed.columns for col in fitness_cols):
        df_processed['Fitness_Composite'] = df_processed[fitness_cols].mean(axis=1)

    if all(col in df_processed.columns for col in ['BIA_Fat_Mass', 'Height']):
        df_processed['FMI'] = df_processed['BIA_Fat_Mass'] / (df_processed['Height'] / 100)**2

    ## 5. Handling Missing Data
    cols_with_missing = df_processed.columns[df_processed.isnull().any()].tolist()
    for col in cols_with_missing:
        df_processed[f'Missing_{col}'] = df_processed[col].isnull().astype(int)

    numeric_cols = df_processed.select_dtypes(include=['number']).columns
    imputer = SimpleImputer(strategy='median')
    df_processed[numeric_cols] = imputer.fit_transform(df_processed[numeric_cols])

    ## 6. Normalization/Scaling
    to_standard_scale = ['Height', 'Weight']
    scaler = StandardScaler()
    for col in to_standard_scale:
        if col in df_processed.columns:
            df_processed[f'{col}_scaled'] = scaler.fit_transform(df_processed[[col]])

    to_minmax_scale = ['CGAS_Score']
    minmax_scaler = MinMaxScaler()
    for col in to_minmax_scale:
        if col in df_processed.columns:
            df_processed[f'{col}_scaled'] = minmax_scaler.fit_transform(df_processed[[col]])

    ## 7. Target-Specific Engineering
    if all(col in df_processed.columns for col in ['Systolic_BP', 'Diastolic_BP', 'HeartRate']):
        df_processed['Cardio_Risk_Score'] = (
            df_processed['Systolic_BP'] / 140 + 
            df_processed['Diastolic_BP'] / 90 + 
            df_processed['HeartRate'] / 100
        )

    ## 8. Dimensionality Reduction
    correlated_groups = [
        ['BIA_TBW', 'BIA_ECW', 'BIA_ICW'],
        ['SDS_Total_Raw', 'SDS_Total_T']
    ]
    for group in correlated_groups:
        if all(col in df_processed.columns for col in group):
            pca = PCA(n_components=1)
            pca_feature = pca.fit_transform(df_processed[group])
            group_name = '_'.join(group)
            df_processed[f'{group_name}_PCA'] = pca_feature

    ## 9. Temporal Feature Derivatives
    if 'id' in df_processed.columns:
        change_cols = ['BMI', 'Fitness_Composite']
        for col in change_cols:
            if col in df_processed.columns:
                df_processed[f'Delta_{col}'] = df_processed.groupby('id')[col].diff()

    ## 10. Textual Data
    if 'Description' in df_processed.columns:
        df_processed['Dominant_Mentioned'] = df_processed['Description'].str.contains('dominant', case=False).astype(int)

    return df_processed


In [64]:
' FeatureProcessor class for preprocessing data with text and categorical features'
class FeatureProcessor:
    def __init__(self):
        self.encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.text_cols = []
        self.cat_cols = []
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        self.embed_cols = []
        
    def preprocess(self, train, test):
        'Optimized preprocessing to avoid fragmentation'
        try:
            self._identify_safe_columns(train, test)
            
            if self.text_cols:
                train, test = self._process_text_optimized(train, test)
            
            if self.cat_cols:
                train, test = self._encode_categoricals(train, test)
            # normalization
            train, test = self._scale_numerical(train, test)
            
            return train, test
            
        except Exception as e:
            print(f"Error on preprocesing: {str(e)}")
            raise
    
    def _identify_safe_columns(self, train, test):
        """Identifica columnas existentes en ambos datasets"""
        common_cols = list(set(train.columns) & set(test.columns))
        
        self.text_cols = [
            col for col in common_cols 
            if train[col].dtype == 'object' 
            and train[col].str.contains('[a-zA-Z]', regex=True, na=False).any()
        ]
        
        self.cat_cols = [
            col for col in common_cols 
            if train[col].dtype == 'object' 
            and col not in self.text_cols
        ]
    
    def _process_text_optimized(self, train, test):
        'procesing of text without fragmentation'
        train_embeddings = []
        test_embeddings = []
        
        for col in self.text_cols:
            train_text = train[col].fillna('').astype(str)
            test_text = test[col].fillna('').astype(str)
            
            # Embeddings for train and test
            train_emb = self.model.encode(train_text.tolist(), show_progress_bar=False)
            test_emb = self.model.encode(test_text.tolist(), show_progress_bar=False)
            
            train_embeddings.append(train_emb)
            test_embeddings.append(test_emb)
        
        # concatenate all embeddings horizontally
        if train_embeddings:
            train_embeddings = np.hstack(train_embeddings)
            test_embeddings = np.hstack(test_embeddings)
            
            # Create dataframes before assigning
            n_features = train_embeddings.shape[1]
            self.embed_cols = [f'text_embed_{i}' for i in range(n_features)]
            
            train_emb_df = pd.DataFrame(train_embeddings, columns=self.embed_cols, index=train.index)
            test_emb_df = pd.DataFrame(test_embeddings, columns=self.embed_cols, index=test.index)
            
            # Concatenate embeddings
            train = pd.concat([train, train_emb_df], axis=1)
            test = pd.concat([test, test_emb_df], axis=1)
            
        return train, test
    
    def _encode_categoricals(self, train, test):
        """codification of categoricals"""
        if self.cat_cols:
            train_cats = train[self.cat_cols]
            test_cats = test[self.cat_cols]
            
            # Ensure both datasets have the same categories
            train[self.cat_cols] = self.encoder.fit_transform(train_cats)
            test[self.cat_cols] = self.encoder.transform(test_cats)
            
        return train, test
    
    def _scale_numerical(self, train, test):
        num_cols = [col for col in train.select_dtypes(include=np.number).columns 
                   if col not in ['Subject_ID', 'PCIAT-PCIAT_Total'] and col in test.columns]
        
        if num_cols:
            means = train[num_cols].mean()
            stds = train[num_cols].std() + 1e-8
            
            train[num_cols] = (train[num_cols] - means) / stds
            test[num_cols] = (test[num_cols] - means) / stds
            
        return train, test

In [None]:
' Summarize actigraphy data with enhanced statistical features'
def summarize_actigraphy(df, subject_id_col='Subject_ID'):
    """
    Enhanced actigraphy processing with:
    - Percentiles (10th, 25th, 75th, 90th)
    - Robust statistical measures (IQR, MAD)
    - Frequency domain features (FFT)
    """
    exclude_cols = [subject_id_col, 'timestamp']
    num_cols = [col for col in df.columns 
               if col not in exclude_cols 
               and pd.api.types.is_numeric_dtype(df[col])]
    
    # Time-domain features
    stats = {
        'mean': np.mean,
        'std': np.std,
        'min': np.min,
        'max': np.max,
        'median': np.median,
        'skew': skew,
        'kurtosis': kurtosis,
        'q1': lambda x: np.percentile(x, 25),
        'q3': lambda x: np.percentile(x, 75),
        'iqr': lambda x: np.percentile(x, 75) - np.percentile(x, 25),
        'mad': lambda x: np.median(np.abs(x - np.median(x)))
    }
    
    # Frequency-domain features (simplified FFT)
    def dominant_freq(x):
        if len(x) < 2: return 0
        fft = np.abs(np.fft.fft(x))
        return np.argmax(fft[1:len(fft)//2]) + 1
    
    summary = df.groupby(subject_id_col)[num_cols].agg(stats)
    summary.columns = [f'{col}_{stat}' for col, stat in summary.columns]
    
    # Add frequency features
    freq_features = df.groupby(subject_id_col)[num_cols].agg(dominant_freq)
    freq_features.columns = [f'{col}_dominant_freq' for col in freq_features.columns]
    
    return pd.concat([summary, freq_features], axis=1).reset_index()

In [66]:
def extract_time_features(df, subject_col='Subject_ID'):
    """Extracts 15 key time features per subject"""
    features = []
    for subject_id, group in df.groupby(subject_col):
        if 'timestamp' in group.columns:
            time_diff = group['timestamp'].diff().dt.total_seconds()
            feat = {
                'Subject_ID': subject_id,
                'total_events': len(group),
                'active_hours': (time_diff < 3600).sum(),
                'night_activity': group[group['timestamp'].dt.hour.between(0, 6)]['value'].mean(),
                'max_activity': group['value'].max(),
                'std_activity': group['value'].std(),
            }
            features.append(feat)
    return pd.DataFrame(features)

Machine Learning

In [67]:
' Train and evaluate the model with optimized parameters'
def train_and_evaluate(X, y):
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=len(np.unique(y)),
        n_estimators=150,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.5,
        reg_lambda=0.5,
        tree_method='hist',
        random_state=42
    )
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    qwk_scores = []
    """    Stratified K-Fold cross-validation to ensure balanced class distribution"""
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        # Fit the model
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        preds = model.predict(X_val)
        qwk_scores.append(cohen_kappa_score(y_val, preds, weights='quadratic'))
    
    return model, np.mean(qwk_scores)

Complete the submission

In [68]:
'''
Utility functions for competition submission
'''

def save_submission(test, preds, output_dir):
    '''
    Saves predictions in Kaggle submission format
    
    Args:
        test: Test DataFrame
        preds: Model predictions
        output_dir: Directory to save submission file
    '''
    submission = pd.DataFrame({
        'id': test['id'],
        'sii': preds
    })
    os.makedirs(output_dir, exist_ok=True)
    submission_path = os.path.join(output_dir, 'submission.csv')
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

In [21]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
' Main function to orchestrate the workflow'
def main():
    try:
        print("\n=== Loading data ===")
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        
        # Verification of critical data
        assert 'PCIAT-PCIAT_Total' in train.columns, "Target column not found in train data"
        train = train.dropna(subset=['PCIAT-PCIAT_Total'])
        
        print("\n=== Preprocessing ===")
        processor = FeatureProcessor()
        train, test = processor.preprocess(train, test)
    
        
        print("\n=== Preparing target ===")
        # Robust version of qcut
        train['SII_group'], bins = pd.qcut(
            train['PCIAT-PCIAT_Total'],
            q=4,
            labels=[0, 1, 2, 3],
            retbins=True,
            duplicates='drop'
        )
        y = train['SII_group'].astype(int)
        
        print("\n=== Selecting features ===")
        # Exclude irrelevant columns and ensure consistency
        exclude = ['PCIAT-PCIAT_Total', 'Subject_ID', 'SII_group', 'timestamp', 'id']
        
        # Only features present in both datasets
        common_features = list(set(train.columns) & set(test.columns))
        features = [
            col for col in common_features
            if col not in exclude
            and pd.api.types.is_numeric_dtype(train[col])
            and col in test.columns
        ]
        
        print(f"Selected features: {len(features)}")
        X = train[features]
        
        print("\n=== Training ===")
        model, qwk = train_and_evaluate(X, y)
        print(f"\n✔ QWK average: {qwk:.4f}")
        
        print("\n=== Generating submission ===")
        # Check features in test
        missing_in_test = [col for col in features if col not in test.columns]
        if missing_in_test:
            print(f"⚠ Features faltantes en test: {missing_in_test}")
            features = [col for col in features if col in test.columns]
        
        test_preds = model.predict(test[features])
        
        # Ensure 'id' column exists in test DataFrame
        if 'id' not in test.columns and 'Subject_ID' in test.columns:
            test['id'] = test['Subject_ID']
        elif 'id' not in test.columns:
            test['id'] = range(len(test))
        
        save_submission(test, test_preds, OUTPUT_DIR)
        print(f"Submission generated in {OUTPUT_DIR / 'submission.csv'}")
        
    except Exception as e:
        print(f"\nCritical error: {str(e)}")
        raise

if __name__ == '__main__':
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    main()

In [45]:
def main():
    try:
        print("\n=== Loading data ===")
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        
        assert 'PCIAT-PCIAT_Total' in train.columns, "Target column not found in train data"
        train = train.dropna(subset=['PCIAT-PCIAT_Total'])

        print("\n=== General Feature Engineering ===")
        train = preprocess_data(train)
        test = preprocess_data(test)

        print("\n=== Advanced Preprocessing (Text + Categoricals) ===")
        processor = FeatureProcessor()
        train, test = processor.preprocess(train, test)

        print("\n=== Preparing target ===")
        train['SII_group'], bins = pd.qcut(
            train['PCIAT-PCIAT_Total'],
            q=4,
            labels=[0, 1, 2, 3],
            retbins=True,
            duplicates='drop'
        )
        y = train['SII_group'].astype(int)

        print("\n=== Selecting features ===")
        exclude = ['PCIAT-PCIAT_Total', 'Subject_ID', 'SII_group', 'timestamp', 'id']
        common_features = list(set(train.columns) & set(test.columns))
        features = [
            col for col in common_features
            if col not in exclude and pd.api.types.is_numeric_dtype(train[col])
        ]

        print(f"Selected features: {len(features)}")
        X = train[features]

        print("\n=== Training ===")
        model, qwk = train_and_evaluate(X, y)
        print(f"\n✔ QWK average: {qwk:.4f}")

        print("\n=== Generating submission ===")
        if 'id' not in test.columns and 'Subject_ID' in test.columns:
            test['id'] = test['Subject_ID']
        elif 'id' not in test.columns:
            test['id'] = range(len(test))

        test_preds = model.predict(test[features])
        save_submission(test, test_preds, OUTPUT_DIR)
        print(f"Submission generated in {OUTPUT_DIR / 'submission.csv'}")

    except Exception as e:
        print(f"\nCritical error: {str(e)}")
        raise


In [46]:
if __name__ == '__main__':
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    main()


=== Loading data ===

=== General Feature Engineering ===

=== Advanced Preprocessing (Text + Categoricals) ===

=== Preparing target ===

=== Selecting features ===
Selected features: 4327

=== Training ===

✔ QWK average: 0.4255

=== Generating submission ===
Submission saved to C:\TrabajoFinal\Child_mind_institute_problematic_internet_use\src\outputs\submission.csv
Submission generated in C:\TrabajoFinal\Child_mind_institute_problematic_internet_use\src\outputs\submission.csv
