In [35]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import pyarrow.parquet as pq
from sklearn.preprocessing import OrdinalEncoder
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import os
from pathlib import Path
import sys
from dotenv import load_dotenv
from openai import OpenAI

In [59]:
# 1. Definir directorios robustamente desde notebook
BASE_DIR = Path().resolve()
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / "outputs"

# 2. Agregar src al path para poder importar módulos locales
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# 3. Verificar lectura de los archivos
train_path = DATA_DIR / 'train.csv'
test_path = DATA_DIR / 'test.csv'

Importacion de los datos

In [37]:
def load_data(data_dir='data'):
    """Carga datos desde la carpeta 'data' con paths relativos seguros"""
    try:
        # Verifica existencia de archivos
        train_csv_path = os.path.join(data_dir, 'train.csv')
        test_csv_path = os.path.join(data_dir, 'test.csv')
        train_parquet_path = os.path.join(data_dir, 'train.parquet')
        test_parquet_path = os.path.join(data_dir, 'test.parquet')
        
        if not all(os.path.exists(f) for f in [train_csv_path, test_csv_path]):
            raise FileNotFoundError("Archivos CSV no encontrados en la carpeta 'data'")
        
        # Carga CSV
        train_csv = pd.read_csv(train_csv_path)
        test_csv = pd.read_csv(test_csv_path)
        
        # Carga Parquet si existen
        train_parquet = pd.DataFrame()
        test_parquet = pd.DataFrame()
        if os.path.exists(train_parquet_path):
            train_parquet = pq.read_table(train_parquet_path).to_pandas()
        if os.path.exists(test_parquet_path):
            test_parquet = pq.read_table(test_parquet_path).to_pandas()
        
        # Combina datos
        train = pd.merge(train_csv, train_parquet, on='Subject_ID', how='left') if not train_parquet.empty else train_csv
        test = pd.merge(test_csv, test_parquet, on='Subject_ID', how='left') if not test_parquet.empty else test_csv
        
        return train, test, {}
    
    except Exception as e:
        print(f"Error cargando datos: {str(e)}")
        raise

LLM

In [38]:
load_dotenv()  # Carga las variables desde .env

api_key = os.getenv("DEEPSEEK_KEY")

In [39]:
client = OpenAI(
    api_key=os.getenv("DEEPSEEK_KEY"),
    base_url="https://openrouter.ai/api/v1"
)

In [40]:
def query_deepseek(prompt: str, model: str = "deepseek/deepseek-chat-v3-0324:free") -> str:
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role":"user", "content":prompt}],
        temperature=0.6,
        # Muestra la cantidad de output
        max_tokens=4096,
    )
    return resp.choices[0].message.content.strip()

In [41]:
# Cargar el notebook
from langchain_community.document_loaders import NotebookLoader

loader = NotebookLoader("problematic_internet_use_llm.ipynb", include_outputs=False, remove_newline=True)
docs = loader.load()
print(f"Celdas encontradas: {len(docs)}")

Celdas encontradas: 1


Prompts que hay que realizar de manera general

In [42]:
codigo_original = """
# Carga el notebook activo (puedes ver su nombre en Kaggle)
from langchain_community.document_loaders import NotebookLoader

loader = NotebookLoader("prueba-llm.ipynb", include_outputs=False, remove_newline=True)
docs = loader.load()
print(f"Celdas encontradas: {len(docs)}")
"""

prompt = f"""Reescribe el siguiente código con mejoras de estilo y eficiencia, sin cambiar su funcionalidad:\n{codigo_original},
ademas, todo lo que no este relacionado a codigo lo dejes comentado, incluyendo los cambios realizados"""

respuesta = query_deepseek(prompt)
print(respuesta)

```python
# Importa el cargador de notebooks de LangChain
from langchain_community.document_loaders import NotebookLoader

# Configuración del cargador:
# - Archivo: prueba-llm.ipynb
# - exclude_outputs: No incluir salidas de celdas
# - remove_newline: Eliminar saltos de línea
loader = NotebookLoader(
    file_path="prueba-llm.ipynb",
    include_outputs=False,
    remove_newline=True
)

# Carga los documentos y muestra el conteo de celdas
docs = loader.load()
print(f"Celdas encontradas: {len(docs)}")

# Mejoras realizadas:
# 1. Formateo consistente del código
# 2. Argumentos nombrados explícitamente
# 3. Comentarios descriptivos
# 4. Mantenimiento de la funcionalidad original
```


In [43]:
def analizar_archivo(filepath: str, pregunta: str):
    with open(filepath, "r", encoding="utf-8") as f:
        contenido = f.read()

    prompt = f"""Tengo este archivo llamado `{filepath}` con el siguiente contenido:

[INICIO DEL ARCHIVO]
{contenido}
[FIN DEL ARCHIVO]

Ahora, por favor responde lo siguiente:
{pregunta}
"""
    respuesta = query_deepseek(prompt)
    print(respuesta)

In [44]:
analizar_archivo("data\data_dictionary.csv", "¿Puedes revisar este archivo y recomendarme su feature engineering?")

Basándome en el diccionario de datos proporcionado, aquí tienes recomendaciones de feature engineering organizadas por categorías:

### 1. **Variables Categóricas (One-Hot Encoding/Label Encoding)**
   - **Estacionalidad**: Las variables `*_Season` (presentes en casi todos los instrumentos) podrían convertirse en variables dummy (one-hot encoding) para capturar efectos estacionales.
   - **Variables con etiquetas**: 
     - `Sex` (0/1), `*_Zone` (ej. `FGC_CU_Zone`), `BIA_Activity_Level_num`, `BIA_Frame_num`, `PCIAT_*` (ítems 1-20), y `computerinternet_hoursday` son categóricas ordinales o nominales. Usa:
       - **One-hot encoding** si no hay orden (ej. estaciones).
       - **Label encoding** si hay orden (ej. `BIA_Activity_Level_num`: 1=Very Light a 5=Exceptional).
     - Para `PCIAT_Total`, considera crear categorías basadas en los umbrales de severidad proporcionados (None/Mild/Moderate/Severe).

### 2. **Normalización/Estandarización**
   - **Variables continuas**: `Age`, `BMI`, 

Hacer feature engineering

In [45]:
class FeatureProcessor:
    def __init__(self):
        self.encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        self.text_cols = []
        self.cat_cols = []
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
        self.embed_cols = []
        
    def preprocess(self, train, test):
        """Versión optimizada que evita fragmentación"""
        try:
            # 1. Identificar columnas seguras
            self._identify_safe_columns(train, test)
            
            # 2. Procesar texto
            if self.text_cols:
                train, test = self._process_text_optimized(train, test)
            
            # 3. Codificar categóricas
            if self.cat_cols:
                train, test = self._encode_categoricals(train, test)
            
            # 4. Normalizar numéricas
            train, test = self._scale_numerical(train, test)
            
            return train, test
            
        except Exception as e:
            print(f"Error en preprocesamiento: {str(e)}")
            raise
    
    def _identify_safe_columns(self, train, test):
        """Identifica columnas existentes en ambos datasets"""
        common_cols = list(set(train.columns) & set(test.columns))
        
        self.text_cols = [
            col for col in common_cols 
            if train[col].dtype == 'object' 
            and train[col].str.contains('[a-zA-Z]', regex=True, na=False).any()
        ]
        
        self.cat_cols = [
            col for col in common_cols 
            if train[col].dtype == 'object' 
            and col not in self.text_cols
        ]
    
    def _process_text_optimized(self, train, test):
        """Procesamiento de texto sin fragmentación"""
        # Generar todos los embeddings primero
        train_embeddings = []
        test_embeddings = []
        
        for col in self.text_cols:
            train_text = train[col].fillna('').astype(str)
            test_text = test[col].fillna('').astype(str)
            
            # Embeddings para train y test
            train_emb = self.model.encode(train_text.tolist(), show_progress_bar=False)
            test_emb = self.model.encode(test_text.tolist(), show_progress_bar=False)
            
            train_embeddings.append(train_emb)
            test_embeddings.append(test_emb)
        
        # Concatenar todos los embeddings horizontalmente
        if train_embeddings:
            train_embeddings = np.hstack(train_embeddings)
            test_embeddings = np.hstack(test_embeddings)
            
            # Crear DataFrames completos antes de asignar
            n_features = train_embeddings.shape[1]
            self.embed_cols = [f'text_embed_{i}' for i in range(n_features)]
            
            train_emb_df = pd.DataFrame(train_embeddings, columns=self.embed_cols, index=train.index)
            test_emb_df = pd.DataFrame(test_embeddings, columns=self.embed_cols, index=test.index)
            
            # Concatenar de una sola vez
            train = pd.concat([train, train_emb_df], axis=1)
            test = pd.concat([test, test_emb_df], axis=1)
            
        return train, test
    
    def _encode_categoricals(self, train, test):
        """Codificación segura de categóricas"""
        if self.cat_cols:
            train_cats = train[self.cat_cols]
            test_cats = test[self.cat_cols]
            
            # Codificar y asignar de una vez
            train[self.cat_cols] = self.encoder.fit_transform(train_cats)
            test[self.cat_cols] = self.encoder.transform(test_cats)
            
        return train, test
    
    def _scale_numerical(self, train, test):
        """Normalización optimizada"""
        num_cols = [col for col in train.select_dtypes(include=np.number).columns 
                   if col not in ['Subject_ID', 'PCIAT-PCIAT_Total'] and col in test.columns]
        
        if num_cols:
            means = train[num_cols].mean()
            stds = train[num_cols].std() + 1e-8
            
            train[num_cols] = (train[num_cols] - means) / stds
            test[num_cols] = (test[num_cols] - means) / stds
            
        return train, test

In [46]:
def summarize_actigraphy(df, subject_id_col='Subject_ID'):
    """
    Enhanced actigraphy processing with:
    - Percentiles (10th, 25th, 75th, 90th)
    - Robust statistical measures (IQR, MAD)
    - Frequency domain features (FFT)
    """
    exclude_cols = [subject_id_col, 'timestamp']
    num_cols = [col for col in df.columns 
               if col not in exclude_cols 
               and pd.api.types.is_numeric_dtype(df[col])]
    
    # Time-domain features
    stats = {
        'mean': np.mean,
        'std': np.std,
        'min': np.min,
        'max': np.max,
        'median': np.median,
        'skew': skew,
        'kurtosis': kurtosis,
        'q1': lambda x: np.percentile(x, 25),
        'q3': lambda x: np.percentile(x, 75),
        'iqr': lambda x: np.percentile(x, 75) - np.percentile(x, 25),
        'mad': lambda x: np.median(np.abs(x - np.median(x)))
    }
    
    # Frequency-domain features (simplified FFT)
    def dominant_freq(x):
        if len(x) < 2: return 0
        fft = np.abs(np.fft.fft(x))
        return np.argmax(fft[1:len(fft)//2]) + 1
    
    summary = df.groupby(subject_id_col)[num_cols].agg(stats)
    summary.columns = [f'{col}_{stat}' for col, stat in summary.columns]
    
    # Add frequency features
    freq_features = df.groupby(subject_id_col)[num_cols].agg(dominant_freq)
    freq_features.columns = [f'{col}_dominant_freq' for col in freq_features.columns]
    
    return pd.concat([summary, freq_features], axis=1).reset_index()

In [47]:
def extract_time_features(df, subject_col='Subject_ID'):
    """Extrae 15 features temporales clave por sujeto"""
    features = []
    for subject_id, group in df.groupby(subject_col):
        if 'timestamp' in group.columns:
            time_diff = group['timestamp'].diff().dt.total_seconds()
            feat = {
                'Subject_ID': subject_id,
                'total_events': len(group),
                'active_hours': (time_diff < 3600).sum(),
                'night_activity': group[group['timestamp'].dt.hour.between(0, 6)]['value'].mean(),
                'max_activity': group['value'].max(),
                'std_activity': group['value'].std(),
            }
            features.append(feat)
    return pd.DataFrame(features)

Machine Learning

In [48]:
def train_and_evaluate(X, y):
    """Versión optimizada sin warnings"""
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=len(np.unique(y)),
        n_estimators=150,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.5,
        reg_lambda=0.5,
        tree_method='hist',  # Más eficiente
        random_state=42
    )
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    qwk_scores = []
    
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        preds = model.predict(X_val)
        qwk_scores.append(cohen_kappa_score(y_val, preds, weights='quadratic'))
    
    return model, np.mean(qwk_scores)

Complete the submission

In [49]:
'''
Utility functions for competition submission
'''

def save_submission(test, preds, output_dir):
    '''
    Saves predictions in Kaggle submission format
    
    Args:
        test: Test DataFrame
        preds: Model predictions
        output_dir: Directory to save submission file
    '''
    submission = pd.DataFrame({
        'id': test['id'],
        'sii': preds
    })
    os.makedirs(output_dir, exist_ok=True)
    submission_path = os.path.join(output_dir, 'submission.csv')
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

In [60]:
def main():
    try:
        print("\n=== Cargando datos ===")
        train = pd.read_csv(train_path)
        test = pd.read_csv(test_path)
        
        # Verificación de datos críticos
        assert 'PCIAT-PCIAT_Total' in train.columns, "Columna target no encontrada"
        train = train.dropna(subset=['PCIAT-PCIAT_Total'])
        
        print("\n=== Preprocesamiento ===")
        processor = FeatureProcessor()
        train, test = processor.preprocess(train, test)
        
        print("\n=== Preparando target ===")
        # Versión robusta de qcut
        train['SII_group'], bins = pd.qcut(
            train['PCIAT-PCIAT_Total'],
            q=4,
            labels=[0, 1, 2, 3],
            retbins=True,
            duplicates='drop'
        )
        y = train['SII_group'].astype(int)
        
        print("\n=== Seleccionando features ===")
        # Excluir columnas no relevantes y asegurar consistencia
        exclude = ['PCIAT-PCIAT_Total', 'Subject_ID', 'SII_group', 'timestamp', 'id']
        
        # Solo features presentes en ambos datasets
        common_features = list(set(train.columns) & set(test.columns))
        features = [
            col for col in common_features
            if col not in exclude
            and pd.api.types.is_numeric_dtype(train[col])
            and col in test.columns
        ]
        
        print(f"Features seleccionadas: {len(features)}")
        X = train[features]
        
        print("\n=== Entrenamiento ===")
        model, qwk = train_and_evaluate(X, y)
        print(f"\n✔ QWK promedio: {qwk:.4f}")
        
        print("\n=== Generando submission ===")
        # Verificar features en test
        missing_in_test = [col for col in features if col not in test.columns]
        if missing_in_test:
            print(f"⚠ Features faltantes en test: {missing_in_test}")
            features = [col for col in features if col in test.columns]
        
        test_preds = model.predict(test[features])
        
        # Asegurar ID para submission
        if 'id' not in test.columns and 'Subject_ID' in test.columns:
            test['id'] = test['Subject_ID']
        elif 'id' not in test.columns:
            test['id'] = range(len(test))
        
        save_submission(test, test_preds, OUTPUT_DIR)
        print(f"✔ Submission generado en {OUTPUT_DIR / 'submission.csv'}")
        
    except Exception as e:
        print(f"\n❌ Error crítico: {str(e)}")
        raise

if __name__ == '__main__':
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    main()


=== Cargando datos ===

=== Preprocesamiento ===

=== Preparando target ===

=== Seleccionando features ===
Features seleccionadas: 4272


  train['SII_group'], bins = pd.qcut(



=== Entrenamiento ===

✔ QWK promedio: 0.4285

=== Generando submission ===
Submission saved to C:\TrabajoFinal\Child_mind_institute_problematic_internet_use\src\outputs\submission.csv
✔ Submission generado en C:\TrabajoFinal\Child_mind_institute_problematic_internet_use\src\outputs\submission.csv
