In [2]:
!pip install pandas scikit-learn xgboost matplotlib seaborn

Collecting pandas
  Using cached pandas-2.3.0-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp310-cp310-win_amd64.whl.metadata (14 kB)
Collecting xgboost
  Using cached xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.3-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.15.3-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import joblib
import numpy as np

# Cargar el dataset
df = pd.read_csv("../data_set_videos/dataset_final.csv")

# Asegúrate de tener la columna 'video_id'
assert 'video_id' in df.columns, "Falta la columna video_id en el CSV"

# Codificar etiquetas
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Eliminar filas con demasiados ceros
umbral_ceros = 0.3
porcentaje_ceros = (df == 0).sum(axis=1) / df.shape[1]
df = df[porcentaje_ceros < umbral_ceros]

# Split estratificado por acción (asegura al menos un video por acción en train)
videos_info = df[['video_id', 'label']].drop_duplicates()
train_videos = []
test_videos = []

for action in videos_info['label'].unique():
    vids = videos_info[videos_info['label'] == action]['video_id'].tolist()
    if len(vids) == 1:
        train_videos.append(vids[0])
    else:
        v_train, v_test = train_test_split(vids, test_size=0.3, random_state=42)
        train_videos.extend(v_train)
        test_videos.extend(v_test)

remaining_videos = set(videos_info['video_id']) - set(train_videos) - set(test_videos)
if len(remaining_videos) >= 2:
    v_train, v_test = train_test_split(list(remaining_videos), test_size=0.3, random_state=42)
    train_videos.extend(v_train)
    test_videos.extend(v_test)
elif len(remaining_videos) == 1:
    train_videos.extend(list(remaining_videos))
# Si es 0, no haces nada

train_df = df[df['video_id'].isin(train_videos)]
test_df = df[df['video_id'].isin(test_videos)]

# Features y labels
X_train = train_df.drop(columns=["label", "frame", "video_id", "label_encoded"])
X_test = test_df.drop(columns=["label", "frame", "video_id", "label_encoded"])
y_train = train_df["label_encoded"]
y_test = test_df["label_encoded"]

# Escalar features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Aplicar PCA
pca = PCA(n_components=0.95, svd_solver='full')
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Modelos a evaluar
modelos = {
    "RandomForest": RandomForestClassifier(n_estimators=60, random_state=24),
    "SVM": SVC(kernel='rbf', probability=True),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

for nombre, modelo in modelos.items():
    print(f"\n Entrenando {nombre}...")
    modelo.fit(X_train_pca, y_train)

    y_pred = modelo.predict(X_test_pca)

    train_acc = modelo.score(X_train_pca, y_train)
    test_acc = modelo.score(X_test_pca, y_test)

    print(f"\n Resultados para {nombre}:")
    print(f" Accuracy en entrenamiento: {train_acc:.2f}")
    print(f" Accuracy en prueba:       {test_acc:.2f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))

    # Matriz de confusión
    plt.figure(figsize=(8,6))
    cm = confusion_matrix(y_test, y_pred, labels=range(len(le.classes_)))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title(f"Matriz de Confusión - {nombre}")
    plt.xlabel("Predicción")
    plt.ylabel("Real")
    plt.tight_layout()
    plt.show()

    # Validación cruzada por video (GroupKFold)
    print(f"\nValidación cruzada por video para {nombre}:")
    X = df.drop(columns=["label", "frame", "video_id", "label_encoded"])
    y = df["label_encoded"]
    groups = df["video_id"]
    X_scaled = scaler.transform(X)
    X_pca = pca.transform(X_scaled)
    gkf = GroupKFold(n_splits=5)
    scores = cross_val_score(modelo, X_pca, y, cv=gkf.split(X_pca, y, groups), scoring='accuracy')
    print(f"Scores por fold: {scores}")
    print(f"Media: {np.mean(scores):.4f} | Desviación estándar: {np.std(scores):.4f}")

    # Guardar el modelo
    joblib.dump(modelo, f"modelo_{nombre}.pkl")

# Guardar preprocesadores y encoder
joblib.dump(le, "label_encoder.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(pca, "pca.pkl")

ValueError: With n_samples=0, test_size=0.3 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.