# Modelo clasificación

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Supongamos que tienes un DataFrame con estas columnas:
# - 'juego': juego que transmite (categórico)
# - 'seguidores': número de seguidores (numérico)
# - 'idioma': idioma del streaming (categórico)
# - 'recomendacion': etiqueta de recomendación (target)
data = pd.DataFrame({
    'juego': ['League of Legends', 'Valorant', 'League of Legends', 'Minecraft', 'Valorant', 'Minecraft'],
    'seguidores': [10000, 15000, 12000, 8000, 16000, 9000],
    'idioma': ['es', 'en', 'es', 'en', 'es', 'en'],
    'recomendacion': ['A', 'B', 'A', 'C', 'B', 'C']
})

# Separar características y target
X = data[['juego', 'seguidores', 'idioma']]
y = data['recomendacion']

# Dividir en datos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir transformaciones para columnas categóricas y numéricas
transformer = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), ['juego', 'idioma']),
    ('num', StandardScaler(), ['seguidores'])
])

# Crear pipeline que transforma los datos y luego entrena el clasificador
pipeline = Pipeline(steps=[
    ('preprocesamiento', transformer),
    ('clasificador', DecisionTreeClassifier(random_state=42))
])

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Evaluar el modelo (opcional)
accuracy = pipeline.score(X_test, y_test)
print("Exactitud del modelo:", accuracy)

# Función para predecir a partir de nuevos datos de usuario (por ejemplo, desde Streamlit)
def predecir_recomendacion(juego, seguidores, idioma):
    # Construir un DataFrame con el formato adecuado
    entrada = pd.DataFrame({
        'juego': [juego],
        'seguidores': [seguidores],
        'idioma': [idioma]
    })
    prediccion = pipeline.predict(entrada)
    return prediccion[0]

# Ejemplo de predicción
recomendacion_usuario = predecir_recomendacion('League of Legends', 11000, 'es')
print("La recomendación para el usuario es:", recomendacion_usuario)


Unnamed: 0,juego,seguidores,idioma,recomendacion
0,League of Legends,10000,es,A
1,Valorant,15000,en,B
2,League of Legends,12000,es,A
3,Minecraft,8000,en,C
4,Valorant,16000,es,B
5,Minecraft,9000,en,C


Exactitud del modelo: 0.5
La recomendación para el usuario es: A


In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import joblib

# Supongamos que tienes un DataFrame con las siguientes columnas:
# 'streamer', 'seguidores', 'suscriptores', 'veterania' (años o fecha de inicio),
# 'horas_directo' y 'ranking'
data = pd.DataFrame({
    'streamer': ['StreamerA', 'StreamerB', 'StreamerC', 'StreamerD', 'StreamerE', 'StreamerF'],
    'seguidores': [10000, 25000, 15000, 30000, 20000, 18000],
    'suscriptores': [500, 1200, 800, 1500, 900, 850],
    'veterania': [3, 5, 4, 6, 4, 3],        # Por ejemplo, años activos
    'horas_directo': [100, 250, 150, 300, 200, 180],
    'ranking': [10, 5, 8, 3, 7, 9]           # Posición en ranking (cuanto menor, mejor)
})

# Seleccionamos las columnas numéricas que usaremos para el cálculo de la similitud.
features = ['seguidores', 'suscriptores', 'veterania', 'horas_directo', 'ranking']

# Normalizar las características para que todas tengan el mismo peso (o ajustar manualmente con ponderaciones).
scaler = StandardScaler()
X = scaler.fit_transform(data[features])

# Entrenar un modelo kNN (en este caso no es un "entrenamiento" tradicional, sino preparar el espacio)
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(X)

# Guardamos el scaler y el modelo knn para luego usarlos en la aplicación
joblib.dump(scaler, '../modelos/scaler_knn.pkl')
joblib.dump(knn, '../modelos/modelo_knn.pkl')
joblib.dump(data, '../modelos/data_streamers.pkl')  # Guardamos la base de datos también


['../modelos/data_streamers.pkl']

# 1. Preparación de los datos y entrenamiento de los modelos

### a) Preparar el modelo básico (usando sólo el CSV de ranking)

In [3]:
# HACER UN MODELO POR CATEGORIA

# modelo_basico.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import joblib
import sys
sys.path.append("../")
from src import soporte_bbdd_mongo as sbm

# Obtener los datos desde MongoDB
df_ranking = sbm.obtener_datos_para_modelo_basico()

# Seleccionar las características (asegúrate de que los nombres de columnas coincidan con los documentos de la BD)
features_basico = ["Avg_viewers", "Time_streamed", "All_time_peak_viewers", "Hours_watched",
                     "Followers_gained", "Total_followers", "Total_views"]

features_basico = [feature.lower() for feature in features_basico]

X = df_ranking[features_basico]
y = df_ranking["clasificacion"]

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y entrenar el pipeline (estandarización + árbol de decisión)
pipeline_basico = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", DecisionTreeClassifier(random_state=42))
])
pipeline_basico.fit(X_train, y_train)

print("Exactitud del modelo básico:", pipeline_basico.score(X_test, y_test))

# Guardar el modelo
joblib.dump(pipeline_basico, "../streamlit/modelos/modelo_basico_streamers.pkl")
client, db = sbm.conectar_mongo()
if "modelo_basico" in db.list_collection_names():
    db["modelo_basico"].drop()
sbm.insertar_en_coleccion(db, "modelo_basico", df_ranking, False, None)
client.close()

      id_streamer  all_time_peak_viewers  avg_viewers  \
0               1               383747.0      25785.0   
1               2               110045.0      17064.0   
2               3              3846256.0      38599.0   
3               4              1098636.0      26608.0   
4               5                87546.0      11394.0   
...           ...                    ...          ...   
2347         2348                  979.0         70.0   
2348         2349                 1448.0         78.0   
2349         2350                 2482.0        113.0   
2350         2351                 1384.0         77.0   
2351         2352                 1503.0         92.0   

                                         enlace  followers_gained  \
0              https://twitchtracker.com/rubius           45400.0   
1              https://twitchtracker.com/knekro           22800.0   
2                https://twitchtracker.com/ibai           10500.0   
3         https://twitchtracker.com/kin

# Prueba para recomendar por categoria

In [3]:
# HACER UN MODELO POR CATEGORIA
# modelo_basico.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import joblib
import sys
sys.path.append("../")
from src import soporte_bbdd_mongo as sbm

client, db = sbm.conectar_mongo()
df_categorias = db["categorias_streameadas"].find()
df_categorias = pd.DataFrame(list(df_categorias))
categorias = db["id_categorias"].find()
categorias = [categoria["nombre"] for categoria in categorias]
print(categorias)
client.close()

# Obtener los datos desde MongoDB
df_ranking = sbm.obtener_datos_para_modelo_basico()

# Seleccionar las características (asegúrate de que los nombres de columnas coincidan con los documentos de la BD)
features_basico = ["Avg_viewers", "Time_streamed", "All_time_peak_viewers", "Hours_watched",
                     "Followers_gained", "Total_followers", "Total_views"]

features_basico = [feature.lower() for feature in features_basico]

X = df_ranking[features_basico]
y = df_ranking["clasificacion"]

for categoria in categorias:
    # Dividir en conjuntos de entrenamiento y prueba
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Crear y entrenar el pipeline (estandarización + árbol de decisión)
    pipeline_basico = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", DecisionTreeClassifier(random_state=42))
    ])
    pipeline_basico.fit(X_train, y_train)

    print("Exactitud del modelo básico:", pipeline_basico.score(X_test, y_test))

    # # Guardar el modelo
    joblib.dump(pipeline_basico, "../streamlit/modelos/modelo_basico_streamers.pkl")
    # client, db = sbm.conectar_mongo()
    # if "modelo_basico" in db.list_collection_names():
    #     db["modelo_basico"].drop()
    # sbm.insertar_en_coleccion(db, "modelo_basico", df_ranking, False, None)
    # client.close()

      id_streamer  all_time_peak_viewers  avg_viewers  \
0               1               383747.0      25785.0   
1               2               110045.0      17064.0   
2               3              3846256.0      38599.0   
3               4              1098636.0      26608.0   
4               5                87546.0      11394.0   
...           ...                    ...          ...   
2347         2348                  979.0         70.0   
2348         2349                 1448.0         78.0   
2349         2350                 2482.0        113.0   
2350         2351                 1384.0         77.0   
2351         2352                 1503.0         92.0   

                                         enlace  followers_gained  \
0              https://twitchtracker.com/rubius           45400.0   
1              https://twitchtracker.com/knekro           22800.0   
2                https://twitchtracker.com/ibai           10500.0   
3         https://twitchtracker.com/kin

### b) Preparar el modelo avanzado (fusionando información histórica)

In [6]:
# modelo_avanzado.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import joblib
from soporte_bbdd_mongo import obtener_datos_para_modelo_avanzado

# Obtener el DataFrame avanzado con datos combinados de ranking, historial y categorías
df_avanzado = obtener_datos_para_modelo_avanzado()

# Revisar las columnas disponibles (puedes imprimir df_avanzado.head() para depurar)
print("Primeras filas de df_avanzado:")
print(df_avanzado.head(10))

# Características del modelo básico (asegúrate de que los nombres coincidan con los de tu BD)
features_basico = ["Avg_viewers", "Time_streamed", "All_time_peak_viewers", "Hours_watched",
                     "Followers_gained", "Total_followers", "Total_views"]

# Características adicionales del modelo avanzado
features_avanzado = features_basico + ["promedio_viewers", "promedio_hours", "total_followers_hist", "veterania"]

# Si se ha agregado la información de categorías, se puede incluir (por ejemplo, número de categorías únicas)
if "num_categorias" in df_avanzado.columns:
    features_avanzado.append("num_categorias")

X_adv = df_avanzado[features_avanzado]
y_adv = df_avanzado["clasificacion"]

X_train_adv, X_test_adv, y_train_adv, y_test_adv = train_test_split(X_adv, y_adv, test_size=0.2, random_state=42)

pipeline_avanzado = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", DecisionTreeClassifier(random_state=42))
])
pipeline_avanzado.fit(X_train_adv, y_train_adv)
print("Exactitud del modelo avanzado:", pipeline_avanzado.score(X_test_adv, y_test_adv))

joblib.dump(pipeline_avanzado, "modelo_avanzado_streamers.pkl")


Exactitud del modelo avanzado monoetiqueta: 0.0
Reporte de clasificación:
                           precision    recall  f1-score   support

                A Way Out       0.00      0.00      0.00       3.0
    ARK: Survival Evolved       0.00      0.00      0.00       1.0
        Age of Empires II       0.00      0.00      0.00       1.0
                  Aimlabs       0.00      0.00      0.00       0.0
                 Among Us       0.00      0.00      0.00       1.0
              Anime Squad       0.00      0.00      0.00       0.0
         At Dead of Night       0.00      0.00      0.00       1.0
          Baldur's Gate 3       0.00      0.00      0.00       1.0
       Black Myth: Wukong       0.00      0.00      0.00       1.0
          Call of Dragons       0.00      0.00      0.00       0.0
Call of Duty: Black Ops 4       0.00      0.00      0.00       1.0
                   Casino       0.00      0.00      0.00       0.0
                   DEVOUR       0.00      0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [2]:
# modelo_avanzado_multietiqueta.py

import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
import joblib
from sklearn.metrics import classification_report
import sys
sys.path.append("../")
from src.soporte_bbdd_mongo import conectar_mongo, cargar_datos_en_memoria

# Conectar a MongoDB (aquí se usa False para conectarse a Atlas, por ejemplo)
client, db = conectar_mongo(False)

# Cargar los datos de las colecciones
df_categorias, df_historico = cargar_datos_en_memoria(db)

# Asegurarse de que los nombres de columnas estén en minúsculas para facilitar el procesamiento
df_categorias.columns = df_categorias.columns.str.lower()
df_historico.columns = df_historico.columns.str.lower()

# Convertir a numérico las columnas relevantes en el histórico
for col in ["avgviewers", "hoursstreamed", "peakviewers", "followers", "followersgain"]:
    if col in df_historico.columns:
        df_historico[col] = pd.to_numeric(df_historico[col], errors="coerce")

# --- Agregación básica (datos resumidos por streamer) ---
agg_funcs = {
    "avgviewers": "mean",
    "hoursstreamed": "sum",
    "peakviewers": "max",
    "followersgain": "sum",
    "followers": "last"
}
df_agg = df_historico.groupby(["id_streamer", "nombre"], as_index=False).agg(agg_funcs)
df_agg.rename(columns={
    "avgviewers": "avg_viewers",
    "hoursstreamed": "time_streamed",
    "peakviewers": "all_time_peak_viewers",
    "followersgain": "followers_gained",
    "followers": "total_followers"
}, inplace=True)
# df_agg["total_views"] = None  # Esta columna se deja sin calcular en este ejemplo

# --- Estadísticas avanzadas ---
agg_stats = df_historico.groupby(["id_streamer", "nombre"]).agg(
    promedio_viewers=pd.NamedAgg(column="avgviewers", aggfunc="mean"),
    promedio_hours=pd.NamedAgg(column="hoursstreamed", aggfunc="mean"),
    total_followers_hist=pd.NamedAgg(column="followers", aggfunc="last")
).reset_index()

# Calcular la veteranía utilizando "year" y "month"
def compute_veterania(group):
    try:
        group["date"] = pd.to_datetime(group["year"].astype(str) + " " + group["month"], format="%Y %b")
        fecha_inicial = group["date"].min()
        hoy = pd.to_datetime("today")
        veterania = (hoy - fecha_inicial).days / 365.0
        return round(veterania, 2)
    except Exception as e:
        return np.nan

veterania_list = []
for _, group in df_historico.groupby(["id_streamer", "nombre"]):
    veterania_val = compute_veterania(group)
    veterania_list.append({
        "id_streamer": group["id_streamer"].iloc[0],
        "nombre": group["nombre"].iloc[0],
        "veterania": veterania_val
    })
df_veterania = pd.DataFrame(veterania_list)

# Unir las estadísticas avanzadas
df_hist_stats = pd.merge(agg_stats, df_veterania, on=["id_streamer", "nombre"], how="left")

# --- Unir todo: datos básicos + avanzados ---
df_ranking = pd.merge(df_agg, df_hist_stats, on=["id_streamer", "nombre"], how="left")

# --- Procesar la información de categorías para multietiqueta ---
# En lugar de quedarse con la moda, se agrupan todas las categorías únicas por streamer
df_cat_grouped = df_categorias.groupby("id_streamer")["categoria"].agg(
    lambda x: list(x.unique())
).reset_index()

# Unir la información de categorías con el resto de datos
df_avanzado = pd.merge(df_ranking, df_cat_grouped, on="id_streamer", how="left")

# Revisar algunos datos (opcional)
print("Primeras filas del DataFrame combinado:")
print(df_avanzado.head())

# --- Preparar datos para el modelado avanzado ---
features_basico = ["avg_viewers", "time_streamed", "all_time_peak_viewers", "followers_gained", "total_followers"]
features_avanzado = features_basico + ["promedio_viewers", "promedio_hours", "total_followers_hist", "veterania"]

# Eliminar filas que tengan NaN en las características o en la lista de categorías
df_model_adv = df_avanzado.dropna(subset=features_avanzado + ["categoria"])
X_adv = df_model_adv[features_avanzado]

# La variable objetivo ahora será la lista de categorías para cada streamer
y_multilabel = df_model_adv["categoria"]  # Cada valor es una lista, e.g. ['juego A', 'juego B']

# Utilizar MultiLabelBinarizer para transformar la variable objetivo a un formato binario
mlb = MultiLabelBinarizer()
Y_adv = mlb.fit_transform(y_multilabel)

print("Clases (categorías) detectadas:")
print(mlb.classes_)

# Dividir en conjuntos de entrenamiento y prueba
X_train_adv, X_test_adv, Y_train_adv, Y_test_adv = train_test_split(X_adv, Y_adv, test_size=0.2, random_state=42)

# Crear un pipeline con imputación, escalado y el clasificador multietiqueta
pipeline_avanzado = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    # DecisionTreeClassifier soporta multietiqueta si Y tiene la forma (n_samples, n_classes)
    ("clf", DecisionTreeClassifier(random_state=42))
])

# Entrenar el modelo
pipeline_avanzado.fit(X_train_adv, Y_train_adv)

score_adv = pipeline_avanzado.score(X_test_adv, Y_test_adv)
print("Exactitud del modelo avanzado (multietiqueta):", score_adv)

Y_pred_adv = pipeline_avanzado.predict(X_test_adv)
print("Reporte de clasificación (por etiqueta):")
print(classification_report(Y_test_adv, Y_pred_adv, target_names=mlb.classes_))

# Guardar el modelo y el objeto mlb para poder transformar la salida en el futuro
joblib.dump(pipeline_avanzado, "modelo_avanzado_streamers_multilabel.pkl")
joblib.dump(mlb, "mlb_categorias.pkl")

client.close()


Primeras filas del DataFrame combinado:
   id_streamer       nombre   avg_viewers  time_streamed  \
0            1       Rubius  35798.855263         7592.1   
1            2       Knekro   6001.747253        14556.7   
2            3         Ibai  49425.473684         7544.7   
3            4  Kingsleague  80444.142857         1154.7   
4            5      Mixwell   6563.481481         5625.2   

   all_time_peak_viewers  followers_gained  total_followers  promedio_viewers  \
0               383747.0               0.0              NaN      35798.855263   
1               110045.0               0.0              NaN       6001.747253   
2              3846256.0               0.0              NaN      49425.473684   
3              1098636.0               0.0              NaN      80444.142857   
4                87546.0               0.0              NaN       6563.481481   

   promedio_hours  total_followers_hist  veterania  \
0       99.896053                   NaN       6.26   
1   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import joblib
import sys
sys.path.append("../")
from src import soporte_bbdd_mongo as sbm
from sklearn.metrics import silhouette_score, pairwise

# Conectar a MongoDB
client, db = sbm.conectar_mongo()

# Cargar datos
historico = pd.DataFrame(list(db.historico_streamers.find()))
categorias = pd.DataFrame(list(db.categorias_streameadas.find()))
streamers = pd.DataFrame(list(db.streamers.find()))  # Cargar ranking

client.close()

# Unir datasets usando el ID del streamer
historico = historico.drop(columns=['_id'])
categorias = categorias.drop(columns=['_id'])
streamers = streamers.drop(columns=['_id'])
df = historico.merge(categorias, on='id_streamer', how='left')
df = df.merge(streamers[['id_streamer', 'rank']], on='id_streamer', how='left')  # Agregar ranking

display(df)

# Reemplazar valores no numéricos ('-') por NaN
df.replace('-', np.nan, inplace=True)

# Convertir columnas numéricas al tipo adecuado
num_cols = ['avgviewers', 'viewersgain', 'percentagegainviewers', 'peakviewers', 'hoursstreamed', 'hoursgain', 'percentagegainhours', 'followers', 'percentagegainfollowers', 'followersgain', 'perhour', 'rank']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Reemplazar NaN en 'rank' con la mediana
df['rank'] = df['rank'].fillna(df['rank'].median())

# Optimizar memoria reduciendo tipos de datos
df[num_cols] = df[num_cols].astype(np.float32)
df['id_streamer'] = df['id_streamer'].astype(np.int32)

# Rellenar NaN con la mediana de cada columna
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Liberar memoria innecesaria
gc.collect()

# Codificar la categoría como variable numérica
le = LabelEncoder()
df['categoria_codificada'] = le.fit_transform(df['categoria'])

# Normalizar los datos con MinMaxScaler
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Clustering con K-Means
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(df[num_cols])

# Entrenar y guardar modelos KNN por categoría
categorias_unicas = df['categoria'].unique()
modelos_knn = {}

for categoria in categorias_unicas:
    df_categoria = df[df['categoria'] == categoria]
    if df_categoria.shape[0] < 5:
        continue
    
    X_categoria = df_categoria[num_cols]
    nn = NearestNeighbors(n_neighbors=min(5, len(df_categoria)), metric='cosine', n_jobs=14)
    nn.fit(X_categoria)
    
    modelos_knn[categoria] = nn

# # Guardar modelos KNN entrenados con el nombre correcto para Streamlit
# joblib.dump(modelos_knn, "../streamlit/modelos/modelo_lastversion.pkl")
print("Ya ha entrenado")
print("Empezando la evaluación")

inertia = kmeans.inertia_  # SSE
silhouette_avg = silhouette_score(df[num_cols], df['cluster'])

print(f"Inertia (SSE): {inertia}")
print(f"Silhouette Score: {silhouette_avg}")

def evaluar_knn(modelos_knn, df, num_cols):
    resultados = []
    
    for categoria, modelo in modelos_knn.items():
        df_categoria = df[df['categoria'] == categoria]
        if df_categoria.shape[0] < 5:
            continue
        
        X_categoria = df_categoria[num_cols].values
        _, indices = modelo.kneighbors(X_categoria)
        
        # Comparar cada punto con sus vecinos
        similitudes = []
        for i, vecinos in enumerate(indices):
            original = X_categoria[i].reshape(1, -1)
            for vecino in vecinos[1:]:  # Omitir el propio punto
                vecino_data = X_categoria[vecino].reshape(1, -1)
                similitudes.append(pairwise.cosine_similarity(original, vecino_data)[0][0])
        
        # Promedio de similitudes
        resultados.append(np.mean(similitudes))
    
    return np.mean(resultados)

similitud_promedio = evaluar_knn(modelos_knn, df, num_cols)
print(f"Similitud promedio de KNN: {similitud_promedio:.4f}")

# Guardar modelos KNN entrenados con el nombre correcto para Streamlit
joblib.dump(modelos_knn, "../streamlit/modelos/modelo_lastversion.pkl")

Unnamed: 0,id_streamer,nombre_x,year,month,avgviewers,viewersgain,percentagegainviewers,peakviewers,hoursstreamed,hoursgain,percentagegainhours,followers,percentagegainfollowers,followersgain,perhour,nombre_y,categoria,rank
0,1830,02pomyu,2025,Feb,96,-37,-27.8,129.0,6.5,-31.2,-82.7,8698,1.0,86,13.2,02pomyu,Just Chatting,23952
1,1830,02pomyu,2025,Feb,96,-37,-27.8,129.0,6.5,-31.2,-82.7,8698,1.0,86,13.2,02pomyu,Mobile Legends: Bang Bang,23952
2,1830,02pomyu,2025,Feb,96,-37,-27.8,129.0,6.5,-31.2,-82.7,8698,1.0,86,13.2,02pomyu,Fall Guys,23952
3,1830,02pomyu,2025,Feb,96,-37,-27.8,129.0,6.5,-31.2,-82.7,8698,1.0,86,13.2,02pomyu,God of War,23952
4,1830,02pomyu,2025,Feb,96,-37,-27.8,129.0,6.5,-31.2,-82.7,8698,1.0,86,13.2,02pomyu,Resident Evil 4,23952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2254901,1351,Zyrpesland,2020,Oct,4,0,0.0,11.0,22.6,-,-,100,0.0,76,3.4,Zyrpesland,WEB3WAR,17021
2254902,1351,Zyrpesland,2020,Oct,4,0,0.0,11.0,22.6,-,-,100,0.0,76,3.4,Zyrpesland,Fall Guys,17021
2254903,1351,Zyrpesland,2020,Oct,4,0,0.0,11.0,22.6,-,-,100,0.0,76,3.4,Zyrpesland,Apex Legends,17021
2254904,1351,Zyrpesland,2020,Oct,4,0,0.0,11.0,22.6,-,-,100,0.0,76,3.4,Zyrpesland,New World: Aeternum,17021


In [None]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import joblib
import sys
sys.path.append("../")
from src import soporte_bbdd_mongo as sbm
from sklearn.metrics import silhouette_score, pairwise
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

def sumar_datos_por_streamer(df:pd.DataFrame) -> pd.DataFrame:
    # Asegurar que las columnas numéricas están bien definidas
    columnas_numericas = ["avgviewers", "viewersgain", "peakviewers", 
                          "hoursstreamed", "hoursgain", "followers", 
                          "followersgain", "perhour"]
    
    for col in columnas_numericas:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

    # Agrupar por id_streamer y sumar las columnas numéricas
    df_sumado = df.groupby(["id_streamer", "nombre"], as_index=False)[columnas_numericas].sum().round(2)

    # Ordenar por id_streamer
    df_sumado = df_sumado.sort_values(by="id_streamer").reset_index(drop=True)

    return df_sumado

# Conectar a MongoDB
client, db = sbm.conectar_mongo()

# Cargar datos
historico = pd.DataFrame(list(db.historico_streamers.find())).drop(columns=["_id"])
categorias = pd.DataFrame(list(db.categorias_streameadas.find())).drop(columns=["_id"])
streamers = pd.DataFrame(list(db.streamers.find())).drop(columns=["_id"])  # Cargar ranking

historico = sumar_datos_por_streamer(historico)

# sbm.insertar_en_coleccion(db, "historico_streamers_total", historico, False, "id_streamer")

client.close()

# Unir datasets usando el ID del streamer
# historico = historico.drop(columns=['_id'])
# categorias = categorias.drop(columns=['_id'])
# streamers = streamers.drop(columns=['_id'])
# df = historico.merge(categorias, on='id_streamer', how='left')
# df = df.merge(streamers[['id_streamer', 'rank']], on='id_streamer', how='left')  # Agregar ranking

df = historico[["id_streamer", "avgviewers", "peakviewers", "hoursstreamed"]].merge(categorias, on='id_streamer', how='left')
df = df.merge(streamers[['id_streamer', 'rank', 'all_time_peak_viewers', 'total_followers', 'total_views']], on='id_streamer', how='left')  # Agregar ranking

display(df)

# Reemplazar valores no numéricos ('-') por NaN
df.replace(r'(?<!.)-(?!.)', np.nan, regex=True, inplace=True)

# Convertir columnas numéricas al tipo adecuado
num_cols = ['avgviewers', 'peakviewers', 'hoursstreamed', 'all_time_peak_viewers', 'total_followers', 'total_views', 'rank']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Reemplazar NaN en 'rank' con la mediana
# df['rank'] = df['rank'].fillna(df['rank'].median())

# Optimizar memoria reduciendo tipos de datos
df[num_cols] = df[num_cols].astype(np.float32)
df['id_streamer'] = df['id_streamer'].astype(np.int32)

# Rellenar NaN con la mediana de cada columna
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Liberar memoria innecesaria
gc.collect()

# Codificar la categoría como variable numérica
le = LabelEncoder()
df['categoria_codificada'] = le.fit_transform(df['categoria'])

num_cols.append("categoria_codificada")

df["rank"] = df["rank"].max() + df["rank"].min() - df["rank"]
df["rank"] = df["rank"].astype(np.int32)

display(df)

# Ponderar columnas clave antes de normalizar
df['rank'] *= 5  # Dar más peso al ranking
df['total_followers'] *= 3  # Más peso a seguidores
df['total_views'] *= 3  # Más peso a vistas totales

# Normalizar los datos con MinMaxScaler
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Clustering con K-Means
kmeans = KMeans(n_clusters=20, random_state=42)
df['cluster'] = kmeans.fit_predict(df[num_cols])

num_cols.append("cluster")

display(df)

# Entrenar y guardar modelos KNN por categoría
categorias_unicas = df['categoria'].unique()
modelos_knn = {}

for categoria in categorias_unicas:
    df_categoria = df[df['categoria'] == categoria]
    if df_categoria.shape[0] < 5:
        continue
    
    X_categoria = df_categoria[num_cols]
    nn = NearestNeighbors(n_neighbors=min(5, len(df_categoria)), metric='euclidean', n_jobs=14)
    nn.fit(X_categoria)
    
    modelos_knn[categoria] = nn

# Guardar modelos KNN entrenados con el nombre correcto para Streamlit
# joblib.dump(modelos_knn, "../streamlit/modelos/modelo_lastversion.pkl")
print("Ya ha entrenado")
print("Empezando la evaluación")

inertia = kmeans.inertia_  # SSE
silhouette_avg = silhouette_score(df[num_cols], df['cluster'])

print(f"Inertia (SSE): {inertia}")
print(f"Silhouette Score: {silhouette_avg}")

def evaluar_knn(modelos_knn, df:pd.DataFrame, num_cols):
    resultados = []
    
    for categoria, modelo in tqdm(modelos_knn.items()):
        df_categoria = df[df['categoria'] == categoria]
        if df_categoria.shape[0] < 5:
            continue
        
        X_categoria = df_categoria[num_cols].values
        _, indices = modelo.kneighbors(X_categoria)
        
        # Comparar cada punto con sus vecinos
        similitudes = []
        for i, vecinos in enumerate(indices):
            original = X_categoria[i].reshape(1, -1)
            for vecino in vecinos[1:]:  # Omitir el propio punto
                vecino_data = X_categoria[vecino].reshape(1, -1)
                similitudes.append(pairwise.cosine_similarity(original, vecino_data)[0][0])
        
        # Promedio de similitudes
        resultados.append(np.mean(similitudes))
    
    return np.mean(resultados)

similitud_promedio = evaluar_knn(modelos_knn, df, num_cols)
print(f"Similitud promedio de KNN: {similitud_promedio:.4f}")

# Guardar modelos KNN entrenados con el nombre correcto para Streamlit
joblib.dump((modelos_knn, num_cols, kmeans, scaler, le), "../streamlit/modelos/modelo_lastversion.pkl")

Unnamed: 0,id_streamer,avgviewers,peakviewers,hoursstreamed,nombre,categoria,rank,all_time_peak_viewers,total_followers,total_views
0,1,2720713,8649631.0,7592.1,Rubius,Just Chatting,29,383747.0,15600000.0,273000000.0
1,1,2720713,8649631.0,7592.1,Rubius,Minecraft,29,383747.0,15600000.0,273000000.0
2,1,2720713,8649631.0,7592.1,Rubius,Fortnite,29,383747.0,15600000.0,273000000.0
3,1,2720713,8649631.0,7592.1,Rubius,Grand Theft Auto V,29,383747.0,15600000.0,273000000.0
4,1,2720713,8649631.0,7592.1,Rubius,Rust,29,383747.0,15600000.0,273000000.0
...,...,...,...,...,...,...,...,...,...,...
40359,2287,7011,15990.0,819.8,Rl_crr,Fortnite,29906,1185.0,24900.0,0.0
40360,2287,7011,15990.0,819.8,Rl_crr,FIFA 22,29906,1185.0,24900.0,0.0
40361,2287,7011,15990.0,819.8,Rl_crr,Fall Guys,29906,1185.0,24900.0,0.0
40362,2287,7011,15990.0,819.8,Rl_crr,Just Chatting,29906,1185.0,24900.0,0.0


Unnamed: 0,id_streamer,avgviewers,peakviewers,hoursstreamed,nombre,categoria,rank,all_time_peak_viewers,total_followers,total_views,categoria_codificada
0,1,2720713.0,8649631.0,7592.100098,Rubius,Just Chatting,30010,383747.0,15600000.0,273000000.0,1434
1,1,2720713.0,8649631.0,7592.100098,Rubius,Minecraft,30010,383747.0,15600000.0,273000000.0,1761
2,1,2720713.0,8649631.0,7592.100098,Rubius,Fortnite,30010,383747.0,15600000.0,273000000.0,1068
3,1,2720713.0,8649631.0,7592.100098,Rubius,Grand Theft Auto V,30010,383747.0,15600000.0,273000000.0,1190
4,1,2720713.0,8649631.0,7592.100098,Rubius,Rust,30010,383747.0,15600000.0,273000000.0,2406
...,...,...,...,...,...,...,...,...,...,...,...
40359,2287,7011.0,15990.0,819.799988,Rl_crr,Fortnite,133,1185.0,24900.0,0.0,1068
40360,2287,7011.0,15990.0,819.799988,Rl_crr,FIFA 22,133,1185.0,24900.0,0.0,932
40361,2287,7011.0,15990.0,819.799988,Rl_crr,Fall Guys,133,1185.0,24900.0,0.0,966
40362,2287,7011.0,15990.0,819.799988,Rl_crr,Just Chatting,133,1185.0,24900.0,0.0,1434


Unnamed: 0,id_streamer,avgviewers,peakviewers,hoursstreamed,nombre,categoria,rank,all_time_peak_viewers,total_followers,total_views,categoria_codificada,cluster
0,1,0.573280,0.290391,0.227508,Rubius,Just Chatting,1.000000,0.099764,0.906977,0.760446,0.420281,17
1,1,0.573280,0.290391,0.227508,Rubius,Minecraft,1.000000,0.099764,0.906977,0.760446,0.516120,17
2,1,0.573280,0.290391,0.227508,Rubius,Fortnite,1.000000,0.099764,0.906977,0.760446,0.313013,17
3,1,0.573280,0.290391,0.227508,Rubius,Grand Theft Auto V,1.000000,0.099764,0.906977,0.760446,0.348769,17
4,1,0.573280,0.290391,0.227508,Rubius,Rust,1.000000,0.099764,0.906977,0.760446,0.705158,17
...,...,...,...,...,...,...,...,...,...,...,...,...
40359,2287,0.001470,0.000534,0.024406,Rl_crr,Fortnite,0.003469,0.000300,0.001448,0.000000,0.313013,9
40360,2287,0.001470,0.000534,0.024406,Rl_crr,FIFA 22,0.003469,0.000300,0.001448,0.000000,0.273154,9
40361,2287,0.001470,0.000534,0.024406,Rl_crr,Fall Guys,0.003469,0.000300,0.001448,0.000000,0.283118,9
40362,2287,0.001470,0.000534,0.024406,Rl_crr,Just Chatting,0.003469,0.000300,0.001448,0.000000,0.420281,9


Ya ha entrenado
Empezando la evaluación
Inertia (SSE): 927.9440646283829
Silhouette Score: 0.8389025684414436


100%|██████████| 923/923 [01:01<00:00, 15.07it/s]


Similitud promedio de KNN: 0.9587


['../streamlit/modelos/modelo_lastversion.pkl']

In [21]:
from pymongo import MongoClient
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import joblib
import sys
sys.path.append("../")
from src import soporte_bbdd_mongo as sbm
from sklearn.metrics import silhouette_score, pairwise
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

def sumar_datos_por_streamer(df:pd.DataFrame) -> pd.DataFrame:
    # Asegurar que las columnas numéricas están bien definidas
    columnas_numericas = ["avgviewers", "viewersgain", "peakviewers", 
                          "hoursstreamed", "hoursgain", "followers", 
                          "followersgain", "perhour"]
    
    for col in columnas_numericas:
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

    # Agrupar por id_streamer y sumar las columnas numéricas
    df_sumado = df.groupby(["id_streamer", "nombre"], as_index=False)[columnas_numericas].sum().round(2)

    # Ordenar por id_streamer
    df_sumado = df_sumado.sort_values(by="id_streamer").reset_index(drop=True)

    return df_sumado

# Conectar a MongoDB
client, db = sbm.conectar_mongo()

# Cargar datos
historico = pd.DataFrame(list(db.historico_streamers.find())).drop(columns=["_id"])
categorias = pd.DataFrame(list(db.categorias_streameadas.find())).drop(columns=["_id"])
streamers = pd.DataFrame(list(db.streamers.find())).drop(columns=["_id"])  # Cargar ranking

historico = sumar_datos_por_streamer(historico)

# sbm.insertar_en_coleccion(db, "historico_streamers_total", historico, False, "id_streamer")

client.close()

# Unir datasets usando el ID del streamer
# historico = historico.drop(columns=['_id'])
# categorias = categorias.drop(columns=['_id'])
# streamers = streamers.drop(columns=['_id'])
# df = historico.merge(categorias, on='id_streamer', how='left')
# df = df.merge(streamers[['id_streamer', 'rank']], on='id_streamer', how='left')  # Agregar ranking

df = historico[["id_streamer", "avgviewers", "peakviewers", "hoursstreamed"]].merge(categorias, on='id_streamer', how='left')
df = df.merge(streamers[['id_streamer', 'rank', 'all_time_peak_viewers', 'total_followers', 'total_views']], on='id_streamer', how='left')  # Agregar ranking

display(df)

# Reemplazar valores no numéricos ('-') por NaN
df.replace(r'(?<!.)-(?!.)', np.nan, regex=True, inplace=True)

# Convertir columnas numéricas al tipo adecuado
num_cols = ['avgviewers', 'peakviewers', 'hoursstreamed', 'all_time_peak_viewers', 'total_followers', 'total_views', 'rank']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Reemplazar NaN en 'rank' con la mediana
# df['rank'] = df['rank'].fillna(df['rank'].median())

# Optimizar memoria reduciendo tipos de datos
df[num_cols] = df[num_cols].astype(np.float32)
df['id_streamer'] = df['id_streamer'].astype(np.int32)

# Rellenar NaN con la mediana de cada columna
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Liberar memoria innecesaria
gc.collect()

# Codificar la categoría como variable numérica
# le = LabelEncoder()
# df['categoria_codificada'] = le.fit_transform(df['categoria'])

# num_cols.append("categoria_codificada")

df["rank"] = df["rank"].max() + df["rank"].min() - df["rank"]
df["rank"] = df["rank"].astype(np.int32)

display(df)

# Ponderar columnas clave antes de normalizar
df['rank'] *= 5  # Dar más peso al ranking
df['total_followers'] *= 3  # Más peso a seguidores
df['total_views'] *= 3  # Más peso a vistas totales

# Normalizar los datos con MinMaxScaler
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Clustering con K-Means
# kmeans = KMeans(n_clusters=10, random_state=42)
# df['cluster'] = kmeans.fit_predict(df[num_cols])

# num_cols.append("cluster")

display(df)

# Entrenar y guardar modelos KNN por categoría
categorias_unicas = df['categoria'].unique()
modelos_knn = {}

for categoria in categorias_unicas:
    df_categoria = df[df['categoria'] == categoria]
    if df_categoria.shape[0] < 5:
        continue
    
    X_categoria = df_categoria[num_cols]
    nn = NearestNeighbors(n_neighbors=min(5, len(df_categoria)), metric='manhattan', n_jobs=14)
    nn.fit(X_categoria)
    
    modelos_knn[categoria] = nn

# Guardar modelos KNN entrenados con el nombre correcto para Streamlit
# joblib.dump(modelos_knn, "../streamlit/modelos/modelo_lastversion.pkl")
print("Ya ha entrenado")
print("Empezando la evaluación")

# inertia = kmeans.inertia_  # SSE
# silhouette_avg = silhouette_score(df[num_cols], df['cluster'])

# print(f"Inertia (SSE): {inertia}")
# print(f"Silhouette Score: {silhouette_avg}")

def evaluar_knn(modelos_knn, df:pd.DataFrame, num_cols):
    resultados = []
    
    for categoria, modelo in tqdm(modelos_knn.items()):
        df_categoria = df[df['categoria'] == categoria]
        if df_categoria.shape[0] < 5:
            continue
        
        X_categoria = df_categoria[num_cols].values
        _, indices = modelo.kneighbors(X_categoria)
        
        # Comparar cada punto con sus vecinos
        similitudes = []
        for i, vecinos in enumerate(indices):
            original = X_categoria[i].reshape(1, -1)
            for vecino in vecinos[1:]:  # Omitir el propio punto
                vecino_data = X_categoria[vecino].reshape(1, -1)
                similitudes.append(pairwise.cosine_similarity(original, vecino_data)[0][0])
        
        # Promedio de similitudes
        resultados.append(np.mean(similitudes))
    
    return np.mean(resultados)

similitud_promedio = evaluar_knn(modelos_knn, df, num_cols)
print(f"Similitud promedio de KNN: {similitud_promedio:.4f}")

# Guardar modelos KNN entrenados con el nombre correcto para Streamlit
joblib.dump((modelos_knn, num_cols, scaler), "../streamlit/modelos/modelo_basico.pkl")

Unnamed: 0,id_streamer,avgviewers,peakviewers,hoursstreamed,nombre,categoria,rank,all_time_peak_viewers,total_followers,total_views
0,1,2720713,8649631.0,7592.1,Rubius,Just Chatting,29,383747.0,15600000.0,273000000.0
1,1,2720713,8649631.0,7592.1,Rubius,Minecraft,29,383747.0,15600000.0,273000000.0
2,1,2720713,8649631.0,7592.1,Rubius,Fortnite,29,383747.0,15600000.0,273000000.0
3,1,2720713,8649631.0,7592.1,Rubius,Grand Theft Auto V,29,383747.0,15600000.0,273000000.0
4,1,2720713,8649631.0,7592.1,Rubius,Rust,29,383747.0,15600000.0,273000000.0
...,...,...,...,...,...,...,...,...,...,...
40359,2287,7011,15990.0,819.8,Rl_crr,Fortnite,29906,1185.0,24900.0,0.0
40360,2287,7011,15990.0,819.8,Rl_crr,FIFA 22,29906,1185.0,24900.0,0.0
40361,2287,7011,15990.0,819.8,Rl_crr,Fall Guys,29906,1185.0,24900.0,0.0
40362,2287,7011,15990.0,819.8,Rl_crr,Just Chatting,29906,1185.0,24900.0,0.0


Unnamed: 0,id_streamer,avgviewers,peakviewers,hoursstreamed,nombre,categoria,rank,all_time_peak_viewers,total_followers,total_views
0,1,2720713.0,8649631.0,7592.100098,Rubius,Just Chatting,30010,383747.0,15600000.0,273000000.0
1,1,2720713.0,8649631.0,7592.100098,Rubius,Minecraft,30010,383747.0,15600000.0,273000000.0
2,1,2720713.0,8649631.0,7592.100098,Rubius,Fortnite,30010,383747.0,15600000.0,273000000.0
3,1,2720713.0,8649631.0,7592.100098,Rubius,Grand Theft Auto V,30010,383747.0,15600000.0,273000000.0
4,1,2720713.0,8649631.0,7592.100098,Rubius,Rust,30010,383747.0,15600000.0,273000000.0
...,...,...,...,...,...,...,...,...,...,...
40359,2287,7011.0,15990.0,819.799988,Rl_crr,Fortnite,133,1185.0,24900.0,0.0
40360,2287,7011.0,15990.0,819.799988,Rl_crr,FIFA 22,133,1185.0,24900.0,0.0
40361,2287,7011.0,15990.0,819.799988,Rl_crr,Fall Guys,133,1185.0,24900.0,0.0
40362,2287,7011.0,15990.0,819.799988,Rl_crr,Just Chatting,133,1185.0,24900.0,0.0


Unnamed: 0,id_streamer,avgviewers,peakviewers,hoursstreamed,nombre,categoria,rank,all_time_peak_viewers,total_followers,total_views
0,1,0.573280,0.290391,0.227508,Rubius,Just Chatting,1.000000,0.099764,0.906977,0.760446
1,1,0.573280,0.290391,0.227508,Rubius,Minecraft,1.000000,0.099764,0.906977,0.760446
2,1,0.573280,0.290391,0.227508,Rubius,Fortnite,1.000000,0.099764,0.906977,0.760446
3,1,0.573280,0.290391,0.227508,Rubius,Grand Theft Auto V,1.000000,0.099764,0.906977,0.760446
4,1,0.573280,0.290391,0.227508,Rubius,Rust,1.000000,0.099764,0.906977,0.760446
...,...,...,...,...,...,...,...,...,...,...
40359,2287,0.001470,0.000534,0.024406,Rl_crr,Fortnite,0.003469,0.000300,0.001448,0.000000
40360,2287,0.001470,0.000534,0.024406,Rl_crr,FIFA 22,0.003469,0.000300,0.001448,0.000000
40361,2287,0.001470,0.000534,0.024406,Rl_crr,Fall Guys,0.003469,0.000300,0.001448,0.000000
40362,2287,0.001470,0.000534,0.024406,Rl_crr,Just Chatting,0.003469,0.000300,0.001448,0.000000


Ya ha entrenado
Empezando la evaluación


100%|██████████| 923/923 [00:58<00:00, 15.72it/s]


Similitud promedio de KNN: 0.9552


['../streamlit/modelos/modelo_basico.pkl']