In [127]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

In [128]:
url = "../data/data_UFC_EDA/Enfrentamientos/Enfrentamientos.csv"
enfrentamientos = pd.read_csv(url)
enfrentamientos.head()

Unnamed: 0,event,date,location,r_fighter,b_fighter,status,r_kd,b_kd,r_str,b_str,r_td,b_td,r_sub,b_sub,weight_class,method,method_detailed,round,time
0,UFC Fight Night: Ribas vs. Namajunas,3/23/2024,"Las Vegas, Nevada, USA",Rose Namajunas,Amanda Ribas,win,0.0,0.0,93.0,83.0,1.0,4.0,0.0,0.0,Women's Flyweight,U-DEC,,5.0,5:00
1,UFC Fight Night: Ribas vs. Namajunas,3/23/2024,"Las Vegas, Nevada, USA",Karl Williams,Justin Tafa,win,0.0,0.0,40.0,21.0,7.0,0.0,1.0,0.0,Heavyweight,U-DEC,,3.0,5:00
2,UFC Fight Night: Ribas vs. Namajunas,3/23/2024,"Las Vegas, Nevada, USA",Edmen Shahbazyan,AJ Dobson,win,1.0,0.0,27.0,15.0,1.0,0.0,0.0,0.0,Middleweight,KO/TKO,Punches,1.0,4:33
3,UFC Fight Night: Ribas vs. Namajunas,3/23/2024,"Las Vegas, Nevada, USA",Payton Talbott,Cameron Saaiman,win,1.0,0.0,79.0,31.0,0.0,0.0,0.0,0.0,Bantamweight,KO/TKO,Punches,2.0,0:21
4,UFC Fight Night: Ribas vs. Namajunas,3/23/2024,"Las Vegas, Nevada, USA",Youssef Zalal,Billy Quarantillo,win,0.0,0.0,33.0,10.0,2.0,0.0,2.0,0.0,Featherweight,SUB,Rear Naked Choke,2.0,1:50


In [139]:
enfrentamientos.columns

Index(['event', 'date', 'location', 'r_fighter', 'b_fighter', 'r_kd', 'b_kd',
       'r_str', 'b_str', 'r_td', 'b_td', 'r_sub', 'b_sub', 'method', 'round',
       'time'],
      dtype='object')

In [129]:
enfrentamientos['date'] = pd.to_datetime(enfrentamientos['date'], format='%m/%d/%Y')

In [130]:
enfrentamientos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7582 entries, 0 to 7581
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   event            7497 non-null   object        
 1   date             7497 non-null   datetime64[ns]
 2   location         7497 non-null   object        
 3   r_fighter        7497 non-null   object        
 4   b_fighter        7497 non-null   object        
 5   status           7582 non-null   object        
 6   r_kd             7476 non-null   float64       
 7   b_kd             7476 non-null   float64       
 8   r_str            7476 non-null   float64       
 9   b_str            7476 non-null   float64       
 10  r_td             7476 non-null   float64       
 11  b_td             7476 non-null   float64       
 12  r_sub            7476 non-null   float64       
 13  b_sub            7476 non-null   float64       
 14  weight_class     7497 non-null   object 

In [131]:
# # Hay que tratar Los NATS
# enfrentamientos[['minutes', 'seconds']] = enfrentamientos['time'].str.split(':', expand=True)
# enfrentamientos.info()
# enfrentamientos['time_sec'] = enfrentamientos['minutes'].astype(int) * 60 + enfrentamientos['seconds'].astype(int)

# # # Mostrar los primeros resultados para verificar
# # enfrentamientos[['time', 'time_sec']].head()


In [132]:
from sklearn.model_selection import train_test_split

In [133]:
enfrentamientos["method"] = enfrentamientos["method"].str.strip().str.upper()
method_dict = {"U-DEC": 1,
               "KO/TKO": 2,
               "SUB": 3,
               "DQ": 4,
               "S-DEC" : 5,
               "M-DEC" : 6}
enfrentamientos["method"] = enfrentamientos["method"].map(method_dict)

In [134]:
enfrentamientos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7582 entries, 0 to 7581
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   event            7497 non-null   object        
 1   date             7497 non-null   datetime64[ns]
 2   location         7497 non-null   object        
 3   r_fighter        7497 non-null   object        
 4   b_fighter        7497 non-null   object        
 5   status           7582 non-null   object        
 6   r_kd             7476 non-null   float64       
 7   b_kd             7476 non-null   float64       
 8   r_str            7476 non-null   float64       
 9   b_str            7476 non-null   float64       
 10  r_td             7476 non-null   float64       
 11  b_td             7476 non-null   float64       
 12  r_sub            7476 non-null   float64       
 13  b_sub            7476 non-null   float64       
 14  weight_class     7497 non-null   object 

In [135]:
enfrentamientos.drop(columns=["status","weight_class","method_detailed"], inplace=True)

In [136]:
enfrentamientos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7582 entries, 0 to 7581
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   event      7497 non-null   object        
 1   date       7497 non-null   datetime64[ns]
 2   location   7497 non-null   object        
 3   r_fighter  7497 non-null   object        
 4   b_fighter  7497 non-null   object        
 5   r_kd       7476 non-null   float64       
 6   b_kd       7476 non-null   float64       
 7   r_str      7476 non-null   float64       
 8   b_str      7476 non-null   float64       
 9   r_td       7476 non-null   float64       
 10  b_td       7476 non-null   float64       
 11  r_sub      7476 non-null   float64       
 12  b_sub      7476 non-null   float64       
 13  method     7495 non-null   float64       
 14  round      7497 non-null   float64       
 15  time       7497 non-null   object        
dtypes: datetime64[ns](1), float64(10), object(

In [137]:
enfrentamientos

Unnamed: 0,event,date,location,r_fighter,b_fighter,r_kd,b_kd,r_str,b_str,r_td,b_td,r_sub,b_sub,method,round,time
0,UFC Fight Night: Ribas vs. Namajunas,2024-03-23,"Las Vegas, Nevada, USA",Rose Namajunas,Amanda Ribas,0.0,0.0,93.0,83.0,1.0,4.0,0.0,0.0,1.0,5.0,5:00
1,UFC Fight Night: Ribas vs. Namajunas,2024-03-23,"Las Vegas, Nevada, USA",Karl Williams,Justin Tafa,0.0,0.0,40.0,21.0,7.0,0.0,1.0,0.0,1.0,3.0,5:00
2,UFC Fight Night: Ribas vs. Namajunas,2024-03-23,"Las Vegas, Nevada, USA",Edmen Shahbazyan,AJ Dobson,1.0,0.0,27.0,15.0,1.0,0.0,0.0,0.0,2.0,1.0,4:33
3,UFC Fight Night: Ribas vs. Namajunas,2024-03-23,"Las Vegas, Nevada, USA",Payton Talbott,Cameron Saaiman,1.0,0.0,79.0,31.0,0.0,0.0,0.0,0.0,2.0,2.0,0:21
4,UFC Fight Night: Ribas vs. Namajunas,2024-03-23,"Las Vegas, Nevada, USA",Youssef Zalal,Billy Quarantillo,0.0,0.0,33.0,10.0,2.0,0.0,2.0,0.0,3.0,2.0,1:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7577,,NaT,,,,,,,,,,,,,,
7578,,NaT,,,,,,,,,,,,,,
7579,,NaT,,,,,,,,,,,,,,
7580,,NaT,,,,,,,,,,,,,,


In [138]:
X = enfrentamientos.drop(columns=['round', 'time','date'])
y = enfrentamientos["round"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred_proba = model.predict_proba(X_test)

auc = roc_auc_score(y_test, y_pred_proba, average="weighted", multi_class="ovr")  # o "ovo"
print(f"AUC: {auc}")


ValueError: could not convert string to float: 'UFC 220: Miocic vs. Ngannou'

In [None]:
# enfrentamientos["kd"] = enfrentamientos["r_kd"].fillna(0) - enfrentamientos["b_kd"].fillna(0)
# enfrentamientos["str"] = enfrentamientos["r_str"].fillna(0) - enfrentamientos["b_str"].fillna(0)
# enfrentamientos["td"] = enfrentamientos["r_td"].fillna(0) - enfrentamientos["b_td"].fillna(0)
# enfrentamientos["sub"] = enfrentamientos["r_sub"].fillna(0) - enfrentamientos["b_sub"].fillna(0)

# enfrentamientos.drop(columns=["r_kd", "b_kd"], inplace=True)
# enfrentamientos.drop(columns=["r_str", "b_str"], inplace=True)
# enfrentamientos.drop(columns=["r_td", "b_td"], inplace=True)
# enfrentamientos.drop(columns=["r_sub", "b_sub"], inplace=True)


In [None]:
enfrentamientos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7582 entries, 0 to 7581
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   event      7497 non-null   object        
 1   date       7497 non-null   datetime64[ns]
 2   location   7497 non-null   object        
 3   r_fighter  7497 non-null   object        
 4   b_fighter  7497 non-null   object        
 5   r_kd       7476 non-null   float64       
 6   b_kd       7476 non-null   float64       
 7   r_str      7476 non-null   float64       
 8   b_str      7476 non-null   float64       
 9   r_td       7476 non-null   float64       
 10  b_td       7476 non-null   float64       
 11  r_sub      7476 non-null   float64       
 12  b_sub      7476 non-null   float64       
 13  method     7495 non-null   float64       
 14  round      7497 non-null   float64       
dtypes: datetime64[ns](1), float64(10), object(4)
memory usage: 888.6+ KB


In [None]:
enfrentamientos = enfrentamientos.dropna()

In [None]:
train, test_1 = train_test_split(enfrentamientos, test_size=0.2, random_state=42)

print(f"Tamaño del conjunto de entrenamiento: {train.shape[0]} filas")
print(f"Tamaño del conjunto de prueba: {test_1.shape[0]} filas")

test=test_1.drop(columns="round")

Tamaño del conjunto de entrenamiento: 5979 filas
Tamaño del conjunto de prueba: 1495 filas


In [None]:
enfrentamientos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7474 entries, 0 to 7496
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   event      7474 non-null   object        
 1   date       7474 non-null   datetime64[ns]
 2   location   7474 non-null   object        
 3   r_fighter  7474 non-null   object        
 4   b_fighter  7474 non-null   object        
 5   r_kd       7474 non-null   float64       
 6   b_kd       7474 non-null   float64       
 7   r_str      7474 non-null   float64       
 8   b_str      7474 non-null   float64       
 9   r_td       7474 non-null   float64       
 10  b_td       7474 non-null   float64       
 11  r_sub      7474 non-null   float64       
 12  b_sub      7474 non-null   float64       
 13  method     7474 non-null   float64       
 14  round      7474 non-null   float64       
dtypes: datetime64[ns](1), float64(10), object(4)
memory usage: 934.2+ KB


In [None]:
import pickle

In [None]:
list(train.columns)

['event',
 'date',
 'location',
 'r_fighter',
 'b_fighter',
 'r_kd',
 'b_kd',
 'r_str',
 'b_str',
 'r_td',
 'b_td',
 'r_sub',
 'b_sub',
 'method',
 'round']

In [None]:
# def calcular_auc(train, target_column, drop_columns, test_size=0.2, random_state=42, max_iter=1000):
#     """
#     Calcula el AUC para un modelo de regresión logística.
    
#     Parameters:
#     - train: DataFrame que contiene los datos de entrenamiento.
#     - target_column: Nombre de la columna objetivo (target).
#     - drop_columns: Lista de nombres de las columnas que se deben eliminar de X.
#     - test_size: Proporción de los datos para el conjunto de prueba (default: 0.2).
#     - random_state: Semilla para la partición aleatoria (default: 42).
#     - max_iter: Número máximo de iteraciones para el modelo (default: 1000).
    
#     Returns:
#     - AUC: El valor calculado de AUC.
#     """
#     # Separar X (características) y y (target)
#     X = train.drop(columns=drop_columns)
#     y = train[target_column]

#     # Dividir en conjuntos de entrenamiento y prueba
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

#     # Crear y entrenar el modelo de regresión logística
#     model = LogisticRegression(max_iter=max_iter)
#     model.fit(X_train, y_train)

#     # Predecir probabilidades en el conjunto de prueba
#     y_pred_proba = model.predict_proba(X_test)

#     # Calcular el AUC (Área bajo la curva ROC)
#     auc = roc_auc_score(y_test, y_pred_proba, average="weighted", multi_class="ovr")  # o "ovo"

#     return auc

# # Ejemplo de uso:
# auc = calcular_auc(train, target_column="round", drop_columns=['round', 'time', 'date'])
# print(f"AUC: {auc}")

In [None]:
X = train.drop(columns=['round', 'time','date'])
y = train["round"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred_proba = model.predict_proba(X_test)

auc = roc_auc_score(y_test, y_pred_proba, average="weighted", multi_class="ovr")  # o "ovo"
print(f"AUC: {auc}")

#Guardarlo
with open('ufc_model.pkl',"wb") as archivo_salida:
    pickle.dump(model, archivo_salida)

print("Modelo guardado como ufc_model.pkl")

KeyError: "['time'] not found in axis"

In [None]:
test_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1499 entries, 6416 to 3742
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1499 non-null   datetime64[ns]
 1   method  1499 non-null   float64       
 2   round   1499 non-null   float64       
 3   time    1499 non-null   object        
 4   kd      1499 non-null   float64       
 5   str     1499 non-null   float64       
 6   td      1499 non-null   float64       
 7   sub     1499 non-null   float64       
dtypes: datetime64[ns](1), float64(6), object(1)
memory usage: 105.4+ KB


In [None]:
url = "../data/data_UFC_EDA/Enfrentamientos/Enfrentamientos.csv"
df_stream = pd.read_csv(url)

In [None]:
test_1.columns

Index(['date', 'method', 'round', 'time', 'kd', 'str', 'td', 'sub'], dtype='object')

In [None]:
df_stream.drop(columns=["event", "location","status","weight_class","method_detailed","time"], inplace=True)

In [None]:
df_stream = df_stream.dropna()
train, test = train_test_split(df_stream, test_size=0.2, random_state=42)

print(f"Tamaño del conjunto de entrenamiento: {train.shape[0]} filas")
print(f"Tamaño del conjunto de prueba: {test.shape[0]} filas")

test=test.drop(columns="round")

Tamaño del conjunto de entrenamiento: 5980 filas
Tamaño del conjunto de prueba: 1496 filas


In [None]:
test["method"] = test["method"].str.strip().str.upper()
method_dict = {"U-DEC": 1,
               "KO/TKO": 2,
               "SUB": 3,
               "DQ": 4,
               "S-DEC" : 5,
               "M-DEC" : 6}
test["method"] = test["method"].map(method_dict)
test['date'] = pd.to_datetime(test['date'], format='%m/%d/%Y')

In [None]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1496 entries, 2191 to 805
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       1496 non-null   datetime64[ns]
 1   r_fighter  1496 non-null   object        
 2   b_fighter  1496 non-null   object        
 3   r_kd       1496 non-null   float64       
 4   b_kd       1496 non-null   float64       
 5   r_str      1496 non-null   float64       
 6   b_str      1496 non-null   float64       
 7   r_td       1496 non-null   float64       
 8   b_td       1496 non-null   float64       
 9   r_sub      1496 non-null   float64       
 10  b_sub      1496 non-null   float64       
 11  method     1495 non-null   float64       
dtypes: datetime64[ns](1), float64(9), object(2)
memory usage: 151.9+ KB


In [None]:
# test.to_csv("../data/test_stream.csv")