In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
df = pd.read_csv('data (2).csv')

In [3]:
df.head()

Unnamed: 0,league,season,date,team,h_a,result,pts,goals_scored,goals_missed,deep_passes,deep_passes_allowed,ppda,oppda
0,Bundesliga,2014,2014-08-22 19:30:00,Bayern Munich,h,w,3,2,1,5,4,9.625,21.85
1,Bundesliga,2014,2014-08-30 17:30:00,Bayern Munich,a,d,1,1,1,10,1,4.756098,17.695652
2,Bundesliga,2014,2014-09-13 14:30:00,Bayern Munich,h,w,3,2,0,13,3,5.060606,16.961538
3,Bundesliga,2014,2014-09-20 14:30:00,Bayern Munich,a,d,1,0,0,6,2,4.423077,9.446809
4,Bundesliga,2014,2014-09-23 19:00:00,Bayern Munich,h,w,3,4,0,23,2,4.25,44.8


# Misión 1

# Organizamos la info

In [4]:
df_uso = df.copy()

In [5]:
# Local
df_local = df[df['h_a'] == 'h'].copy()
df_local.rename(columns={'team': 'local_team', 'result': 'local_result'}, inplace=True)
df_local.drop(columns=['h_a'], inplace=True)
# Visita
df_visit = df[df['h_a'] == 'a'].copy()
df_visit.rename(columns={'team': 'visitor_team', 'result': 'visitor_result'}, inplace=True)
df_visit.drop(columns=['h_a'], inplace=True)

# Merge
df_combined = df_local.merge(df_visit, on=['league', 'season', 'date'], how='inner')

df_combined['result'] = df_combined.apply(lambda row: 'local' if row['local_result'] > row['visitor_result']
                                          else ('visitante' if row['local_result'] < row['visitor_result'] else 'empate'), axis=1)

df_combined.drop(columns=['local_result', 'visitor_result'], inplace=True)
df_combined.sort_values(by=['season', 'date'], inplace=True)
df_combined.reset_index(drop=True, inplace=True)

df_combined.head()

Unnamed: 0,league,season,date,local_team,pts_x,goals_scored_x,goals_missed_x,deep_passes_x,deep_passes_allowed_x,ppda_x,oppda_x,visitor_team,pts_y,goals_scored_y,goals_missed_y,deep_passes_y,deep_passes_allowed_y,ppda_y,oppda_y,result
0,RFPL,2014,2014-08-01 17:00:00,Rubin Kazan,0,0,4,6,1,5.538462,3.97561,Spartak Moscow,3,4,0,1,6,3.97561,5.538462,visitante
1,RFPL,2014,2014-08-02 13:30:00,Ural,0,2,3,3,8,6.26087,10.478261,Mordovya,3,3,2,8,3,10.478261,6.26087,visitante
2,RFPL,2014,2014-08-02 16:00:00,Arsenal Tula,0,0,4,1,11,11.826087,7.954545,Zenit St. Petersburg,3,4,0,11,1,7.954545,11.826087,visitante
3,RFPL,2014,2014-08-02 18:30:00,CSKA Moscow,3,4,1,17,1,7.526316,19.545455,Torpedo Moscow,0,1,4,1,17,19.545455,7.526316,local
4,RFPL,2014,2014-08-03 13:30:00,Dinamo Moscow,3,7,3,6,6,4.789474,16.0,FC Rostov,0,3,7,6,6,16.0,4.789474,local


In [6]:
df_uso_1 = df_combined.copy()
df_uso_1.drop(columns=['league', 'season', 'date', 'pts_x', 'goals_scored_x', 'goals_missed_x', 'deep_passes_x', 'deep_passes_allowed_x', 'ppda_x', 'oppda_x', 'pts_y', 'goals_scored_y', 'goals_missed_y', 'deep_passes_y', 'deep_passes_allowed_y', 'ppda_y', 'oppda_y'], inplace=True)

In [7]:
df_uso_1.head()

Unnamed: 0,local_team,visitor_team,result
0,Rubin Kazan,Spartak Moscow,visitante
1,Ural,Mordovya,visitante
2,Arsenal Tula,Zenit St. Petersburg,visitante
3,CSKA Moscow,Torpedo Moscow,local
4,Dinamo Moscow,FC Rostov,local


In [8]:
encoder = LabelEncoder()

df_uso_1['local_team'] = encoder.fit_transform(df_uso_1['local_team'])
df_uso_1['visitor_team'] = encoder.fit_transform(df_uso_1['visitor_team'])
df_uso_1['result'] = encoder.fit_transform(df_uso_1['result'])

df_uso_1.head()

Unnamed: 0,local_team,visitor_team,result
0,129,141,2
1,156,102,2
2,8,167,2
3,27,149,1
4,42,53,1


In [9]:
X_train, X_test, y_train, y_test = train_test_split(df_uso_1[['local_team', 'visitor_team']], df_uso_1['result'], test_size=0.3)

In [10]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)

y_pred_validation = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_validation)
precision = precision_score(y_test, y_pred_validation, average='macro')
recall = recall_score(y_test, y_pred_validation, average='macro')
f1 = f1_score(y_test, y_pred_validation, average='macro')

print("Métricas de validación:")
print(f"Accuracy: {accuracy}")
print(f"Precisión: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Métricas de validación:
Accuracy: 0.3881371451826229
Precisión: 0.3738514332639136
Recall: 0.36572252497992025
F1 Score: 0.36337841066444865


In [11]:
probs = model.predict_proba(X_test)

prob_clase_1 = probs[:, 2].mean()
prob_clase_1

0.28549741520336325

# Misión 2

In [12]:
df.head()

Unnamed: 0,league,season,date,team,h_a,result,pts,goals_scored,goals_missed,deep_passes,deep_passes_allowed,ppda,oppda
0,Bundesliga,2014,2014-08-22 19:30:00,Bayern Munich,h,w,3,2,1,5,4,9.625,21.85
1,Bundesliga,2014,2014-08-30 17:30:00,Bayern Munich,a,d,1,1,1,10,1,4.756098,17.695652
2,Bundesliga,2014,2014-09-13 14:30:00,Bayern Munich,h,w,3,2,0,13,3,5.060606,16.961538
3,Bundesliga,2014,2014-09-20 14:30:00,Bayern Munich,a,d,1,0,0,6,2,4.423077,9.446809
4,Bundesliga,2014,2014-09-23 19:00:00,Bayern Munich,h,w,3,4,0,23,2,4.25,44.8


In [13]:
df_uso_2 = df_combined.copy()
df_uso_2.drop(columns=['season', 'date', 'pts_x', 'goals_scored_x', 'goals_missed_x', 'deep_passes_x', 'deep_passes_allowed_x', 'ppda_x', 'oppda_x'], inplace=True)

In [34]:
df_groups = pd.DataFrame(columns=['pts_y', 'goals_scored_y', 'goals_missed_y', 'deep_passes_y', 'deep_passes_allowed_y', 'ppda_y', 'oppda_y', 'league'])

# Obtener la lista única de ligas en el DataFrame.
unique_leagues = df_uso_2['league'].unique()

# Iterar a través de las ligas únicas y crear grupos para cada liga.
for league in unique_leagues:
    # Filtrar los partidos de la liga actual.
    league_matches = df_uso_2[df_uso_2['league'] == league]

    # Barajar los partidos de la liga actual de manera aleatoria.
    league_matches = league_matches.sample(frac=1, random_state=42)

    # Dividir los partidos en grupos de 10 (o menos si no hay suficientes partidos).
    num_groups = len(league_matches) // 10
    for i in range(num_groups):
        group = league_matches.iloc[i*10:(i+1)*10].mean(numeric_only=True)
        group['league'] = league
        # Agregamos el grupo al dataframe
        df_groups = pd.concat([df_groups, group.to_frame().T], ignore_index=True)

df_groups

Unnamed: 0,pts_y,goals_scored_y,goals_missed_y,deep_passes_y,deep_passes_allowed_y,ppda_y,oppda_y,league
0,1.5,1.9,1.7,6.7,7.1,11.924643,10.488593,RFPL
1,1.1,0.7,0.9,7.0,6.6,10.410623,11.256577,RFPL
2,1.6,1.0,1.2,7.6,6.7,10.094294,11.237744,RFPL
3,0.7,0.7,1.3,4.7,7.1,13.129183,8.583688,RFPL
4,0.7,0.9,2.0,4.2,7.4,16.992294,8.748168,RFPL
...,...,...,...,...,...,...,...,...
3267,1.4,1.5,1.4,6.5,5.8,14.440461,11.779194,Serie_A
3268,1.2,0.9,1.1,5.8,6.8,10.546742,7.90127,Serie_A
3269,1.1,1.3,2.3,5.1,7.7,14.233723,9.369363,Serie_A
3270,1.7,1.5,1.0,4.8,4.4,9.352101,11.654293,Serie_A


In [15]:
df_groups.head()

In [38]:
# Normalizar
scaler = StandardScaler()
columnas = ['pts_y',	'goals_scored_y',	'goals_missed_y',	'deep_passes_y',	'deep_passes_allowed_y',	'ppda_y', 'oppda_y']

df_groups[columnas] =scaler.fit_transform(df_groups[columnas])

In [39]:
encoder = LabelEncoder()

df_groups['league'] = encoder.fit_transform(df_groups['league'])

In [45]:
X_train, X_test, y_train, y_test = train_test_split(df_groups[columnas], df_groups['league'], test_size=0.3)

In [46]:
model = DecisionTreeClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print("Métricas de validación:")
print(f"Accuracy: {accuracy}")
print(f"Precisión: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Métricas de validación:
Accuracy: 0.31873727087576376
Precisión: 0.27307835994975177
Recall: 0.27304182478986244
F1 Score: 0.2728783087135413
