In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from joblib import Parallel, delayed
import json
import os
import sys
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from random import choice
from time import time
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Load the raw match export.
df = pd.read_csv("datos_fotmob_completo.csv")

# Rename the score/team columns to the `<name>_home` / `<name>_away` convention
# and order the matches by date, from the oldest to the newest.
df = df.rename(columns={
    "homeTeam_score": "score_home",
    "awayTeam_score": "score_away",
    "homeTeam": "team_home",
    "awayTeam": "team_away",
}).sort_values(by="date", ignore_index=True)

df = df.drop(columns=[
    "discipline_away", "discipline_home",
    "duels_home", "duels_away",
    "defense_home", "defense_away",
    "shots_home", "shots_away",
    # The following drop is temporary (for the time being).
    "accurate_passes_home", "accurate_passes_away",
    "long_balls_accurate_home", "long_balls_accurate_away",
    "accurate_crosses_home", "accurate_crosses_away",
    "tackles_succeeded_home", "tackles_succeeded_away",
    "ground_duels_won_home", "ground_duels_won_away",
    "aerials_won_home", "aerials_won_away",
    "dribbles_succeeded_home", "dribbles_succeeded_away",
    # League and team filters.
    "leagueId", "parentLeagueId", "leagueName",
    "homeIdTeam", "awayIdTeam",
])
# TODO: this drop still has to be discussed.
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
df.info()

In [None]:
# Simple statistics: check whether playing at home affects the result.
# These counters are not read by any cell visible here; they are kept in case
# a later cell relies on them.
home_win = 0
away_win = 0
draw = 0

total = len(df)
# The mean of a boolean mask is the fraction of True values, so each value is a
# fraction in [0, 1] (not a 0-100 percentage). Using the vectorised mean avoids
# iterating the Series with the builtin `sum`, and yields NaN instead of raising
# ZeroDivisionError when `df` has no rows.
home_win_percentage = (df["score_home"] > df["score_away"]).mean()
away_win_percentage = (df["score_home"] < df["score_away"]).mean()
draw_perentage = (df["score_home"] == df["score_away"]).mean()
# Correctly spelled alias; the misspelled name above is kept because the cell
# that prints the results refers to it.
draw_percentage = draw_perentage

In [None]:
# Show the home-win, away-win and draw fractions, one per line.
print(home_win_percentage, away_win_percentage, draw_perentage, sep="\n")

In [None]:
def get_features(data, start=4, stop=54):
    """Return a slice of one match row as a single-row DataFrame.

    Parameters
    ----------
    data : pandas.Series
        One match row; ``search_features`` passes rows obtained with
        ``DataFrame.iloc[j]``.
    start, stop : int, optional
        Bounds of the slice ``data[start:stop]`` that is kept. The defaults
        (4 and 54) reproduce the original hard-coded window of 50 entries.
        # NOTE(review): presumably these are the per-match statistic columns
        # that follow the date/team/score columns -- confirm against the CSV.

    Returns
    -------
    pandas.DataFrame
        One row (indexed by the name of ``data``) whose columns are the
        labels of the selected entries.
    """
    return pd.DataFrame(data[start:stop]).T


def _team_history_features(team, df, last_games):
    """Collect the features of the ``last_games`` matches preceding the newest one of ``team``.

    Returns a DataFrame with one row per previous match (newest first), each
    extended with ``played_home`` / ``played_away`` flags, or ``None`` when
    ``df`` does not hold enough matches of ``team``.
    """
    # Narrow the search to the matches this team took part in, newest first.
    team_games = pd.concat([df[df["team_home"] == team],
                            df[df["team_away"] == team]])
    team_games = team_games.drop_duplicates().sort_values(by="date", ascending=False)

    # Position 0 is taken to be the match being described, so positions
    # 1..last_games must exist.
    if len(team_games) <= last_games:
        return None

    frames = []
    for j in range(1, last_games + 1):
        game = team_games.iloc[j]
        features = get_features(game)
        # Every row in team_games has `team` on one of the two sides.
        played_home = int(game["team_home"] == team)
        features["played_home"] = played_home
        features["played_away"] = 1 - played_home
        frames.append(features)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames) if frames else pd.DataFrame()


def search_features(i, df, last_games, max_score=4):
    """Build the training row for the match at position ``i`` of ``df``.

    The row holds the features of the previous ``last_games`` matches of the
    home team followed by those of the away team, plus the outcome labels.

    Parameters
    ----------
    i : int
        Position (``iloc``) of the match in ``df``.
    df : pandas.DataFrame
        Matches with at least ``date``, ``team_home``, ``team_away``,
        ``score_home`` and ``score_away`` columns. The newest match of each
        team in ``df`` is treated as match ``i`` itself and skipped; the
        callers in this notebook pass ``df_test.head(i + 1)``.
        # NOTE(review): if ``df`` contained matches later than ``i`` they
        # would be used as "history" -- confirm callers always truncate.
    last_games : int
        Number of previous matches to use per team.
    max_score : int, optional
        Scores at or above this value are collapsed into ``"<max_score>>"``
        in ``Resultado_Numerico`` (default 4, the original hard-coded value).

    Returns
    -------
    pandas.DataFrame
        A single-row frame produced by ``stack().to_frame().T`` with the added
        columns ``Resultado`` ("W" home win, "D" draw, "L" home loss; not set
        when the scores cannot be compared, e.g. NaN) and
        ``Resultado_Numerico``. An empty DataFrame is returned when either
        team has fewer than ``last_games`` previous matches in ``df``.

    Notes
    -----
    Only the "not enough history" case yields an empty frame. Other errors
    (for example a missing column) now propagate instead of being silently
    swallowed by a bare ``except``.
    """
    match = df.iloc[i]

    data_home = _team_history_features(match["team_home"], df, last_games)
    if data_home is None:
        return pd.DataFrame()
    data_away = _team_history_features(match["team_away"], df, last_games)
    if data_away is None:
        return pd.DataFrame()

    # Home-team games first, then away-team games, renumbered 0..2*last_games-1
    # and flattened into one row.
    data = pd.concat([data_home, data_away]).reset_index(drop=True)
    data = data.stack().to_frame().T

    home_score = match["score_home"]
    away_score = match["score_away"]

    # Outcome from the home team's point of view:
    # W = home win, D = draw, L = home loss.
    if home_score > away_score:
        data["Resultado"] = "W"
    elif home_score == away_score:
        data["Resultado"] = "D"
    elif home_score < away_score:
        data["Resultado"] = "L"

    # Numeric outcome, with scores capped at max_score.
    if home_score >= max_score and away_score < max_score:
        data["Resultado_Numerico"] = f"{max_score}>-{away_score}"
    elif home_score < max_score and away_score >= max_score:
        data["Resultado_Numerico"] = f"{home_score}-{max_score}>"
    elif home_score >= max_score and away_score >= max_score:
        data["Resultado_Numerico"] = f"{max_score}>-{max_score}>"
    else:
        data["Resultado_Numerico"] = f"{home_score}-{away_score}"
    return data



In [None]:
# Working frame: keep only the rows where `red_cards_away` is present and
# renumber them from zero, so labels and positions coincide.
df_test = df.dropna(subset=["red_cards_away"]).reset_index(drop=True)
lista_de_testeo = df_test.index.tolist()
df_test.info()

In [None]:
# Sequential (single-process) run.
# last_games: number of past games per team to take into account.
last_games = 15
# Each match only receives the rows up to and including itself.
resultados = [
    search_features(i, df_test.head(i + 1), last_games)
    for i in tqdm(lista_de_testeo)
]
df_nn = pd.concat(resultados)

In [None]:
# Parallel run (careful: the partitions sometimes run out of memory, which
# shows up as a pickle error).

# last_games = number of past games per team to take into account
# num_trabajadores = number of parallel workers (use -1 for all available)
# Takes a couple of minutes.
last_games = 15
num_trabajadores = -1
resultados = Parallel(n_jobs=num_trabajadores, verbose=10)(
    delayed(search_features)(i, df_test.head(i + 1), last_games) for i in lista_de_testeo)
# Build df_nn here too, exactly as the sequential cell does; otherwise the
# results of this run are never used by the cell that saves df_nn.
df_nn = pd.concat(resultados)

In [None]:
# Save the per-match feature frame; the next cell reads this file back.
df_nn.to_csv("Dataframe_Auxiliar.csv")

In [None]:
# Reload the feature frame from the CSV written by the previous cell.
# NOTE(review): search_features builds its rows with stack().to_frame().T, so
# df_nn presumably has two-level column labels; a plain read_csv (no header= /
# index_col=) may not restore that layout, which the later df_nn[i] lookups
# rely on -- confirm before using this cell in the pipeline.
df_nn = pd.read_csv("Dataframe_Auxiliar.csv")

In [None]:
# Flatten df_nn into single-level columns. Each row of df_nn holds
# 2 * last_games past games (home-team games first, then away-team games);
# the columns of game i get the suffix "_i".
# The pieces are collected first and concatenated once, instead of growing
# df_f with a concat per iteration.
partes = [df_nn[i].add_suffix(f"_{i}", axis=1) for i in range(2 * last_games)]
df_f = pd.concat(partes, axis=1) if partes else pd.DataFrame()
df_f["Resultado"] = df_nn["Resultado"]
df_f["Resultado_Numerico"] = df_nn["Resultado_Numerico"]
# NOTE: this writes to the same path that df_nn was saved to and read from,
# so the flattened frame replaces the earlier file contents.
df_f.to_csv("Dataframe_Auxiliar.csv")
