In [1]:
import pandas as pd
import warnings 
# NOTE(review): this suppresses every Python warning for the rest of the
# kernel session, so any warning raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [2]:
import sqlite3
# Download the database from Kaggle: https://www.kaggle.com/datasets/hugomathien/soccer
# Path to the SQLite file, relative to the notebook's working directory.
address = '../data/database.sqlite'
# Connection that the query cell passes to pandas.
con = sqlite3.connect(address)

In [3]:
# Build a DataFrame with all the columns we are interested in: season, date,
# country, both team names, the final score and the home/draw/away odds
# published by two bookmakers (B365* and BW* columns).
#
# The country filter uses a single-quoted SQL string literal. In SQL, double
# quotes denote identifiers; SQLite only treats "Spain" as a string as a
# legacy fallback when no such column exists, and that fallback can be
# disabled at build time.
#
# League is inner-joined although none of its columns are selected; the join
# is kept so the set of returned rows is unchanged.
df_partidos = pd.read_sql("""SELECT season,
                                date,
                                Country.name AS country_name,
                                HT.team_long_name AS home_team,
                                AT.team_long_name AS away_team,
                                home_team_goal,
                                away_team_goal,
                                B365H,
                                B365D,
                                B365A,
                                BWH,
                                BWD,
                                BWA
                                    FROM Match
                                    JOIN Country on Country.id = Match.country_id
                                    JOIN League on League.id = Match.league_id
                                    LEFT JOIN Team AS HT on HT.team_api_id = Match.home_team_api_id
                                    LEFT JOIN Team AS AT on AT.team_api_id = Match.away_team_api_id
                                WHERE Country.name = 'Spain'
                                """, con)

-----
Ver cómo ha quedado el df

In [4]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [5]:
# Preview the first rows returned by the query.
df_partidos.head()

Unnamed: 0,season,date,country_name,home_team,away_team,home_team_goal,away_team_goal,B365H,B365D,B365A,BWH,BWD,BWA
0,2008/2009,2008-08-30 00:00:00,Spain,Valencia CF,RCD Mallorca,3,0,1.7,3.6,5.25,1.65,3.35,5.0
1,2008/2009,2008-08-31 00:00:00,Spain,CA Osasuna,Villarreal CF,1,1,2.8,3.3,2.5,2.9,3.25,2.2
2,2008/2009,2008-08-31 00:00:00,Spain,RC Deportivo de La Coruña,Real Madrid CF,2,1,3.5,3.3,2.1,3.9,3.25,1.85
3,2008/2009,2008-08-31 00:00:00,Spain,CD Numancia,FC Barcelona,1,0,7.0,4.0,1.5,6.45,3.55,1.5
4,2008/2009,2008-08-31 00:00:00,Spain,Racing Santander,Sevilla FC,1,1,2.8,3.3,2.5,2.65,3.25,2.4


In [6]:
# The summary shows one match with NaN in the betting-odds columns; it is
# removed later in the notebook so that it does not cause problems.
df_partidos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   season          3040 non-null   object 
 1   date            3040 non-null   object 
 2   country_name    3040 non-null   object 
 3   home_team       3040 non-null   object 
 4   away_team       3040 non-null   object 
 5   home_team_goal  3040 non-null   int64  
 6   away_team_goal  3040 non-null   int64  
 7   B365H           3039 non-null   float64
 8   B365D           3039 non-null   float64
 9   B365A           3039 non-null   float64
 10  BWH             3039 non-null   float64
 11  BWD             3039 non-null   float64
 12  BWA             3039 non-null   float64
dtypes: float64(6), int64(2), object(5)
memory usage: 308.9+ KB


In [7]:
# 1. One match has NaN in the betting odds, so we drop it to avoid problems.

# 2. The time part of 'date' is 00:00:00 for every match, so we strip it.

-----

In [8]:
# Drop the row that contains NaN values.
# dropna() keeps the original index labels, so the index still runs from
# 0 to 3039 with one label missing (3039 rows).
df_partidos=df_partidos.dropna()
df_partidos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3039 entries, 0 to 3039
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   season          3039 non-null   object 
 1   date            3039 non-null   object 
 2   country_name    3039 non-null   object 
 3   home_team       3039 non-null   object 
 4   away_team       3039 non-null   object 
 5   home_team_goal  3039 non-null   int64  
 6   away_team_goal  3039 non-null   int64  
 7   B365H           3039 non-null   float64
 8   B365D           3039 non-null   float64
 9   B365A           3039 non-null   float64
 10  BWH             3039 non-null   float64
 11  BWD             3039 non-null   float64
 12  BWA             3039 non-null   float64
dtypes: float64(6), int64(2), object(5)
memory usage: 332.4+ KB


In [9]:
# Remove the " 00:00:00" time suffix from 'date', leaving only YYYY-MM-DD.
df_partidos['date'] = df_partidos['date'].replace(
    to_replace=" 00:00:00",
    value="",
    regex=True,
)
df_partidos.head()

Unnamed: 0,season,date,country_name,home_team,away_team,home_team_goal,away_team_goal,B365H,B365D,B365A,BWH,BWD,BWA
0,2008/2009,2008-08-30,Spain,Valencia CF,RCD Mallorca,3,0,1.7,3.6,5.25,1.65,3.35,5.0
1,2008/2009,2008-08-31,Spain,CA Osasuna,Villarreal CF,1,1,2.8,3.3,2.5,2.9,3.25,2.2
2,2008/2009,2008-08-31,Spain,RC Deportivo de La Coruña,Real Madrid CF,2,1,3.5,3.3,2.1,3.9,3.25,1.85
3,2008/2009,2008-08-31,Spain,CD Numancia,FC Barcelona,1,0,7.0,4.0,1.5,6.45,3.55,1.5
4,2008/2009,2008-08-31,Spain,Racing Santander,Sevilla FC,1,1,2.8,3.3,2.5,2.65,3.25,2.4


-------


In [10]:
# Create the 'result' column

In [11]:
def result(row):
    """Encode the outcome of a match from its final score.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'home_team_goal' and 'away_team_goal'
        (for example a DataFrame row or a dict).

    Returns
    -------
    int or None
        1 for a home win, 2 for an away win, 0 for a draw.
        None when the two scores cannot be ordered (e.g. a NaN score).
    """
    home_goals = row['home_team_goal']
    away_goals = row['away_team_goal']

    if home_goals == away_goals:
        return 0
    if home_goals > away_goals:
        return 1
    if home_goals < away_goals:
        return 2
    # None of the comparisons held, so no outcome can be determined.
    return None

In [12]:
# Label every match with its outcome (row-wise) and preview the first six rows.
df_partidos['result'] = df_partidos.apply(result, axis=1)
df_partidos.head(6)

Unnamed: 0,season,date,country_name,home_team,away_team,home_team_goal,away_team_goal,B365H,B365D,B365A,BWH,BWD,BWA,result
0,2008/2009,2008-08-30,Spain,Valencia CF,RCD Mallorca,3,0,1.7,3.6,5.25,1.65,3.35,5.0,1
1,2008/2009,2008-08-31,Spain,CA Osasuna,Villarreal CF,1,1,2.8,3.3,2.5,2.9,3.25,2.2,0
2,2008/2009,2008-08-31,Spain,RC Deportivo de La Coruña,Real Madrid CF,2,1,3.5,3.3,2.1,3.9,3.25,1.85,1
3,2008/2009,2008-08-31,Spain,CD Numancia,FC Barcelona,1,0,7.0,4.0,1.5,6.45,3.55,1.5,1
4,2008/2009,2008-08-31,Spain,Racing Santander,Sevilla FC,1,1,2.8,3.3,2.5,2.65,3.25,2.4,0
5,2008/2009,2008-08-31,Spain,Real Sporting de Gijón,Getafe CF,1,2,2.62,3.3,2.62,2.6,3.15,2.5,2


----


------

-----------

ACIERTOS CASAS DE APUESTAS

In [13]:
# Bookmaker predictions: work on a copy so the prediction columns are added
# to df_bets rather than to df_partidos.
df_bets=df_partidos.copy()

In [14]:
def bet_prediction_B365(row):
    """Return the outcome implied by the B365 odds of a match.

    The predicted outcome is the one with the strictly lowest odds.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'B365H', 'B365D' and 'B365A'
        (home, draw and away odds).

    Returns
    -------
    int
        1 if the home odds are strictly lower than both others,
        2 if the away odds are strictly lower than both others,
        0 otherwise (draw is lowest, or there is a tie for the lowest odds).
    """
    home_odds = row['B365H']
    draw_odds = row['B365D']
    away_odds = row['B365A']

    home_is_favourite = home_odds < draw_odds and home_odds < away_odds
    away_is_favourite = away_odds < draw_odds and away_odds < home_odds

    if home_is_favourite:
        return 1
    return 2 if away_is_favourite else 0

In [15]:
# Add the B365-implied prediction for every match and preview the frame.
df_bets['B365_prediction'] = df_bets.apply(bet_prediction_B365, axis=1)
df_bets.head()

Unnamed: 0,season,date,country_name,home_team,away_team,home_team_goal,away_team_goal,B365H,B365D,B365A,BWH,BWD,BWA,result,B365_prediction
0,2008/2009,2008-08-30,Spain,Valencia CF,RCD Mallorca,3,0,1.7,3.6,5.25,1.65,3.35,5.0,1,1
1,2008/2009,2008-08-31,Spain,CA Osasuna,Villarreal CF,1,1,2.8,3.3,2.5,2.9,3.25,2.2,0,2
2,2008/2009,2008-08-31,Spain,RC Deportivo de La Coruña,Real Madrid CF,2,1,3.5,3.3,2.1,3.9,3.25,1.85,1,2
3,2008/2009,2008-08-31,Spain,CD Numancia,FC Barcelona,1,0,7.0,4.0,1.5,6.45,3.55,1.5,1,2
4,2008/2009,2008-08-31,Spain,Racing Santander,Sevilla FC,1,1,2.8,3.3,2.5,2.65,3.25,2.4,0,2


In [16]:
def bet_prediction_BW(row):
    """Return the outcome implied by the BW odds of a match.

    The predicted outcome is the one with the strictly lowest odds.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'BWH', 'BWD' and 'BWA'
        (home, draw and away odds).

    Returns
    -------
    int
        1 if the home odds are strictly lower than both others,
        2 if the away odds are strictly lower than both others,
        0 otherwise (draw is lowest, or there is a tie for the lowest odds).
    """
    home_odds = row['BWH']
    draw_odds = row['BWD']
    away_odds = row['BWA']

    home_is_favourite = home_odds < draw_odds and home_odds < away_odds
    away_is_favourite = away_odds < draw_odds and away_odds < home_odds

    if home_is_favourite:
        return 1
    return 2 if away_is_favourite else 0

In [17]:
# Add the BW-implied prediction for every match and preview the frame.
df_bets['BW_prediction'] = df_bets.apply(bet_prediction_BW, axis=1)
df_bets.head()

Unnamed: 0,season,date,country_name,home_team,away_team,home_team_goal,away_team_goal,B365H,B365D,B365A,BWH,BWD,BWA,result,B365_prediction,BW_prediction
0,2008/2009,2008-08-30,Spain,Valencia CF,RCD Mallorca,3,0,1.7,3.6,5.25,1.65,3.35,5.0,1,1,1
1,2008/2009,2008-08-31,Spain,CA Osasuna,Villarreal CF,1,1,2.8,3.3,2.5,2.9,3.25,2.2,0,2,2
2,2008/2009,2008-08-31,Spain,RC Deportivo de La Coruña,Real Madrid CF,2,1,3.5,3.3,2.1,3.9,3.25,1.85,1,2,2
3,2008/2009,2008-08-31,Spain,CD Numancia,FC Barcelona,1,0,7.0,4.0,1.5,6.45,3.55,1.5,1,2,2
4,2008/2009,2008-08-31,Spain,Racing Santander,Sevilla FC,1,1,2.8,3.3,2.5,2.65,3.25,2.4,0,2,2


In [18]:
# Count the matches for which both bookmakers imply the same outcome.
bookmakers_agree = df_bets['BW_prediction'] == df_bets['B365_prediction']
df_bets.loc[bookmakers_agree, 'result'].count()

2988

In [19]:
# Number of matches in which the two bookmakers' predictions differ.
# Computed from df_bets instead of subtracting figures copied from earlier
# outputs, so it stays correct if the data changes (51 of 3039 on this data).
int((df_bets['BW_prediction'] != df_bets['B365_prediction']).sum())

51

In [20]:
# Check the hit rate of the bookmakers' predictions

In [21]:
# Report how many matches each bookmaker's implied prediction got right.
# The total is taken from the frame rather than hard-coded, and the hit
# counts are the number of rows where prediction and result coincide.
total_matches = len(df_bets)
b365_hits = int((df_bets['result'] == df_bets['B365_prediction']).sum())
bw_hits = int((df_bets['result'] == df_bets['BW_prediction']).sum())
print(f'B365 ha acertado {b365_hits} en {total_matches} partidos')
print(f'BW ha acertado {bw_hits} en {total_matches} partidos')

B365 ha acertado 1707 en 3039 partidos
BW ha aertado 1708 en 3039 partidos


In [22]:
# Both bookmakers have almost identical hit rates (about 56% on this data).
# The B365 percentage is computed from df_bets instead of from numbers copied
# out of an earlier cell's output.
b365_correct = int((df_bets['result'] == df_bets['B365_prediction']).sum())
round(100 * b365_correct / len(df_bets))

56

In [23]:
# Save the DataFrame (the index is written as the first, unnamed CSV column).
df_bets.to_csv('../data/df_bets.csv')

------