In [529]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import os

In [530]:
db_path = os.path.join(os.getcwd(), 'laliga.sqlite')
laliga_con = sqlite3.connect(str(db_path))
cursor = laliga_con.cursor()

In [531]:
cursor.execute('SELECT * FROM Matches')
data = cursor.fetchall()

columns = cursor.execute('PRAGMA table_info(Matches)').fetchall()
columns = [col[1] for col in columns]

df = pd.DataFrame(data, columns=columns)
df = df[df['score'].notna()]

In [532]:
df[['goals_home', 'goals_away']] = df['score'].str.split(':', expand=True).astype(int)
df['diff_goals'] = abs(df['goals_home'] - df['goals_away'])
results = [
    (df['goals_home'] > df['goals_away']),
    (df['goals_home'] == df['goals_away']),
    (df['goals_home'] < df['goals_away'])
]

df['result'] = np.select(results, [1, 0, 2]) # mejor en númeritos, el 0 es el empate
df = df.drop(['score'], axis = 1) # dropeamos el score porque pah k
df.head()

Unnamed: 0,season,division,matchday,date,time,home_team,away_team,goals_home,goals_away,diff_goals,result
0,1928-1929,1,1,2/10/29,,Arenas Club,Athletic Madrid,2,3,1,2
1,1928-1929,1,1,2/10/29,,Espanyol,Real Unión,3,2,1,1
2,1928-1929,1,1,2/10/29,,Real Madrid,Catalunya,5,0,5,1
3,1928-1929,1,1,2/10/29,,Donostia,Athletic,1,1,0,0
4,1928-1929,1,1,2/12/29,,Racing,Barcelona,0,2,2,2


In [533]:
def convert_todate(obs):
    split_date = obs.split('/')

    if len(split_date[0]) == 1:
        split_date[0] = '0'+split_date[0]
    if len(split_date[1]) == 1:
        split_date[1] = '0'+split_date[1]
    if int(split_date[2]) > 24:
        split_date[2] = '19'+split_date[2]
    else:
        split_date[2] = '20'+split_date[2]

    return '/'.join(split_date)

In [534]:
df['date'] = df['date'].apply(lambda x: convert_todate(x))
df['date'] = pd.to_datetime(df['date'])

In [535]:
df['season'] = df['season'].str.split('-').str[0].astype(int)

In [536]:
mask = (df['date'] > '1950-01-01') & (df['date'] <= '2015-01-01')
train = df[mask].copy(deep=True)
mask = df['date'] > '2015-01-01'
test = df[mask].copy(deep=True)

In [537]:
all_teams = {}
for count, team in enumerate(train['home_team'].unique()):
    all_teams[team] = count

In [538]:
def count_goals(data):
    sample_dict = {'scored_home':0, 'conceded_home':0, 'scored_away':0, 'conceded_away':0}
    goals_dictionary = { team : sample_dict.copy() for team in data['home_team'].unique()}

    data['scored_home'] = 0
    data['conceded_home'] = 0
    data['scored_away'] = 0
    data['conceded_away'] = 0

    for index, partido in data.iterrows():

        home_team = partido['home_team']
        away_team = partido['away_team']
        
        home_goals = partido['goals_home']
        away_goals = partido['goals_away']

        goals_dictionary[home_team]['scored_home'] += home_goals
        data.at[index, 'scored_home'] = goals_dictionary[home_team]['scored_home']

        goals_dictionary[away_team]['scored_away'] += away_goals
        data.at[index, 'scored_away'] = goals_dictionary[away_team]['scored_away']

        goals_dictionary[home_team]['conceded_home'] += away_goals
        data.at[index, 'conceded_home'] = goals_dictionary[home_team]['conceded_home']

        goals_dictionary[away_team]['conceded_away'] += home_goals
        data.at[index, 'conceded_away'] = goals_dictionary[away_team]['conceded_away']
        
    return data, goals_dictionary

In [539]:
def add_goals(data, goals_dict):
    data['scored_home'] = -1
    data['conceded_home'] = -1
    data['scored_away'] = -1
    data['conceded_away'] = -1

    for index, partido in data.iterrows():
        home_team = partido['home_team']
        away_team = partido['away_team']

        if home_team  in goals_dict:
            data.at[index, 'scored_home'] = goals_dict[home_team]['scored_home']
            data.at[index, 'conceded_home'] = goals_dict[home_team]['conceded_home']
        
        if away_team in goals_dict:
            data.at[index, 'scored_away'] = goals_dict[away_team]['scored_away']
            data.at[index, 'conceded_away'] = goals_dict[away_team]['conceded_away']

    return data

In [540]:
def wins(data):    
    sample_dict = {'win':0, 'tie':0, 'lost':0}
    wins_dictionary = { team : sample_dict.copy() for team in data['home_team'].unique()}

    data['win_home'] = 0
    data['tie_home'] = 0
    data['lost_home'] = 0

    data['win_away'] = 0
    data['tie_away'] = 0
    data['lost_away'] = 0
    
    for index, partido in data.iterrows():

        home_team = partido['home_team']
        away_team = partido['away_team']
        
        if partido['result'] == 1:
            wins_dictionary[home_team]['win'] += 1
            wins_dictionary[away_team]['lost'] += 1

        if partido['result'] == 2:
            wins_dictionary[away_team]['win'] += 1
            wins_dictionary[home_team]['lost'] += 1

        else:
            wins_dictionary[home_team]['tie'] += 1
            wins_dictionary[away_team]['tie'] += 1


        data.at[index, 'win_home'] = wins_dictionary[home_team]['win']
        data.at[index, 'tie_home'] = wins_dictionary[home_team]['tie']
        data.at[index, 'lost_home'] = wins_dictionary[home_team]['lost']

        data.at[index, 'win_away'] = wins_dictionary[away_team]['win']
        data.at[index, 'tie_away'] = wins_dictionary[away_team]['tie']
        data.at[index, 'lost_away'] = wins_dictionary[away_team]['lost']

    return data, wins_dictionary

In [541]:
def add_wins(data, wins_dict):
    data['win'] = 0
    data['tie'] = 0
    data['lost'] = 0

    for index, partido in data.iterrows():
        home_team = partido['home_team']
        away_team = partido['away_team']

        if home_team in wins_dict:
            data.at[index, 'win_home'] = wins_dict[home_team]['win']
            data.at[index, 'tie_home'] = wins_dict[home_team]['tie']
            data.at[index, 'lost_home'] = wins_dict[home_team]['lost']
        
        if away_team in wins_dict:
            data.at[index, 'win_away'] = wins_dict[away_team]['win']
            data.at[index, 'tie_away'] = wins_dict[away_team]['tie']
            data.at[index, 'lost_away'] = wins_dict[away_team]['lost']

    return data

In [542]:
train, goals_dict = count_goals(train)
test = add_goals(test, goals_dict)

train, wins_dict = wins(train)
test = add_wins(test, wins_dict)

In [543]:
train['home_team'] = train['home_team'].map(all_teams)
train['away_team'] = train['away_team'].map(all_teams)

test['home_team'] = test['home_team'].map(all_teams)
test['away_team'] = test['away_team'].map(all_teams)

test.fillna(-1, inplace=True)

In [544]:
cols = ['home_team', 'away_team', 'division', 
'matchday', 'win_home', 'win_away', 'scored_home', 'scored_away', 'season']

x_train = train[cols]
y_train = train['result']

x_test = test[cols]
y_test = test['result']


In [545]:
model = DecisionTreeClassifier()

model.fit(x_train, y_train)

In [546]:
prediction = model.predict(x_test)

In [547]:
hits = (y_test == prediction).value_counts()
print(f'Accuracy Tree: {hits[1] / len(y_test)}')

Accuracy Tree: 0.3834613323807823


In [548]:
model2 = GradientBoostingClassifier()
model2.fit(x_train, y_train)

prediction = model2.predict(x_test)

hits = (y_test == prediction).value_counts()
print(f'Accuracy BoostingC: {hits[1] / len(y_test)}')

Accuracy BoostingC: 0.4793713163064833


In [549]:
model3 = RandomForestClassifier()
model3.fit(x_train, y_train)

prediction = model3.predict(x_test)

hits = (y_test == prediction).value_counts()
print(f'Accuracy RandomForest: {hits[1] / len(y_test)}')

Accuracy RandomForest: 0.44257903196999465


In [550]:
model3 = LogisticRegression()
model3.fit(x_train, y_train)

prediction = model3.predict(x_test)

hits = (y_test == prediction).value_counts()
print(f'Accuracy Logistic: {hits[1] / len(y_test)}')

Accuracy Logistic: 0.4672262904090016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


DecisionTreeClassifier -> ACC: 37,21%\
GradientBoostingClassifier -> ACC: 47,06%\
RandomForest -> ACC: 40,71%
\
\
Con historico de goles:\
DecisionTreeClassifier -> ACC: 36.56%\
GradientBoostingClassifier -> ACC: 44.64%\
RandomForest -> ACC: 43.89%
\
\
Con historico de wins:\
DecisionTreeClassifier -> ACC: 37.60%\
GradientBoostingClassifier -> ACC: 47.34%\
RandomForest -> ACC: 43.87%
