In [16]:
#Fetches the data for the current Série A

import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

# Make a request to the webpage URL and extract data from fourth table
url = 'https://www.chancedegol.com.br/br23.htm'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('table')[6]

# Loop through the rows of the table and extract the data from the cells
data = []
for row in table.find_all('tr'):
    cols = row.find_all('td')
    cols = [col.text.strip() for col in cols]
    data.append(cols)

# Write the data to a CSV file
with open('dataBRA23raw.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerows(data)
    


In [17]:
df = pd.read_csv('dataBRA23raw.csv')
df

Unnamed: 0,Data,Mandante,Visitante,Vitória domandante,Empate,Vitória dovisitante
0,19/08/2023,Fluminense,América MG,69.7 %,18.1 %,12.2 %
1,19/08/2023,São Paulo,Botafogo,32.7 %,28.9 %,38.3 %
2,19/08/2023,Cruzeiro,Corinthians,40.5 %,32.3 %,27.2 %
3,19/08/2023,Cuiabá,Palmeiras,30.4 %,26.6 %,43.0 %
4,20/08/2023,Vasco,Atlético MG,28.5 %,29.5 %,42.0 %
5,20/08/2023,Santos,Grêmio,32.3 %,25.5 %,42.2 %
6,20/08/2023,Goiás,Athletico PR,36.1 %,26.8 %,37.1 %
7,20/08/2023,Coritiba,Flamengo,19.2 %,21.7 %,59.1 %
8,20/08/2023,Bahia,Red Bull Bragantino,35.2 %,29.0 %,35.8 %
9,21/08/2023,Internacional,Fortaleza,37.5 %,27.6 %,34.9 %


In [18]:
# This output cleans and organizes the Data.

df.rename(columns={'Mandante':'Casa','Visitante':'Fora'}, inplace =True)
# Changes dtype to datetime in first dataframe
df['Data'] = df['Data'].str.replace('/', '-')
df['Data'] = pd.to_datetime(df['Data'], format='%d-%m-%Y')
df['Temp'] = df['Data'].dt.year
df = df.sort_values(by='Data')
df['Data'] = df['Data'].dt.strftime('%d-%m-%Y')
# Creates a new column with the for the competition name

df = df.assign(Comp='Série A')


# Organizes de columns of the sacond dataframe
new_order = ['Comp','Temp','Data','Casa', 'Fora']
df= df.reindex(columns=new_order)


#Padronizes the names of the teams
df.replace({'América MG': 'América-MG',
            'Atlético MG': 'Atlético-MG',
            'Athletico PR': 'Athletico-PR',                   
            }, inplace=True)
df.dropna(axis=0, how='all', inplace=True)


df

Unnamed: 0,Comp,Temp,Data,Casa,Fora
0,Série A,2023,19-08-2023,Fluminense,América-MG
1,Série A,2023,19-08-2023,São Paulo,Botafogo
2,Série A,2023,19-08-2023,Cruzeiro,Corinthians
3,Série A,2023,19-08-2023,Cuiabá,Palmeiras
4,Série A,2023,20-08-2023,Vasco,Atlético-MG
5,Série A,2023,20-08-2023,Santos,Grêmio
6,Série A,2023,20-08-2023,Goiás,Athletico-PR
7,Série A,2023,20-08-2023,Coritiba,Flamengo
8,Série A,2023,20-08-2023,Bahia,Red Bull Bragantino
9,Série A,2023,21-08-2023,Internacional,Fortaleza


In [26]:
df_elo = pd.read_json('data/dataELOranking.json')
df_elo

Unnamed: 0,Time,Elo,Jogos,Rank,Média,#,Gols,Vitórias,Derrotas,Empates,Série
0,Botafogo,1913.20,856,1,1666.61,12,1164,343,287,226,A
1,Palmeiras,1874.13,847,2,1733.62,2,1305,405,231,211,A
2,Flamengo,1852.19,881,3,1733.21,3,1300,406,241,234,A
3,Fluminense,1825.18,877,4,1691.08,10,1227,361,292,224,A
4,Corinthians,1797.37,858,5,1715.29,5,1150,373,243,242,A
...,...,...,...,...,...,...,...,...,...,...,...
129,Campinense,1402.62,150,128,1456.12,114,171,46,68,36,-
130,Fast Clube,1400.57,47,129,1430.75,129,56,14,23,10,-
131,Mogi Mirim,1389.92,173,130,1425.75,131,184,47,76,50,-
132,Guaratinguetá,1380.26,240,131,1446.20,124,290,80,105,55,-


In [46]:
def get_elo_rating(team_name, elo_dataframe):
    row = elo_dataframe[elo_dataframe['Time'] == team_name]
    if not row.empty:
        return row['Elo'].iloc[0]
    else:
        return None  # Team not found in Elo DataFrame

# Add a new column to the matches DataFrame for Elo ratings
df['Home_Elo'] = df['Casa'].apply(lambda x: get_elo_rating(x, df_elo))
df['Away_Elo'] = df['Fora'].apply(lambda x: get_elo_rating(x, df_elo))
df['Elo_Difference'] = df['Home_Elo'] - df['Away_Elo']

def normalize_elo_difference(elo_difference):
    return round(abs(elo_difference) / 5) * 5

# Assuming you have loaded the probabilities DataFrame as "probabilities_df"
df['Normalized_Elo_Difference'] = df['Elo_Difference'].apply(normalize_elo_difference)
merged_df = df.merge(probabilities_df, left_on='Normalized_Elo_Difference', right_on='Elo Difference Group', how='left')


df

Unnamed: 0,Comp,Temp,Data,Casa,Fora,Home_Elo,Away_Elo,Elo_Difference,Probabilities,Normalized_Elo_Difference
0,Série A,2023,19-08-2023,Fluminense,América-MG,1825.18,1669.77,155.41,"{'Higher Elo Team Won': 54.3046357615894, 'Goo...",155
1,Série A,2023,19-08-2023,São Paulo,Botafogo,1792.28,1913.2,-120.92,"{'Higher Elo Team Won': 50.56216931216931, 'Go...",120
2,Série A,2023,19-08-2023,Cruzeiro,Corinthians,1693.49,1797.37,-103.88,"{'Higher Elo Team Won': 49.65397923875432, 'Go...",105
3,Série A,2023,19-08-2023,Cuiabá,Palmeiras,1774.04,1874.13,-100.09,"{'Higher Elo Team Won': 48.20426487093154, 'Go...",100
4,Série A,2023,20-08-2023,Vasco,Atlético-MG,1616.36,1793.3,-176.94,"{'Higher Elo Team Won': 60.294117647058826, 'G...",175
5,Série A,2023,20-08-2023,Santos,Grêmio,1680.01,1777.68,-97.67,"{'Higher Elo Team Won': 48.20426487093154, 'Go...",100
6,Série A,2023,20-08-2023,Goiás,Athletico-PR,1717.91,1778.39,-60.48,"{'Higher Elo Team Won': 40.32700421940928, 'Go...",60
7,Série A,2023,20-08-2023,Coritiba,Flamengo,1648.94,1852.19,-203.25,"{'Higher Elo Team Won': 60.60606060606061, 'Go...",205
8,Série A,2023,20-08-2023,Bahia,Red Bull Bragantino,1698.07,1791.03,-92.96,"{'Higher Elo Team Won': 44.91776315789474, 'Go...",95
9,Série A,2023,21-08-2023,Internacional,Fortaleza,1785.13,1777.07,8.06,"{'Higher Elo Team Won': 36.02481242106827, 'Go...",10


In [52]:
columns_to_drop = ['Equal Elo (%)','Draw (%)', 'Total Games']

# Drop the specified columns from the DataFrame
merged_df = merged_df.drop(columns=columns_to_drop)



In [53]:
merged_df

Unnamed: 0,Comp,Temp,Data,Casa,Fora,Home_Elo,Away_Elo,Elo Difference Group,Higher Elo Team Won (%),Draw at Home (Good Outcome) (%),Higher Elo Team Lost (%),Draw at Home (Bad Outcome) (%),Sum of Victories and Good Draw Outcomes (%),Sum of Losses and Bad Draw Outcomes (%)
0,Série A,2023,19-08-2023,Fluminense,América-MG,1825.18,1669.77,155.0,54.304636,13.245033,25.165563,7.284768,67.549669,32.450331
1,Série A,2023,19-08-2023,São Paulo,Botafogo,1792.28,1913.2,120.0,50.562169,14.517196,25.562169,9.358466,65.079365,34.920635
2,Série A,2023,19-08-2023,Cruzeiro,Corinthians,1693.49,1797.37,105.0,49.653979,12.572088,25.663206,12.110727,62.226067,37.773933
3,Série A,2023,19-08-2023,Cuiabá,Palmeiras,1774.04,1874.13,100.0,48.204265,10.886644,29.096521,11.81257,59.090909,40.909091
4,Série A,2023,20-08-2023,Vasco,Atlético-MG,1616.36,1793.3,175.0,60.294118,13.72549,18.137255,7.843137,74.019608,25.980392
5,Série A,2023,20-08-2023,Santos,Grêmio,1680.01,1777.68,100.0,48.204265,10.886644,29.096521,11.81257,59.090909,40.909091
6,Série A,2023,20-08-2023,Goiás,Athletico-PR,1717.91,1778.39,60.0,40.327004,15.45007,33.403657,10.819269,55.777075,44.222925
7,Série A,2023,20-08-2023,Coritiba,Flamengo,1648.94,1852.19,205.0,60.606061,15.151515,14.393939,9.848485,75.757576,24.242424
8,Série A,2023,20-08-2023,Bahia,Red Bull Bragantino,1698.07,1791.03,95.0,44.917763,12.631579,29.583333,12.867325,57.549342,42.450658
9,Série A,2023,21-08-2023,Internacional,Fortaleza,1785.13,1777.07,10.0,36.024812,14.765248,36.825644,12.384295,50.79006,49.20994


In [39]:
probabilities_df = pd.read_csv('data/probabilidades.csv')


def calculate_probabilities(elo_difference, probabilities_df):
    row = probabilities_df[probabilities_df['Elo Difference Group'] == elo_difference]
    if not row.empty:
        higher_elo_won = row['Higher Elo Team Won (%)'].iloc[0]
        good_draw_at_home = row['Draw at Home (Good Outcome) (%)'].iloc[0]
        higher_elo_lost = row['Higher Elo Team Lost (%)'].iloc[0]
        bad_draw_at_home = row['Draw at Home (Bad Outcome) (%)'].iloc[0]
        equal_elo = row['Equal Elo (%)'].iloc[0]
        draw = row['Draw (%)'].iloc[0]
        return {
            'Higher Elo Team Won': higher_elo_won,
            'Good Draw at Home': good_draw_at_home,
            'Higher Elo Team Lost': higher_elo_lost,
            'Bad Draw at Home': bad_draw_at_home,
            'Equal Elo': equal_elo,
            'Draw': draw
        }
    else:
        return None  # Elo difference group not found in the probabilities DataFrame

# Assuming you have loaded the probabilities DataFrame as "probabilities_df"
df['Probabilities'] = df['Elo_Difference'].apply(lambda x: calculate_probabilities(x, probabilities_df))

df


Unnamed: 0,Comp,Temp,Data,Casa,Fora,Home_Elo,Away_Elo,Elo_Difference,Probabilities,Normalized_Elo_Difference
0,Série A,2023,19-08-2023,Fluminense,América-MG,1825.18,1669.77,155.41,,155
1,Série A,2023,19-08-2023,São Paulo,Botafogo,1792.28,1913.2,-120.92,,120
2,Série A,2023,19-08-2023,Cruzeiro,Corinthians,1693.49,1797.37,-103.88,,105
3,Série A,2023,19-08-2023,Cuiabá,Palmeiras,1774.04,1874.13,-100.09,,100
4,Série A,2023,20-08-2023,Vasco,Atlético-MG,1616.36,1793.3,-176.94,,175
5,Série A,2023,20-08-2023,Santos,Grêmio,1680.01,1777.68,-97.67,,100
6,Série A,2023,20-08-2023,Goiás,Athletico-PR,1717.91,1778.39,-60.48,,60
7,Série A,2023,20-08-2023,Coritiba,Flamengo,1648.94,1852.19,-203.25,,205
8,Série A,2023,20-08-2023,Bahia,Red Bull Bragantino,1698.07,1791.03,-92.96,,95
9,Série A,2023,21-08-2023,Internacional,Fortaleza,1785.13,1777.07,8.06,,10
