In [130]:
#Fetches the data for the current Série A

import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

# Make a request to the webpage URL and extract data from fourth table
url = 'https://www.chancedegol.com.br/br23.htm'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('table')[6]

# Loop through the rows of the table and extract the data from the cells
data = []
for row in table.find_all('tr'):
    cols = row.find_all('td')
    cols = [col.text.strip() for col in cols]
    data.append(cols)

# Write the data to a CSV file
with open('dataBRA23raw.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerows(data)
    


In [131]:
df = pd.read_csv('dataBRA23raw.csv')
df

Unnamed: 0,Data,Mandante,Visitante,Vitória domandante,Empate,Vitória dovisitante
0,21/08/2023,Goiás,Athletico PR,36.0 %,26.7 %,37.3 %


In [132]:
# This output cleans and organizes the Data.

df.rename(columns={'Mandante':'Casa','Visitante':'Fora'}, inplace =True)
# Changes dtype to datetime in first dataframe
df['Data'] = df['Data'].str.replace('/', '-')
df['Data'] = pd.to_datetime(df['Data'], format='%d-%m-%Y')
df['Temp'] = df['Data'].dt.year
df = df.sort_values(by='Data')
df['Data'] = df['Data'].dt.strftime('%d-%m-%Y')
# Creates a new column with the for the competition name

df = df.assign(Comp='Série A')


# Organizes de columns of the sacond dataframe
new_order = ['Comp','Temp','Data','Casa', 'Fora']
df= df.reindex(columns=new_order)


#Padronizes the names of the teams
df.replace({'América MG': 'América-MG',
            'Atlético MG': 'Atlético-MG',
            'Athletico PR': 'Athletico-PR',                   
            }, inplace=True)
df.dropna(axis=0, how='all', inplace=True)


df

Unnamed: 0,Comp,Temp,Data,Casa,Fora
0,Série A,2023,21-08-2023,Goiás,Athletico-PR


In [133]:
df_elo = pd.read_json('data/dataELOranking.json')
df_elo

Unnamed: 0,Time,Elo,Jogos,Rank,Média,#,Gols,Vitórias,Derrotas,Empates,Série
0,Botafogo,1913.20,856,1,1666.61,12,1164,343,287,226,A
1,Palmeiras,1874.13,847,2,1733.62,2,1305,405,231,211,A
2,Flamengo,1852.19,881,3,1733.21,3,1300,406,241,234,A
3,Fluminense,1825.18,877,4,1691.08,10,1227,361,292,224,A
4,Corinthians,1797.37,858,5,1715.29,5,1150,373,243,242,A
...,...,...,...,...,...,...,...,...,...,...,...
129,Campinense,1402.62,150,128,1456.12,114,171,46,68,36,-
130,Fast Clube,1400.57,47,129,1430.75,129,56,14,23,10,-
131,Mogi Mirim,1389.92,173,130,1425.75,131,184,47,76,50,-
132,Guaratinguetá,1380.26,240,131,1446.20,124,290,80,105,55,-


In [134]:
probabilities_df = pd.read_csv('data/probabilidades.csv')

def get_elo_rating(team_name, elo_dataframe):
    row = elo_dataframe[elo_dataframe['Time'] == team_name]
    if not row.empty:
        return row['Elo'].iloc[0]
    else:
        return None  # Team not found in Elo DataFrame

# Add a new column to the matches DataFrame for Elo ratings
df['Elo casa'] = df['Casa'].apply(lambda x: get_elo_rating(x, df_elo))
df['Elo fora'] = df['Fora'].apply(lambda x: get_elo_rating(x, df_elo))
df['Diferença'] = df['Elo casa'] - df['Elo fora']

def normalize_elo_difference(elo_difference):
    return round(abs(elo_difference) / 5) * 5

# Assuming you have loaded the probabilities DataFrame as "probabilities_df"
df['Normalized_Elo_Difference'] = df['Diferença'].apply(normalize_elo_difference)
merged_df = df.merge(probabilities_df, left_on='Normalized_Elo_Difference', right_on='Tamanho da diferença', how='left')


df

Unnamed: 0,Comp,Temp,Data,Casa,Fora,Elo casa,Elo fora,Diferença,Normalized_Elo_Difference
0,Série A,2023,21-08-2023,Goiás,Athletico-PR,1717.91,1778.39,-60.48,60


In [135]:
# List of columns to drop
columns_to_drop = ['Normalized_Elo_Difference', 'Total de jogos']

# Drop the specified columns from the DataFrame
merged_df = merged_df.drop(columns=columns_to_drop)

merged_df

Unnamed: 0,Comp,Temp,Data,Casa,Fora,Elo casa,Elo fora,Diferença,Tamanho da diferença,Vitória do melhor Elo casa (%),Vitória do melhor Elo fora (%),Empate (%),Vitória do pior Elo casa (%),Vitória do pior Elo fora (%)
0,Série A,2023,21-08-2023,Goiás,Athletico-PR,1717.91,1778.39,-60.48,60.0,12.872714,27.45429,26.269339,23.797468,9.606188


In [136]:


def calculate_probabilities(elo_difference, probabilities_df):
    row = probabilities_df[probabilities_df['Elo Difference Group'] == elo_difference]
    if not row.empty:
        higher_elo_won = row['Vitória do melhor Elo (%)'].iloc[0]
        draw = row['Empate (%)'].iloc[0]
        higher_elo_lost = row['Vitória do pior Elo (%)'].iloc[0]
        return {
            'Vitória do melhor Elo': higher_elo_won,
            'Empate': draw,
            'Vitória do pior Elo': higher_elo_lost,
                    }
    else:
        return None  # Elo difference group not found in the probabilities DataFrame

# Assuming you have loaded the probabilities DataFrame as "probabilities_df"
df['Probabilidade'] = df['Tamanho da diferença'].apply(lambda x: calculate_probabilities(x, probabilities_df))

df


KeyError: 'Tamanho da diferença'