In [1]:
#Fetches the data for the current Série B


import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

# Download the results page and extract the match table.
# The match list is the <table> at zero-based index 7 (i.e. the eighth table
# on the page), not the fourth.
url = 'https://www.chancedegol.com.br/brb23.htm'
# A timeout keeps the cell from hanging forever; raise_for_status() stops us
# from parsing an HTTP error page as if it were the results page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
tables = soup.find_all('table')
if len(tables) <= 7:
    raise IndexError(
        f'expected at least 8 <table> elements at {url}, found {len(tables)}'
    )
table = tables[7]

# One list of stripped cell texts per <tr>. Rows without <td> cells become
# empty lists, which csv writes as blank lines.
data = [
    [col.text.strip() for col in row.find_all('td')]
    for row in table.find_all('tr')
]

# Write the data to a CSV file
with open('dataBRB23raw.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerows(data)
 

In [2]:
# Read the raw scrape back from disk; the first CSV row supplies the headers.
df = pd.read_csv(filepath_or_buffer='dataBRB23raw.csv')
df

Unnamed: 0,Data,Mandante,Unnamed: 2,Visitante,Vitória doMandante,Empate,Vitória doVisitante
0,14/04/2023,Guarani,4x1,Avaí,49.0 %,26.4 %,24.7 %
1,14/04/2023,Ituano,2x0,Ceará,31.8 %,27.3 %,40.9 %
2,15/04/2023,Vila Nova GO,2x1,Grêmio Novorizontino,38.1 %,31.0 %,31.0 %
3,15/04/2023,Mirassol,1x0,Chapecoense,54.0 %,26.6 %,19.4 %
4,15/04/2023,Sampaio Corrêa,3x3,Atlético GO,37.4 %,24.7 %,37.9 %
...,...,...,...,...,...,...,...
223,12/08/2023,Sampaio Corrêa,1x1,CRB,35.3 %,29.7 %,35.0 %
224,13/08/2023,Guarani,0x1,Juventude,56.8 %,26.7 %,16.6 %
225,13/08/2023,Vitória,1x0,Ceará,42.3 %,26.5 %,31.2 %
226,13/08/2023,Chapecoense,0x1,Atlético GO,42.3 %,24.0 %,33.7 %


In [3]:
# Cleans and organizes the data.

# Rename the team columns: Casa = home side, Fora = away side.
df = df.rename(columns={'Mandante': 'Casa', 'Visitante': 'Fora'})

# Parse the dd/mm/YYYY strings into datetimes so the sort is chronological,
# derive the season (calendar year), then store the date as dd-mm-YYYY text.
df['Data'] = pd.to_datetime(df['Data'], format='%d/%m/%Y')
df['Temp'] = df['Data'].dt.year
df = df.sort_values(by='Data')
df['Data'] = df['Data'].dt.strftime('%d-%m-%Y')

# Creates a new column with the competition name
df = df.assign(Comp='Série B')

# Splits the "4x1"-style score into home (GC) and away (GF) goals.
# The pieces are converted to numbers: comparing them as strings is
# lexicographic ('10' > '9' is False) and would mislabel double-digit scores.
score = df['Unnamed: 2'].str.split('x', n=1, expand=True)
home_goals = pd.to_numeric(score[0].str.strip(), errors='coerce')
away_goals = pd.to_numeric(score[1].str.strip(), errors='coerce')
# Nullable integers keep "2" (not "2.0") in the saved CSV even when some
# rows have no score.
df['GC'] = home_goals.astype('Int64')
df['GF'] = away_goals.astype('Int64')

# Result code: E = draw, VC = home win, VV = away win.
# Rows without a parsable score match none of the masks and keep Res = NaN.
df.loc[home_goals == away_goals, 'Res'] = 'E'
df.loc[home_goals > away_goals, 'Res'] = 'VC'
df.loc[home_goals < away_goals, 'Res'] = 'VV'

# Organizes the columns of the dataframe (drops everything not listed)
new_order = ['Comp', 'Temp', 'Data', 'Casa', 'GC', 'GF', 'Fora', 'Res']
df = df.reindex(columns=new_order)

# Standardizes the names of the teams. The replacement is limited to the two
# team columns, the only ones that can contain these values.
team_aliases = {
    'Atlético GO': 'Atlético-GO',
    'Botafogo SP': 'Botafogo-SP',
    'Grêmio Novorizontino': 'Novorizontino',
    'Vila Nova GO': 'Vila Nova-GO',
}
for side in ('Casa', 'Fora'):
    df[side] = df[side].str.strip().replace(team_aliases)

df = df.dropna(axis=0, how='all')

# Assigns the points earned by each side: 3 for a win, 1 for a draw, 0 otherwise
df['PC'] = df['Res'].map({'VC': 3, 'E': 1}).fillna(0).astype('int64')
df['PF'] = df['Res'].map({'VV': 3, 'E': 1}).fillna(0).astype('int64')

# Numbers the games within each season, in date order (starting at 1)
df['J'] = df.groupby('Temp').cumcount() + 1

# Drops any row that still has a missing value before saving as a new file
df = df.dropna()



In [4]:
# Show the ten last rows of the cleaned frame as a sanity check.
df.tail(n=10)


Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
218,Série B,2023,11-08-2023,Novorizontino,2,1,Mirassol,VC,3,0,219
219,Série B,2023,11-08-2023,Tombense,0,0,Sport,E,1,1,220
222,Série B,2023,12-08-2023,Botafogo-SP,0,0,Ponte Preta,E,1,1,221
223,Série B,2023,12-08-2023,Sampaio Corrêa,1,1,CRB,E,1,1,222
220,Série B,2023,12-08-2023,Vila Nova-GO,1,1,Avaí,E,1,1,223
221,Série B,2023,12-08-2023,Criciúma,2,0,Londrina,VC,3,0,224
225,Série B,2023,13-08-2023,Vitória,1,0,Ceará,VC,3,0,225
226,Série B,2023,13-08-2023,Chapecoense,0,1,Atlético-GO,VV,0,3,226
224,Série B,2023,13-08-2023,Guarani,0,1,Juventude,VV,0,3,227
227,Série B,2023,14-08-2023,ABC,1,1,Ituano,E,1,1,228


In [5]:
# Saves the cleaned df to a new file, leaving out the index column
df.to_csv(path_or_buf='dataBRB23.csv', index=False)

###### 