In [13]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

# Step 1: Make a request to the webpage URL and extract data from fourth table
url = 'https://www.chancedegol.com.br/brb23.htm'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('table')[7]

# Step 2: Loop through the rows of the table and extract the data from the cells
data = []
for row in table.find_all('tr'):
    cols = row.find_all('td')
    cols = [col.text.strip() for col in cols]
    data.append(cols)

# Step 3: Write the data to a CSV file
with open('dataBRB23raw.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerows(data)
 

In [14]:
df = pd.read_csv('dataBRB23raw.csv')
#Step 8: cleaning data frame

df.rename(columns={'Mandante':'Casa','Visitante':'Fora'}, inplace =True)
#changes dtype to datetime in first dataframe
df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
#creates a new column with the for the competition name

df = df.assign(Comp='Série B')

#separets the results of second dataframe into new columns
new = df["Unnamed: 2"].str.split("x", n = 1, expand = True)
df["GC"]= new[0] 
df["GF"]= new[1]
#creates new column with competition name

#comparing scores to display the results in the second dataframe
df.loc[df['GC'] == df['GF'], 'Res'] = 'E'
df.loc[df['GC'] > df['GF'], 'Res'] = 'VC' 
df.loc[df['GC'] < df['GF'], 'Res'] = 'VV'
# create a new column to store the season information in the second data frame
df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
df['Temp'] = df['Data'].dt.year
#organizes de columns of the sacond dataframe
new_order = ['Comp','Temp','Data','Casa','GC', 'GF','Fora','Res']
df= df.reindex(columns=new_order)

df['Casa'] = df['Casa'].str.strip()
df['Fora'] = df['Fora'].str.strip()

df.replace({'Atlético GO': 'Atlético-GO', 
             'Botafogo SP': 'Botafogo-SP',
             'Grêmio Novorizontino': 'Novorizontino', 
             'Vila Nova GO': 'Vila Nova-GO',
            }, inplace=True)

#drops unvalid data
df.dropna(axis=0, how='all', inplace=True)
#sort values by date
df = df.sort_values(by='Data')

#atributes a pontuation to each game
df['PC'] = df.apply(lambda x: 3 if x['Res'] == 'VC' else
                              1 if x['Res'] == 'E' else 0, axis=1)
df['PF'] = df.apply(lambda x: 3 if x['Res'] == 'VV' else
                              1 if x['Res'] == 'E' else 0, axis=1)

#atributes a number of games for each season
df['J'] = df.groupby('Temp').cumcount() + 1
#corrects any possible issue before saving as new file
df = df.dropna()

#saves de df in a new file
df.to_csv('data/dataBRB23.csv', index=False)

  df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
  df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
  df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')


In [15]:
df

Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
0,Série B,2023,2023-04-14,Guarani,4,1,Avaí,VC,3,0,1
1,Série B,2023,2023-04-14,Ituano,2,0,Ceará,VC,3,0,2
2,Série B,2023,2023-04-15,Vila Nova-GO,2,1,Novorizontino,VC,3,0,3
3,Série B,2023,2023-04-15,Mirassol,1,0,Chapecoense,VC,3,0,4
4,Série B,2023,2023-04-15,Sampaio Corrêa,3,3,Atlético-GO,E,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
245,Série B,2023,2023-08-26,Tombense,2,2,Ceará,E,1,1,245
244,Série B,2023,2023-08-26,Sampaio Corrêa,1,1,Guarani,E,1,1,246
246,Série B,2023,2023-08-26,CRB,1,0,Novorizontino,VC,3,0,247
247,Série B,2023,2023-08-27,Atlético-GO,0,0,Vitória,E,1,1,248


In [16]:
unique_values = df['Casa'].unique()
unique_values_sorted = sorted(unique_values)
print(unique_values_sorted)

['ABC', 'Atlético-GO', 'Avaí', 'Botafogo-SP', 'CRB', 'Ceará', 'Chapecoense', 'Criciúma', 'Guarani', 'Ituano', 'Juventude', 'Londrina', 'Mirassol', 'Novorizontino', 'Ponte Preta', 'Sampaio Corrêa', 'Sport', 'Tombense', 'Vila Nova-GO', 'Vitória']


In [11]:
df.tail(10)

Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
239,Série B,2023,2023-08-22,Mirassol,0,1,Juventude,VV,0,3,240
240,Série B,2023,2023-08-23,Botafogo-SP,1,2,Chapecoense,VV,0,3,241
241,Série B,2023,2023-08-23,CRB,1,1,Avaí,E,1,1,242
242,Série B,2023,2023-08-25,Ponte Preta,1,0,Londrina,VC,3,0,243
243,Série B,2023,2023-08-25,Sport,1,2,Ituano,VV,0,3,244
245,Série B,2023,2023-08-26,Tombense,2,2,Ceará,E,1,1,245
244,Série B,2023,2023-08-26,Sampaio Corrêa,1,1,Guarani,E,1,1,246
246,Série B,2023,2023-08-26,CRB,1,0,Novorizontino,VC,3,0,247
247,Série B,2023,2023-08-27,Atlético-GO,0,0,Vitória,E,1,1,248
248,Série B,2023,2023-08-27,Chapecoense,0,0,Avaí,E,1,1,249


In [12]:
team_goals

NameError: name 'team_goals' is not defined