In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

# Step 4: Make a request to the webpage URL for each subdivision
urls = ['https://www.chancedegol.com.br/br{:02d}.htm'.format(i) for i in range(7, 23)]

# Step 5: Open the file and create the CSV writer object
with open('dataBRARaw.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)

    # Step 6: Loop through the subdivisions and extract the data
    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table')

        # Step 7: Loop through the rows of the table and extract the data from the cells
        data = []
        for row in table.find_all('tr'):
            cols = row.find_all('td')
            cols = [col.text.strip() for col in cols]
            data.append(cols)

        # Step 8: Write the data to the CSV file
        writer.writerows(data)


In [2]:
df = pd.read_csv('dataBRARaw.csv')
df

Unnamed: 0,Data,Mandante,Unnamed: 2,Visitante,Vitória doMandante,Empate,Vitória doVisitante
0,12/05/2007,Figueirense,3x6,Atlético PR,54.6 %,22.1 %,23.3 %
1,12/05/2007,Fluminense,2x2,Cruzeiro,34.5 %,24.9 %,40.6 %
2,12/05/2007,São Paulo,2x0,Goiás,54.3 %,22.5 %,23.2 %
3,13/05/2007,Atlético MG,2x1,Náutico,52.5 %,25.5 %,22.0 %
4,13/05/2007,Internacional,2x3,Botafogo,42.1 %,26.7 %,31.2 %
...,...,...,...,...,...,...,...
6087,13/11/2022,Internacional,3x0,Palmeiras,32.6 %,29.0 %,38.4 %
6088,13/11/2022,Ceará,4x1,Juventude,65.8 %,26.3 %,7.9 %
6089,13/11/2022,Goiás,0x4,São Paulo,33.9 %,26.3 %,39.8 %
6090,13/11/2022,Cuiabá,2x1,Coritiba,49.1 %,27.1 %,23.9 %


In [3]:
df = pd.read_csv('dataBRARaw.csv')
unique_values = df['Mandante'].unique()
unique_values_sorted = sorted(unique_values)
print(unique_values_sorted)

['América MG', 'América RN', 'Athletico PR', 'Atlético GO', 'Atlético MG', 'Atlético PR', 'Avaí', 'Bahia', 'Barueri', 'Botafogo', 'CSA', 'Ceará', 'Chapecoense', 'Corinthians', 'Coritiba', 'Criciúma', 'Cruzeiro', 'Cuiabá', 'Figueirense', 'Flamengo', 'Fluminense', 'Fortaleza', 'Goiás', 'Grêmio', 'Guarani', 'Internacional', 'Ipatinga', 'Joinville', 'Juventude', 'Mandante', 'Náutico', 'Palmeiras', 'Paraná', 'Ponte Preta', 'Portuguesa', 'Prudente', 'Red Bull Bragantino', 'Santa Cruz', 'Santo André', 'Santos', 'Sport', 'São Paulo', 'Vasco', 'Vitória']


In [4]:
#Step 8: cleaning data frame
df = pd.read_csv('dataBRARaw.csv')
df.rename(columns={'Mandante':'Casa','Visitante':'Fora'}, inplace =True)
#changes dtype to datetime in first dataframe
df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
#creates a new column with the for the competition name

df = df.assign(Comp='Série A')

#separets the results of second dataframe into new columns
new = df["Unnamed: 2"].str.split("x", n = 1, expand = True)
df["GC"]= new[0] 
df["GF"]= new[1]
#creates new column with competition name

#comparing scores to display the results in the second dataframe
df.loc[df['GC'] == df['GF'], 'Res'] = 'E'
df.loc[df['GC'] > df['GF'], 'Res'] = 'VC' 
df.loc[df['GC'] < df['GF'], 'Res'] = 'VV'
# create a new column to store the season information in the second data frame



# Remove rows where 'Data' column is NaT (not a valid date)
df = df.dropna(subset=['Data'])

# Create the 'temp' column to store the year
df['Temp'] = df['Data'].dt.year

# Convert the 'Data' column to the format 'yyyy-mm-dd'
df['Data'] = df['Data'].dt.strftime('%Y-%m-%d')




#organizes de columns of the sacond dataframe
new_order = ['Comp','Temp','Data','Casa','GC', 'GF','Fora','Res']
df= df.reindex(columns=new_order)




df.replace({'América MG': 'América-MG',
            'América RN': 'América-RN',
            'Athletico PR': 'Athletico-PR', 
            'Atletico-PR': 'Athletico-PR',
            'Atlético PR': 'Athletico-PR', 
            'Atlético GO': 'Atlético-GO', 
            'Atlético MG': 'Atlético-MG',
            'Barueri':'G. Barueri/Prudente',
            'Prudente':'G. Barueri/Prudente',
            }, inplace=True)

#drops unvalid data
df.dropna(axis=0, how='all', inplace=True)
#sort values by date
df = df.sort_values(by='Data')

#atributes a pontuation to each game
df['PC'] = df.apply(lambda x: 3 if x['Res'] == 'VC' else
                              1 if x['Res'] == 'E' else 0, axis=1)
df['PF'] = df.apply(lambda x: 3 if x['Res'] == 'VV' else
                              1 if x['Res'] == 'E' else 0, axis=1)

#atributes a number of games for each season
df['J'] = df.groupby('Temp').cumcount() + 1
#corrects any possible issue before saving as new file
df = df.dropna()

#saves de df in a new file
df.to_csv('dataBRAraw.csv', index=False)
df

  df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')


Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
152,Série A,2007,2007-01-08,Internacional,0,2,Vasco,VV,0,3,1
151,Série A,2007,2007-01-08,Atlético-MG,1,2,Santos,VV,0,3,2
150,Série A,2007,2007-01-08,Athletico-PR,2,2,Corinthians,E,1,1,3
149,Série A,2007,2007-01-08,Botafogo,4,2,América-RN,VC,3,0,4
148,Série A,2007,2007-01-08,Palmeiras,1,2,Sport,VV,0,3,5
...,...,...,...,...,...,...,...,...,...,...,...
5818,Série A,2022,2022-12-06,Fortaleza,0,0,Athletico-PR,E,1,1,162
5819,Série A,2022,2022-12-06,Goiás,1,1,Ceará,E,1,1,163
5820,Série A,2022,2022-12-06,Coritiba,0,2,Palmeiras,VV,0,3,164
5817,Série A,2022,2022-12-06,São Paulo,1,0,América-MG,VC,3,0,165


In [5]:
unique_values = df['Casa'].unique()
unique_values_sorted = sorted(unique_values)
print(unique_values_sorted)

['América-MG', 'América-RN', 'Athletico-PR', 'Atlético-GO', 'Atlético-MG', 'Avaí', 'Bahia', 'Botafogo', 'CSA', 'Ceará', 'Chapecoense', 'Corinthians', 'Coritiba', 'Criciúma', 'Cruzeiro', 'Cuiabá', 'Figueirense', 'Flamengo', 'Fluminense', 'Fortaleza', 'G. Barueri/Prudente', 'Goiás', 'Grêmio', 'Guarani', 'Internacional', 'Ipatinga', 'Joinville', 'Juventude', 'Náutico', 'Palmeiras', 'Paraná', 'Ponte Preta', 'Portuguesa', 'Red Bull Bragantino', 'Santa Cruz', 'Santo André', 'Santos', 'Sport', 'São Paulo', 'Vasco', 'Vitória']


In [6]:
df1 = pd.read_csv('dataBRA.csv')
frames = [df, df1]
df = pd.concat(frames)
df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
df = df.sort_values(by='Data')
df.to_csv('dataBRA.csv', index=False)
df


FileNotFoundError: [Errno 2] No such file or directory: 'dataBRA.csv'