In [8]:
#Fetches the data for the current Série C
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

# Step 1: Make a request to the webpage URL and extract data from fourth table
url = 'https://www.chancedegol.com.br/brc23.htm'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('table')[6]

# Step 2: Loop through the rows of the table and extract the data from the cells
data = []
for row in table.find_all('tr'):
    cols = row.find_all('td')
    cols = [col.text.strip() for col in cols]
    data.append(cols)

    
# Step 3:Define file path

    # Define the absolute path to the data directory (where you want to save the CSV file)
data_dir = 'C:\\Users\\Cristina Jorge\\OneDrive\\Documentos\\GitHub\\Back-End-Elo\\data'


    # Specify the absolute path to the CSV file within the data directory
csv_file_path = os.path.join(data_dir, 'dataBRC23raw.csv')

    # Ensure the data directory exists; create it if not
os.makedirs(data_dir, exist_ok=True)

# Step 4: Write the data to a CSV file using the absolute path
with open(csv_file_path, 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerows(data)


In [9]:
#step 4: reading raw data
df = pd.read_csv(csv_file_path)
df

Unnamed: 0,Data,Mandante,Unnamed: 2,Visitante,Vitória doMandante,Empate,Vitória doVisitante
0,02/05/2023,São José RS,0x0,Floresta,62.7 %,21.7 %,15.6 %
1,02/05/2023,Volta Redonda,0x1,Pouso Alegre,55.5 %,23.2 %,21.3 %
2,02/05/2023,Manaus,2x1,Náutico,24.0 %,28.6 %,47.4 %
3,03/05/2023,Paysandu,2x1,Aparecidense,46.8 %,30.4 %,22.8 %
4,03/05/2023,América RN,1x2,Ypiranga RS,52.6 %,23.9 %,23.5 %
...,...,...,...,...,...,...,...
211,07/10/2023,Amazonas,2x0,Botafogo PB,59.1 %,25.2 %,15.8 %
212,08/10/2023,Operário PR,1x0,São Bernardo,43.8 %,31.1 %,25.1 %
213,08/10/2023,Brusque,0x0,São José RS,55.3 %,23.9 %,20.8 %
214,15/10/2023,Amazonas,0x0,Brusque,43.2 %,30.7 %,26.1 %


In [10]:

#Step 5: Padroning the names of the teams

unique_values = df['Mandante'].unique()
unique_values_sorted = sorted(unique_values)

print(unique_values_sorted)

df.replace({'América RN': 'América-RN',
            'Botafogo PB': 'Botafogo-PB',
            'Operário PR': 'Operário-PR',
            'São José RS': 'São José-RS',
            'Ypiranga RS': 'Ypiranga-RS',
            }, inplace=True)
df.dropna(axis=0, how='all', inplace=True)

['Altos', 'Amazonas', 'América RN', 'Aparecidense', 'Botafogo PB', 'Brusque', 'CSA', 'Confiança', 'Figueirense', 'Floresta', 'Manaus', 'Náutico', 'Operário PR', 'Paysandu', 'Pouso Alegre', 'Remo', 'São Bernardo', 'São José RS', 'Volta Redonda', 'Ypiranga RS']


In [11]:
#Step 6: organazing data

df.rename(columns={'Mandante':'Casa','Visitante':'Fora'}, inplace =True)
    # Changes dtype to datetime in first dataframe
df['Data'] = df['Data'].str.replace('/', '-')
df['Data'] = pd.to_datetime(df['Data'], format='%d-%m-%Y')
df['Temp'] = df['Data'].dt.year
df = df.sort_values(by='Data')
df['Data'] = df['Data'].dt.strftime('%Y-%m-%d')

    # Creates a new column with the for the competition name

df = df.assign(Comp='Série C')

    # Separates the results of second dataframe into new columns
new = df["Unnamed: 2"].str.split("x", n = 1, expand = True)
df["GC"]= new[0] 
df["GF"]= new[1]

    # Compares scores to display the results in the second dataframe
df.loc[df['GC'] == df['GF'], 'Res'] = 'E'
df.loc[df['GC'] > df['GF'], 'Res'] = 'VC' 
df.loc[df['GC'] < df['GF'], 'Res'] = 'VV'
# Create a new column to store the season information in the second data frame

    # Organizes de columns of the sacond dataframe
new_order = ['Comp','Temp','Data','Casa','GC', 'GF','Fora','Res']
df= df.reindex(columns=new_order)


    # Atributes a pontuation to each game
df['PC'] = df.apply(lambda x: 3 if x['Res'] == 'VC' else
                              1 if x['Res'] == 'E' else 0, axis=1)
df['PF'] = df.apply(lambda x: 3 if x['Res'] == 'VV' else
                              1 if x['Res'] == 'E' else 0, axis=1)

    # Atributes a number of games for each season
df['J'] = df.groupby('Temp').cumcount() + 1
    # Corrects any possible issue before saving as new file
df = df.dropna()
df = df.drop(index = 0)
df

Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
1,Série C,2023,2023-05-02,Volta Redonda,0,1,Pouso Alegre,VV,0,3,2
2,Série C,2023,2023-05-02,Manaus,2,1,Náutico,VC,3,0,3
3,Série C,2023,2023-05-03,Paysandu,2,1,Aparecidense,VC,3,0,4
4,Série C,2023,2023-05-03,América-RN,1,2,Ypiranga-RS,VV,0,3,5
5,Série C,2023,2023-05-03,Brusque,1,0,Amazonas,VC,3,0,6
...,...,...,...,...,...,...,...,...,...,...,...
211,Série C,2023,2023-10-07,Amazonas,2,0,Botafogo-PB,VC,3,0,212
212,Série C,2023,2023-10-08,Operário-PR,1,0,São Bernardo,VC,3,0,213
213,Série C,2023,2023-10-08,Brusque,0,0,São José-RS,E,1,1,214
214,Série C,2023,2023-10-15,Amazonas,0,0,Brusque,E,1,1,215


In [12]:
#Step 7:
    # Specify the absolute path to the new CSV file within the data directory
cleaned_csv_file_path = os.path.join(data_dir, 'serieC_2023_matches.csv')
df.to_csv(cleaned_csv_file_path, index=False, encoding='utf-8-sig')

    # Save the cleaned DataFrame to a new CSV file
df.to_csv(cleaned_csv_file_path, index=False)

    # Display the cleaned DataFrame
df

Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
1,Série C,2023,2023-05-02,Volta Redonda,0,1,Pouso Alegre,VV,0,3,2
2,Série C,2023,2023-05-02,Manaus,2,1,Náutico,VC,3,0,3
3,Série C,2023,2023-05-03,Paysandu,2,1,Aparecidense,VC,3,0,4
4,Série C,2023,2023-05-03,América-RN,1,2,Ypiranga-RS,VV,0,3,5
5,Série C,2023,2023-05-03,Brusque,1,0,Amazonas,VC,3,0,6
...,...,...,...,...,...,...,...,...,...,...,...
211,Série C,2023,2023-10-07,Amazonas,2,0,Botafogo-PB,VC,3,0,212
212,Série C,2023,2023-10-08,Operário-PR,1,0,São Bernardo,VC,3,0,213
213,Série C,2023,2023-10-08,Brusque,0,0,São José-RS,E,1,1,214
214,Série C,2023,2023-10-15,Amazonas,0,0,Brusque,E,1,1,215
