In [None]:
# Fetches the data for the current Copa do Brasil
import requests
from bs4 import BeautifulSoup
import csv

# List to store the game data
game_data = []

# Make request to page
for i in range(1663, 1715):  # replace range with your desired id range
    url = f'https://www.cbf.com.br/futebol-brasileiro/competicoes/copa-brasil-masculino/2023?phase={i}'
    response = requests.get(url)

    # Parse HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all game blocks
    game_blocks = soup.find_all('div', class_='block')

    # Extract game information from each block
    for block in game_blocks:
        date_time = block.find('div', class_='text-1').text.strip()
        team_1_img = block.find('div', class_='col-xs-5 p-t-10 nopadding').find('img')
        score_element = block.find('div', class_='text-3')
        team_2_img = block.find_all('div', class_='col-xs-5 p-t-10 nopadding')[1].find('img')

        # Check if elements exist before accessing their attributes
        if team_1_img and score_element and team_2_img:
            team_1 = team_1_img['title']
            score = score_element.text.strip()
            team_2 = team_2_img['title']

            # Append game data to the list
            game_data.append([date_time, team_1, score, team_2])

# Write game data to CSV file
with open('dataCBR2023raw.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Data', 'Casa', 'Resultado', 'Fora'])  # Write column headers
    writer.writerows(game_data)


In [7]:
import pandas as pd
df = pd.read_csv('dataCBR2023raw.csv')

#Cleans-up the 'Resultado' column
df['Resultado'] = df['Resultado'].str.replace(r'\([^)]*\)', '', regex=True)
# Drop rows where the result is only an "x"
df = df[df['Resultado'] != 'x']
df = df.reset_index(drop=True)

#finds and separates the Date in the 'Data' column
df['Data'] = df['Data'].str.extract(r'(\s\d+/\d+/\d+)')

# Split the "Resultado" column
new = df["Resultado"].str.split("x", n=1, expand=True)

# Checks if the split operation resulted in two columns
if new.shape[1] == 2:
    # Assign values to "Casa" and "Fora" columns
    df['GC'] = new[0]
    df['GF'] = new[1]
    
df[['GC', 'GF']] = df[['GC', 'GF']].apply(pd.to_numeric, errors='coerce')
# Drop the "Resultado" column
df = df.drop(['Resultado'], axis=1)


#changes dtype to datetime in first dataframe
df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
#creates a new column with the for the competition name

df = df.assign(Comp='CBR')


#creates new column with competition name

#comparing scores to display the results in the second dataframe
df.loc[df['GC'] == df['GF'], 'Res'] = 'E'
df.loc[df['GC'] > df['GF'], 'Res'] = 'VC' 
df.loc[df['GC'] < df['GF'], 'Res'] = 'VV'
# create a new column to store the season information in the second data frame
df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
df['Temp'] = df['Data'].dt.year
#organizes de columns of the sacond dataframe
new_order = ['Comp','Temp','Data','Casa','GC', 'GF','Fora','Res']
df= df.reindex(columns=new_order)
df.dropna(axis=0, how='all', inplace=True)

#sort values by date
df = df.sort_values(by='Data')

#atributes a pontuation to each game
df['PC'] = df.apply(lambda x: 3 if x['Res'] == 'VC' else
                              1 if x['Res'] == 'E' else 0, axis=1)
df['PF'] = df.apply(lambda x: 3 if x['Res'] == 'VV' else
                              1 if x['Res'] == 'E' else 0, axis=1)

#atributes a number of games for each season
df['J'] = df.groupby('Temp').cumcount() + 1
#corrects any possible issue before saving as new file
df = df.dropna()



  df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
  df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')
  df['Data'] = pd.to_datetime(df['Data'], infer_datetime_format=True, errors='coerce')


In [8]:
# This output cleans and organizes the Data
import numpy as np


# Define the list of values to keep the '-'
condition = ['América - RN',
             'Atlético - GO', 
             'Atlético - BA', 
             'Botafogo - RJ', 
             'Botafogo - SP',
             'Botafogo - PB',
             'Fluminense - RJ',
             'Fluminense - PI',
             'Operário - MS',
             'Operário - PR',
             'União - MT']

def process_value(value):
    if value in condition:
        return value  # Keep the original value
    else:
        value = value.split('-', 1)[0].strip()  # Remove everything after '-' and strip whitespace
        value = value.replace('Saf', '').replace('S.a.f', '').replace('Futebol Clube', '').replace('Esporte Clube', '').strip()  # Remove 'Saf' and 'S.a.f' and strip whitespace
        return value

# Apply the function to the 'Casa' column
df['Casa'] = np.where(df['Casa'].isin(condition), df['Casa'], df['Casa'].apply(process_value))

# Apply the function to the 'Fora' column
df['Fora'] = np.where(df['Fora'].isin(condition), df['Fora'], df['Fora'].apply(process_value))


#Renames the teams 

df['Casa'] = df['Casa'].str.strip()
df['Fora'] = df['Fora'].str.strip()
df.replace({'Abc': 'ABC',
            'Asa': 'ASA',
            'Aguia de Maraba': 'Águia de Marabá',
            'América - RN': 'América-RN',
            'America Fc .': 'América-MG',
            'Atlético Mineiro': 'Atlético-MG',
            'Athletico Paranaense': 'Athletico-PR',
            'Atlhetic Ce': 'Atlhetic-MG',
            'Atlético - GO': 'Atlético-GO', 
            'Atlético - BA': 'Atlético-BA', 
            'Botafogo - RJ': 'Botafogo', 
            'Botafogo - SP': 'Botafogo-SP',
            'Botafogo - PB': 'Botafogo-PB',
            'Brasil': 'Brasil-RS',
            'Camboriu': 'Camboriú',
            'Crb': 'CRB',
            'Csa': 'CSA',
            'Ec Democrata': 'Democrata-MG',
            'Fluminense - RJ': 'Fluminense',
            'Fluminense - PI': 'Fluminense-PI',
            'Marilia': 'Marília',
            'Operário - MS': 'Operário-MS',
            'Operário - PR': 'Operário-PR',
            'União - MT': 'União-MT',
            'Real Noroeste Capixaba F. C.': 'Real Noroeste',
            'Sao Bernardo Fc': 'São Bernardo', 
            'Sao Francisco': 'São Francisco',
            'Vasco da Gama': 'Vasco',
            'Vitoria F. C.': 'Vitória-ES',
            'Ypiranga': 'Ypiranga-RS',
            'Vila Nova': 'Vila Nova-GO'
            
                        }, inplace=True)

In [9]:
df.tail(20)

Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
96,CBR,2023,2023-05-17,Palmeiras,3,0,Fortaleza,VC,3,0,99
94,CBR,2023,2023-05-17,Santos,0,0,Bahia,E,1,1,100
99,CBR,2023,2023-05-31,Cruzeiro,0,1,Grêmio,VV,0,3,101
95,CBR,2023,2023-05-31,Bahia,1,1,Santos,E,1,1,102
103,CBR,2023,2023-05-31,Internacional,3,1,América-MG,VC,3,0,103
105,CBR,2023,2023-05-31,Botafogo,1,0,Athletico-PR,VC,3,0,104
97,CBR,2023,2023-05-31,Fortaleza,1,0,Palmeiras,VC,3,0,105
107,CBR,2023,2023-05-31,Corinthians,2,0,Atlético-MG,VC,3,0,106
101,CBR,2023,2023-06-01,São Paulo,1,3,Sport,VV,0,3,107
93,CBR,2023,2023-06-01,Flamengo,2,0,Fluminense,VC,3,0,108


In [10]:
#Saves the Data
df.to_csv('dataCBR2023.csv', index=False)
