In [399]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

year = '1962'  # Update the year accordingly

# URL of the website
url = f'https://rsssfbrasil.com/tablesr/rjsp{year}.htm'

# Make a request to the website
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Initialize an empty list to store individual match results
    match_results = []

    # Use a regular expression to find match results
    pattern = r'(\d{2}\/\d{2})\s+((?:.*\n)+?)(?=\d|$)'
    matches = re.findall(pattern, soup.get_text())
    # Iterate through matches
    for match in matches:
        # Extract date and format it as [MMM dd]
        raw_date, match_text = match
        formatted_date = f'{raw_date}'

        # Split the match into lines
        lines = match_text.strip().split('\n')

        # Process each line (excluding the line with the date)
        for line in lines:
            # Use a regex to extract lines that match the specified pattern
            line_pattern = r'([a-zA-Z][\w\sàáâãäéèêëíìîïóòôõöúùûüç-]+)(\d+\s+x\s+\d+)\s*([\w\sàáâãäéèêëíìîïóòôõöúùûüç-]+)'
            line_match = re.match(line_pattern, line.strip(), re.UNICODE)
            
            if line_match:
                team1, score, team2 = line_match.groups()
                match_results.append({
                    'Data': formatted_date,
                    'Text': f'{team1} {score} {team2}'
                })

    # Create a DataFrame from the list of match results
    df = pd.DataFrame(match_results)

    # Save DataFrame to CSV
    df.to_csv('match_results.csv', index=False)
    print("DataFrame saved to 'match_results.csv'")
else:
    print(f"Error: {response.status_code}")

    


DataFrame saved to 'match_results.csv'


In [400]:
df


In [386]:
# Assuming your DataFrame is named df
#new_row_index = 219  # Replace with the desired index for the new row
#existing_row_index = 0  # Replace with the index of the row you want to use as a reference

# Extract the date and MatchInfo from an existing row
#existing_date = df.loc[existing_row_index, 'Data']
#existing_match_info = df.loc[existing_row_index, 'MatchInfo']

# Concatenate the information for the new row
##new_row_data = {'Data': f"{existing_date} {existing_match_info}", 'MatchInfo': 'None'}

# Use loc to add the new row
##df.loc[new_row_index] = new_row_data
# Specify the indices to drop
#indices_to_drop = (655)
#indices_to_drop.extend(range(471, 510))

#df = df.drop(indices_to_drop)

# Reset the index after dropping rows
#df = df.reset_index(drop=True)

# Drop the 'MatchInfo' column
#df = df.drop(['Text'], axis=1)

# Extract information using a regular expression
df[['Casa', 'GC', 'GF', 'Fora']] = df['Text'].str.extract(
    r'([\w\sàáâãäéèêëíìîïóòôõöúùûüç-]+)(\d+)\s+x\s+(\d+)\s+([\w\sàáâãäéèêëíìîïóòôõöúùûüç-]+)')

df = df.drop(['Text'], axis=1)

# Display the updated DataFrame
df

Unnamed: 0,Data,Casa,GC,GF,Fora
0,13.02.1963,SÃO PAULO,2,1,PALMEIRAS
1,13.02.1963,VASCO DA GAMA,1,1,FLUMINENSE
2,14.02.1963,CORINTHIANS,3,1,PORTUGUESA DE DESPORTOS
3,14.02.1963,FLAMENGO,4,1,OLARIA
4,16.02.1963,PALMEIRAS,1,1,BOTAFOGO
5,16.02.1963,VASCO DA GAMA,2,2,SANTOS
6,17.02.1963,CORINTHIANS,2,0,FLUMINENSE
7,17.02.1963,FLAMENGO,2,0,SÃO PAULO
8,20.02.1963,SANTOS,6,3,PORTUGUESA DE DESPORTOS
9,20.02.1963,BOTAFOGO,2,1,OLARIA


In [387]:
# Assuming df is your DataFrame

# Function to normalize and capitalize team names
def normalize_team_name(team_name):
    return team_name.strip().capitalize()

# Apply the function to 'Casa' and 'Fora' columns
df['Casa'] = df['Casa'].apply(normalize_team_name)
df['Fora'] = df['Fora'].apply(normalize_team_name)

df


Unnamed: 0,Data,Casa,GC,GF,Fora
0,13.02.1963,São paulo,2,1,Palmeiras
1,13.02.1963,Vasco da gama,1,1,Fluminense
2,14.02.1963,Corinthians,3,1,Portuguesa de desportos
3,14.02.1963,Flamengo,4,1,Olaria
4,16.02.1963,Palmeiras,1,1,Botafogo
5,16.02.1963,Vasco da gama,2,2,Santos
6,17.02.1963,Corinthians,2,0,Fluminense
7,17.02.1963,Flamengo,2,0,São paulo
8,20.02.1963,Santos,6,3,Portuguesa de desportos
9,20.02.1963,Botafogo,2,1,Olaria


In [388]:
unique_values = df['Casa'].unique()
unique_values_as_strings = [str(value) for value in unique_values]
unique_values_sorted = sorted(unique_values_as_strings)
print(unique_values_sorted)

['Botafogo', 'Corinthians', 'Flamengo', 'Fluminense', 'Olaria', 'Palmeiras', 'Portuguesa de desportos', 'Santos', 'São paulo', 'Vasco da gama']


In [389]:



df.replace({'Atlético mineiro': 'Atlético-MG',
            'Atlético paranaense': 'Athletico-PR',
            'América-GB': 'America-RJ',
            'América': 'America-RJ',
            'Botafogo-RJ': 'Botafogo',
            'Comercial': 'Comercial-MS',
            'Desportiva': 'Desportiva Capixaba',
            'Ferroviário': 'Ferroviário-CE',
            'Flamengo-RJ': 'Flamengo',
            'Fluminense-RJ': 'Fluminense',
            'Mixto': 'Mixto-MT',
            'Nacional': 'Nacional-AM',
            'Portuguesa de desportos': 'Portuguesa',
            'Santa cruz': 'Santa Cruz',
            'São paulo': 'São Paulo',
            'Operário': 'Operário-MS',
            'Rio Branco': 'Rio Branco-ES',
            'Rio Negro': 'Rio Negro-AM',
            'Tiradentes': 'Tiradentes-PI',
            'Vasco da gama': 'Vasco',             
            }, inplace=True)

df

Unnamed: 0,Data,Casa,GC,GF,Fora
0,13.02.1963,São Paulo,2,1,Palmeiras
1,13.02.1963,Vasco,1,1,Fluminense
2,14.02.1963,Corinthians,3,1,Portuguesa
3,14.02.1963,Flamengo,4,1,Olaria
4,16.02.1963,Palmeiras,1,1,Botafogo
5,16.02.1963,Vasco,2,2,Santos
6,17.02.1963,Corinthians,2,0,Fluminense
7,17.02.1963,Flamengo,2,0,São Paulo
8,20.02.1963,Santos,6,3,Portuguesa
9,20.02.1963,Botafogo,2,1,Olaria


In [390]:

df['Data'] = df['Data'].str.replace('.', '-')  # Remove brackets from the date string
df['Data'] = pd.to_datetime(df['Data'], format='%d-%m-%Y')  # Convert to datetime format
df['Data'] = df['Data'].apply(lambda x: x.replace(year=int(year))).dt.strftime('%Y-%m-%d')

# Now df['Data'] should have the correct date with the specified year


In [391]:
#Step 8: cleaning data frame
#creates a new column with the for the competition name

#creates new columns with competition name and season
df = df.assign(Comp='Rio-SP')
df = df.assign(Temp=f'{year}')

#comparing scores to display the results in the second dataframe
df.loc[df['GC'] == df['GF'], 'Res'] = 'E'
df.loc[df['GC'] > df['GF'], 'Res'] = 'VC' 
df.loc[df['GC'] < df['GF'], 'Res'] = 'VV'
# create a new column to store the season information in the second data frame

#organizes de columns of the sacond dataframe
new_order = ['Comp','Temp','Data','Casa','GC', 'GF','Fora','Res']
df= df.reindex(columns=new_order)
df


#drops unvalid data
df.dropna(axis=0, how='all', inplace=True)

df = df.sort_values(by='Data')


#atributes a pontuation to each game
df['PC'] = df.apply(lambda x: 3 if x['Res'] == 'VC' else
                              1 if x['Res'] == 'E' else 0, axis=1)
df['PF'] = df.apply(lambda x: 3 if x['Res'] == 'VV' else
                              1 if x['Res'] == 'E' else 0, axis=1)

#atributes a number of games for each season
df['J'] = df.groupby('Temp').cumcount() + 1
#corrects any possible issue before saving as new file
df

Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
0,Rio-SP,1963,1963-02-13,São Paulo,2,1,Palmeiras,VC,3,0,1
1,Rio-SP,1963,1963-02-13,Vasco,1,1,Fluminense,E,1,1,2
2,Rio-SP,1963,1963-02-14,Corinthians,3,1,Portuguesa,VC,3,0,3
3,Rio-SP,1963,1963-02-14,Flamengo,4,1,Olaria,VC,3,0,4
4,Rio-SP,1963,1963-02-16,Palmeiras,1,1,Botafogo,E,1,1,5
5,Rio-SP,1963,1963-02-16,Vasco,2,2,Santos,E,1,1,6
6,Rio-SP,1963,1963-02-17,Corinthians,2,0,Fluminense,VC,3,0,7
7,Rio-SP,1963,1963-02-17,Flamengo,2,0,São Paulo,VC,3,0,8
8,Rio-SP,1963,1963-02-20,Santos,6,3,Portuguesa,VC,3,0,9
9,Rio-SP,1963,1963-02-20,Botafogo,2,1,Olaria,VC,3,0,10


In [392]:
df.to_csv(f'rio_sp_{year}.csv', index=False)



In [393]:
# Merge dataframes and sort by date
df2 = pd.read_csv('rio_sp_55_66.csv')
frames = [df, df2]
df3 = pd.concat(frames)
df3["Data"] = pd.to_datetime(df3["Data"], errors='coerce')  # Convert "Data" to datetime format
df3 = df3.drop_duplicates()
df3 = df3.sort_values(by='Data')
df3.to_csv('rio_sp_55_66.csv', index=False)


In [394]:

df3

Unnamed: 0,Comp,Temp,Data,Casa,GC,GF,Fora,Res,PC,PF,J
0,Rio-SP,1963,1963-02-13,São Paulo,2,1,Palmeiras,VC,3,0,1
1,Rio-SP,1963,1963-02-13,Vasco,1,1,Fluminense,E,1,1,2
2,Rio-SP,1963,1963-02-14,Corinthians,3,1,Portuguesa,VC,3,0,3
3,Rio-SP,1963,1963-02-14,Flamengo,4,1,Olaria,VC,3,0,4
4,Rio-SP,1963,1963-02-16,Palmeiras,1,1,Botafogo,E,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
158,Rio-SP,1966,1966-03-26,São Paulo,4,2,Palmeiras,VC,3,0,41
161,Rio-SP,1966,1966-03-27,Bangu,2,0,Fluminense,VC,3,0,44
159,Rio-SP,1966,1966-03-27,Botafogo,3,0,Vasco,VC,3,0,42
160,Rio-SP,1966,1966-03-27,Santos,0,0,Corinthians,E,1,1,43


In [196]:
df3 = df3[df3['Temp'] != '1957']

In [218]:
unique_values = df3['Temp'].unique()
unique_values_as_strings = [str(value) for value in unique_values]
unique_values_sorted = sorted(unique_values_as_strings)
print(unique_values_sorted)

['1955', '1956', '1957', '1957', '1958', '1959', '1960', '1961', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1973', '1974', '1975']


In [None]:
# Assuming your DataFrame is named df
new_row_index = 372  # Replace with the desired index for the new row
existing_row_index = 0  # Replace with the index of the row you want to use as a reference

# Extract the date and MatchInfo from an existing row
existing_date = df.loc[existing_row_index, 'Data']
existing_match_info = df.loc[existing_row_index, 'MatchInfo']

# Concatenate the information for the new row
new_row_data = {'Data': f"{existing_date} {existing_match_info}", 'MatchInfo': 'None'}

# Use loc to add the new row
df.loc[new_row_index] = new_row_data
