In [1]:
#IMPORTACIÓN DE LIBRERÍAS

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from string import ascii_uppercase as alfabeto
import pickle
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

In [30]:
class DataGetter:
    def __init__(self, url):
        self.link = url
    
    def charger(self):
        data = pd.read_html(self.link)
        
        return data

    def organicer(self, data, mode = 'CA'):
        if mode == 'CA':
            dict_tables = {} #Creamos un diccionario vacío para rellenarlo con las tablas
            for letra, i in zip(alfabeto, range(14, 42, 7)):
                df = data[i]
                df.rename(columns={df.columns[1]: 'Team'}, inplace=True)
                df.pop('Qualification')
                dict_tables[f'Group {letra}'] = df
            
            return dict_tables
        
        elif mode == 'RM':
            historical_ranking = data[3]
            actual_ranking = data[0]

            actual_ranking = actual_ranking.drop(0)
            actual_ranking = actual_ranking.drop(1)
            actual_ranking = actual_ranking.drop(2)
            actual_ranking = actual_ranking.drop(23)
            actual_ranking = actual_ranking.drop(24)

            actual_ranking.rename(columns={actual_ranking.columns[0]: 'Rank'}, inplace=True)
            actual_ranking.rename(columns={actual_ranking.columns[1]: 'Change'}, inplace=True)
            actual_ranking.rename(columns={actual_ranking.columns[2]: 'Team'}, inplace=True)
            actual_ranking.rename(columns={actual_ranking.columns[3]: 'Points'}, inplace=True)

            actual_ranking.pop('Change')

            return actual_ranking, historical_ranking

In [31]:
url_ca = 'https://en.wikipedia.org/wiki/2024_Copa_America'
url_rm = "https://en.wikipedia.org/wiki/FIFA_Men's_World_Ranking"

Copa_America = DataGetter(url_ca)
Ranking_Mundial = DataGetter(url_rm)

datos_ca = Copa_America.charger()
datos_rm = Ranking_Mundial.charger()

tabla_ca = Copa_America.organicer(datos_ca)
tabla_rm = Ranking_Mundial.organicer(datos_rm, mode = 'RM')

In [43]:
class DataCollector:
    # def __init__(self):
    #     self.service = Service(ChromeDriverManager().install())
    #     self.driver = webdriver.Chrome(service=self.service)
 
    def get_matches(self, year):
        
        if year <= 1967:
            urls = f'https://en.wikipedia.org/wiki/{year}_South_American_Championship'
            
        else:
            urls = f'https://en.wikipedia.org/wiki/{year}_Copa_America'
            
        response = requests.get(urls)
        content = response.text
        
        soup = BeautifulSoup(content, 'lxml')
        matches = soup.find_all('div', class_="footballbox")

        home = []
        score = []
        away = []
        
        for match in matches:
            home.append(match.find('th', class_="fhome").get_text())
            score.append(match.find('th', class_="fscore").get_text())
            away.append(match.find('th', class_="faway").get_text())
            
        dict_America = {'home':home, 
                        'score':score,
                        'away':away}

        df_America = pd.DataFrame(dict_America)
        df_America['year'] = year

        return df_America
    
    def getTotalMatches(self, years):

        TotalMatches = [self.get_matches(year) for year in years]
        df_TotalMatches = pd.concat(TotalMatches, ignore_index=True)

        df_conmebol = df_TotalMatches[df_TotalMatches['year'] != 2024]
        df_fixture = df_TotalMatches[df_TotalMatches['year'] == 2024]

        return df_conmebol, df_fixture
    
    def getMissingMatches(self, year):
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service)
        
        url = f'https://en.wikipedia.org/wiki/{year}_Copa_America'
        
        driver.get(url) #Habilitamos el acceso al driver 
        
        missing_matches = driver.find_elements(by='xpath', value='//tr[@style="font-size:90%"]')

        home = []
        score = []
        away = []

        for match in missing_matches:
            home.append(match.find_element(by='xpath', value='./td[1]').text)
            score.append(match.find_element(by='xpath', value='./td[2]').text)
            away.append(match.find_element(by='xpath', value='./td[3]').text)

        dict_missing = {'home':home, 
                    'score':score,
                    'away':away}

        df_missing = pd.DataFrame(dict_missing)
        df_missing['year'] = year
        time.sleep(1) #Tiempo de espera para pasar de una página a la otra

        return df_missing
    
    def getTotalMissingMatches(self, years):
        missing_data = [self.getMissingMatches(year) for year in years]

        # driver.quit() #Línea necesaria para que el driver deje de controlar la página web

        df_missing_data = pd.concat(missing_data, ignore_index=True)

        return df_missing_data

In [44]:
years_America = [1916, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1929, 1935, 1937, 1939, 1941,
                 1942, 1945, 1946, 1947, 1949, 1953, 1955, 1956, 1957, 1963, 1967, 1975, 1979, 1983, 1987, 1989,
                 1991, 1993, 1995, 1997, 1999, 2001, 2004, 2007, 2011, 2015, 2016, 2019, 2021, 2024]

missing_years = [2011, 2015]

Partidos = DataCollector()
tablas = Partidos.getTotalMatches(years_America)
tablas_perdidas = Partidos.getTotalMissingMatches(missing_years)

In [58]:
class TableGetter:
    def __init__(self, archive):
        self.path = archive
    
    def openData(self):
        data = pd.read_csv(self.path)
        return data
    
    def cleanData(self, data_f, data_i, data_m, mode = 0):
        if mode == 0: #Clean Fixture
            data_f['home'] = data_f.home.str.strip()
            data_f['away'] = data_f.away.str.strip()
            return data_f
        
        elif mode == 1:
            df_complete_data = pd.concat([data_i, data_m], ignore_index=True)
            df_complete_data.drop_duplicates(inplace=True)
            df_complete_data.sort_values('year', inplace=True)

            df_complete_data['score'] = df_complete_data['score'].str.strip()
            df_complete_data['score'] = df_complete_data['score'].str.replace('[^\d–]', '', regex=True)
            
            df_complete_data['home'] = df_complete_data.home.str.strip()
            df_complete_data['away'] = df_complete_data.away.str.strip()

            df_complete_data[['home_goals', 'away_goals']] = df_complete_data['score'].str.split('–', expand=True)
            df_complete_data.drop('score', axis=1, inplace=True)

            df_complete_data = df_complete_data.astype({'home_goals': int, 'away_goals':int, 'year':int}) 

            return df_complete_data

In [59]:
initial = TableGetter('Conmebol_Copa_America_initial_data.csv')
missing = TableGetter('Conmebol_Copa_America_missing_data.csv')
fixture = TableGetter('Programacion_Copa_America_2024.csv')

initial_data = initial.openData()
missing_data = missing.openData()
df_fixture = fixture.openData()

In [63]:
df_fixture = fixture.cleanData(df_fixture, initial_data, missing_data)
df_complete_data = initial.cleanData(df_fixture, initial_data, missing_data, 1)

In [64]:
df_complete_data

Unnamed: 0,home,away,year,home_goals,away_goals
0,Uruguay,Chile,1916,4,0
1,Argentina,Chile,1916,6,1
2,Brazil,Chile,1916,1,1
3,Argentina,Brazil,1916,1,1
4,Uruguay,Brazil,1916,2,1
...,...,...,...,...,...
749,Argentina,Uruguay,2021,1,0
748,Chile,Bolivia,2021,1,0
747,Paraguay,Bolivia,2021,3,1
765,Venezuela,Peru,2021,0,1
