In [2]:
import requests
import re
import pandas as pd
import numpy as np
import datetime
import slugify
import os
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Smoke-test the slugify dependency: accented characters should be transliterated.
print(slugify.slugify('Antonio Feregrino Bolaños'))

# Create the output tree. Each leaf directory is created on its own with
# exist_ok=True, so a pre-existing (possibly partial) "mt-scraper" folder no
# longer causes the sub-directories to be skipped.
os.makedirs("mt-scraper/defensiva", exist_ok=True)
os.makedirs("mt-scraper/ofensiva", exist_ok=True)

antonio-feregrino-bolanos


In [None]:
# Get base page
# Download the general-standings page; the tournament dropdown is read from it.
base_url = "http://www.mediotiempo.com/liga/futbol/ligamx/tabla-general/"
base_response = requests.get(base_url)
# Fail fast on an HTTP error instead of parsing an error page and hitting an
# unrelated AttributeError when the expected markup is missing.
base_response.raise_for_status()
base_page = base_response.text
base_soup = BeautifulSoup(base_page, "lxml")

In [None]:
# Locate the dropdown that lists every available tournament.
tournament_container = base_soup.find("div", { "class" : "dropdown-container" })
ul = tournament_container.find('ul')
# Each <li> identifies one tournament through its "value" attribute.
tournaments = [item.get('value') for item in ul.findAll('li')]

In [None]:
# Get tables
# Position of each statistic within a standings row (<td> index).
c = { 'Team': 0, 'PTS':1, 'JJ':2, 'DG':3, 'JG':4,'JE':5, 'JP':6, 'GF': 7, 'GC': 8 }
print("Tournaments", len(tournaments))
# Maps each tournament identifier to the list of per-team stat dicts scraped for it.
scraped = {}
for tournament in tournaments:
    url = urljoin(base_url, tournament)
    tournament_page = requests.get(url).text
    tournament_soup = BeautifulSoup(tournament_page, "lxml")
    results = []
    for wrapper in tournament_soup.findAll("div", { "class" :'table-positions' }):
        # The actual <table> sits inside div.scroll within each wrapper.
        table = wrapper.find('div', {'class':'scroll'}).find('table', {'class':'mt-table'})
        for row in table.tbody.findAll('tr'):
            tds = row.findAll('td')
            # One dict per team; keys are the lower-cased column names of `c`
            # and values are the stripped cell texts (kept as strings).
            results.append({
                column.lower(): tds[position].text.strip()
                for column, position in c.items()
            })
    scraped[tournament] = results

In [None]:
# Raw strings keep "\w" as a regex escape rather than an (invalid) Python
# string escape, which newer interpreters warn about.
# Long tournaments: "<type>-<start year>-<end year>" (one or more dashes before the end year).
torneo_largo = re.compile(r'(\w+)-([0-9]{4})-+([0-9]{4})')
# Short tournaments: "<type>-<year>".
torneo_corto = re.compile(r'(\w+)-([0-9]{4})')

In [None]:
# Short-tournament type names, grouped by the "-2" / "-1" suffix they receive
# in the display name built below.
dos = set(['invierno', 'apertura'])
uno = set(['verano', 'clausura', 'bicentenario'])
# Order of the statistic keys inside every scraped team dict / output row.
stat_keys = ['team', 'pts', 'jj', 'dg', 'jg', 'je', 'jp', 'gf', 'gc']
intermediate = []
for torneo in scraped:
    match_torneo_largo = torneo_largo.search(torneo)
    match_torneo_corto = torneo_corto.search(torneo)
    # NOTE(review): `name` is a human-readable label that is computed here but
    # never stored in the output rows (they keep the raw `torneo` identifier) —
    # confirm whether it is still needed.
    name = ''
    if match_torneo_largo:
        t = match_torneo_largo.group(1)
        inicio = int(match_torneo_largo.group(2))
        fin = int(match_torneo_largo.group(3))
        if t == "temporada":
            name = "Temporada de " + str(inicio) + " a "+ str(fin)
        elif t == "liguilla":
            name = "Liguilla de " + str(inicio) + " a "+ str(fin)
    elif match_torneo_corto:
        t = match_torneo_corto.group(1)
        # Named `year` instead of `c`: `c` is the column-index map of the
        # standings scraper and must not be overwritten by this cell.
        year = int(match_torneo_corto.group(2))
        if t in dos:
            name = "Torneo corto " + str(year)  + "-2"
        elif t in uno:
            name = "Torneo corto " + str(year)  + "-1"
    for result in scraped[torneo]:
        intermediate.append([torneo] + [result[key] for key in stat_keys])

# Passing the column names to the constructor also works when nothing was
# scraped (assigning .columns afterwards raises on an empty frame).
tournament_df = pd.DataFrame(
    intermediate,
    columns=['tournament', 'team', 'pts', 'jj', 'dg', 'jg','je', 'jp', 'gf', 'gc'],
).set_index(['tournament', 'team'])
print(tournament_df.tail())

In [None]:
# Persist the standings (indexed by tournament and team) as CSV.
tournament_df.to_csv('mt-scraper/tournaments.csv')

In [None]:
# Reload the standings from disk; the first two CSV columns (tournament, team)
# are restored as the two-level index.
tournament_df = pd.read_csv('mt-scraper/tournaments.csv', index_col=[0, 1])
print(tournament_df.head())

In [None]:
# Season identifiers are the distinct values of the first index level.
all_tournaments = tournament_df.index.get_level_values(0).unique().values
url = "http://www.mediotiempo.com/liga/futbol/ligamx/calendario/"
# Parallel lists: position i of each describes one (season, seasonRound, round) triple.
seasons = []
seasonRounds = []
rounds = []
for season in all_tournaments:
    # Build the calendar URL from the season being iterated. The previous code
    # used the stale `tournament` variable left over from the standings cell,
    # which made every iteration scrape the same calendar.
    season_url = url + season + "/"
    season_page = requests.get(season_url).text
    season_soup = BeautifulSoup(season_page, "lxml")
    seasonRound_ul = season_soup.find('ul', { "name": "seasonRound"})
    if seasonRound_ul is None:
        # No stage selector on this calendar page: nothing to enumerate.
        continue
    for seasonRound_li in seasonRound_ul.findAll('li'):
        seasonRound = seasonRound_li.get('value')
        seasonRound_url = season_url + seasonRound + "/"
        seasonRound_page = requests.get(seasonRound_url).text
        seasonRound_soup = BeautifulSoup(seasonRound_page, "lxml")
        round_ul = seasonRound_soup.find('ul', { "name": "round"})
        if round_ul is None:
            continue
        for round_li in round_ul.findAll('li'):
            _round = round_li.get('value')
            seasons.append(season)
            seasonRounds.append(seasonRound)
            rounds.append(_round)

print("Found",len(seasons), len(seasonRounds), len(rounds), "rounds")

In [None]:
# Calendar page for one (season, seasonRound, round) triple.
url = "http://www.mediotiempo.com/liga/futbol/ligamx/calendario/%s/%s/%s"

# Parallel column lists; one entry is appended to each per successfully parsed match.
matches_seasons = []
matches_seasonRounds = []
matches_rounds = []
matches_date = []
matches_time = []
matches_home_team = []
matches_result = []
matches_away_team = []
matches_venue = []

for season,seasonRound,_round in zip(seasons,seasonRounds,rounds):
    query_url = url % (season,seasonRound,_round)
    scrape = requests.get(query_url).text
    scrape_soup = BeautifulSoup(scrape, "lxml")
    calendar_groups = scrape_soup.findAll('div', {"class":"mt-calendar-group"})
    for calendar_group in calendar_groups:
        # Every group shares one date header for all of its matches.
        date = calendar_group.find('div', {"class":"calendar-date-wrapper"}).text.strip()
        match_wrappers = calendar_group.findAll('div', {"class":"mt-calendar-match"},recursive=False)
        for match_wrapper in match_wrappers:
            try:
                divs = match_wrapper.findAll('div')
                match_time = divs[0].text.strip()
                # The three links are, in order: home team, result, away team.
                _as = divs[1].findAll('a')
                home_team = _as[0].text.strip()
                result = _as[1].text.replace("\n", " ").strip()
                away_team = _as[2].text.strip()
                venue_div =  divs[1].find('div', {'class':'venue-wrapper'})
                venue = '' if venue_div is None else venue_div.text.strip()
            except Exception as exc:
                # Best effort: skip a malformed match but report why. Catching
                # Exception (not a bare except) keeps Ctrl-C / kernel interrupt
                # working during this long-running loop.
                print("Error", query_url, repr(exc))
                continue

            # Append only after the whole match parsed, so the lists stay aligned.
            matches_seasons.append(season)
            matches_seasonRounds.append(seasonRound)
            matches_rounds.append(_round)
            matches_date.append(date)
            matches_time.append(match_time)
            matches_home_team.append(home_team)
            matches_result.append(result)
            matches_away_team.append(away_team)
            matches_venue.append(venue)
print("Done scraping")

In [None]:
# Assemble the raw match table from the parallel lists filled by the scraper.
matches_df = pd.DataFrame({
    'season' : matches_seasons,
    'season_round': matches_seasonRounds,
    'round': matches_rounds,
    'date': matches_date,
    'time': matches_time,
    'home_team': matches_home_team,
    'result': matches_result,
    'away_team': matches_away_team,
    'venue': matches_venue
})

# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
matches_df.info()
matches_df.to_csv('mt-scraper/matches_raw.csv')

In [None]:
# Reload the raw scraped matches; the first CSV column is the saved row index.
matches_df = pd.read_csv('mt-scraper/matches_raw.csv', index_col=0)

# Spanish month names (lower-case) mapped to their month number.
month_dict = {
    'enero': 1,
    'febrero': 2,
    'marzo': 3,
    'abril':4,
    'mayo': 5,
    'junio': 6,
    'julio': 7,
    'agosto': 8,
    'septiembre': 9,
    'octubre': 10,
    'noviembre': 11,
    'diciembre': 12
}

# Groups: (3-char word) (day) "de" (month name), (4-digit year) (HH):(MM).
# Raw string so "\w" / "\s" are regex escapes, not invalid string escapes.
mt_date_re = re.compile(r'(\w{3})\s([0-9]+)\sde\s(\w+),\s([0-9]{4})\s([0-9]{2}):([0-9]{2})')
def parse_dates(text_date):
    """Convert a scraped "<date> <time>" string into a pandas Timestamp.

    Parameters
    ----------
    text_date : object
        Text matching ``mt_date_re``. Non-string values (e.g. the NaN produced
        when a column is empty after the CSV round-trip) are accepted.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``NaT`` when the input is not a string, does
        not match the pattern, names an unknown month, or is not a valid date.
        Unparseable values are printed so they can be inspected.
    """
    if not isinstance(text_date, str):
        return pd.NaT
    match = mt_date_re.search(text_date)
    if match is None:
        return pd.NaT
    # Month lookup is case-insensitive and tolerant of unknown names.
    month = month_dict.get(match.group(3).lower())
    if month is None:
        print(text_date)
        return pd.NaT
    day = int(match.group(2))
    year = int(match.group(4))
    hour = int(match.group(5))
    minute = int(match.group(6))
    dt_str = "%04d-%02d-%02d %02d:%02d" % (year,month,day,hour,minute)
    try:
        return pd.to_datetime(dt_str)
    except ValueError:
        # e.g. an impossible calendar date; report it and mark as missing.
        print(dt_str)
        return pd.NaT
# "<home goals> - <away goals>", with optional whitespace around the dash.
# Raw string so "\s" is a regex escape rather than an invalid string escape.
mt_score = re.compile(r'([0-9]+)\s*-\s*([0-9]+)')
def get_scores(raw_score):
    """Extract (home, away) goals from a scraped result string.

    Parameters
    ----------
    raw_score : object
        Result text such as "2 - 1". Non-string values (e.g. NaN for an empty
        CSV cell) are accepted.

    Returns
    -------
    tuple
        ``(int, int)`` when a score is found, otherwise ``(nan, nan)``.
    """
    if not isinstance(raw_score, str):
        return np.nan, np.nan
    match = mt_score.search(raw_score)
    if match is None:
        return np.nan, np.nan
    return int(match.group(1)), int(match.group(2))
    
    
# Join the scraped date and time text so each match has one string to parse.
date_time = matches_df['date'] + " " + matches_df["time"]
matches_df['match_datetime'] = date_time.apply(parse_dates)

# get_scores yields one (home, away) pair per match; transpose into two columns.
score_pairs = matches_df['result'].apply(get_scores)
home_scores, away_scores = zip(*score_pairs)
matches_df['home_score'] = home_scores
matches_df['away_score'] = away_scores
matches_df.tail()

# The raw 'date', 'time' and 'result' columns are kept alongside the parsed ones.
matches_df.to_csv('mt-scraper/matches_processed.csv')

# Rows whose date/time text could not be parsed.
missing_datetime = matches_df['match_datetime'].isnull()
no_date = matches_df[missing_datetime]
no_date.head(11)

In [None]:
# Reload the processed matches, parsing 'match_datetime' back into datetimes.
matches_processed_df = pd.read_csv('mt-scraper/matches_processed.csv', index_col=0, parse_dates=['match_datetime'])
matches_processed_df.info()

In [None]:
# Show (up to 11 of) the reloaded matches that have no parsed date/time.
no_date = matches_processed_df[matches_processed_df['match_datetime'].isnull()]
no_date.head(11)

In [None]:
# Get details about each match (insane):
# The snippet below is disabled: it is wrapped in a triple-quoted string that is
# assigned to a throwaway name, so none of it runs. The text builds a per-match
# "ficha" URL from the slugified "home vs away" string plus the match date, and
# falls back to an alternative URL pattern when the first one is not HTTP 200.
_ = '''
url = "http://www.mediotiempo.com/partido/futbol/ligamx/%s/%s/ficha"
url_alt = "http://www.mediotiempo.com/partido/futbol/liga-mx/%s/%s/ficha"
a = matches_processed_df[['home_team','away_team','match_datetime']].values
for r in a[4300:4305]:
    s = r[0] + " vs " + r[1]
    _url = url % (slugify.slugify(s), pd.to_datetime(r[2]).strftime("%Y/%m/%d"))
    rq = requests.get(_url)
    if rq.status_code != 200:
        _url = url_alt % (slugify.slugify(s), pd.to_datetime(r[2]).strftime("%Y/%m/%d"))
        rq = requests.get(_url)
        if rq.status_code != 200:
            rq = None
            _url = None
    
    if rq is not None:
        print(_url)
'''

In [None]:
# Distinct tournament identifiers, taken from the first level of the standings index.
all_tournaments = tournament_df.index.get_level_values(0).unique().values

In [None]:
# Ofensiva
of_url = "http://www.mediotiempo.com/liga/futbol/ligamx/estadisticas/equipos/%s/ofensiva?tabla=mas-goleadores"
# (column, <td> index, converter) for rows that carry the full set of statistics.
ofensiva_full_row = [
    ('GF', 3, int),
    ('TT', 1, int),
    ('TG', 2, int),
    ('PREC', 4, str),
    ('G_C', 5, float),
    ('LL', 6, int),
    ('LLA', 7, int),
    ('FDL', 8, int),
]
# One DataFrame of team offensive statistics per season that has data.
ofensiva_tables = {}
for season in all_tournaments:
    url = of_url % season
    r = requests.get(url)
    scrape_soup = BeautifulSoup(r.text, "lxml")
    table = scrape_soup.find('div', {'class': 'table-containers'})
    if table is None:
        continue
    table_body = table.find('div', {'class': 'scroll'}).find('tbody', {'class':'mt-table-body'})
    rows = table_body.findAll('tr')
    if not rows:
        continue

    # Column name -> list of values, in the column order of the final frame.
    stats = {'Equipo': [], 'GF': [], 'TT': [], 'TG': [], 'PREC': [],
             'G_C': [], 'LL': [], 'LLA': [], 'FDL': []}
    for row in rows:
        cells = [td.text.strip() for td in row.findAll('td')]
        stats['Equipo'].append(cells[0])
        if len(cells) == 2:
            # Two-cell rows only carry team and goals; the rest is missing.
            stats['GF'].append(int(cells[1]))
            for column in ('TT', 'TG', 'PREC', 'G_C', 'LL', 'LLA', 'FDL'):
                stats[column].append(np.nan)
        else:
            for column, position, convert in ofensiva_full_row:
                stats[column].append(convert(cells[position]))
    ofensiva_tables[season] = pd.DataFrame(stats)

In [None]:
# Defensiva
of_url = "http://www.mediotiempo.com/liga/futbol/ligamx/estadisticas/equipos/%s/defensiva?tabla=menos-goleados"
# Column -> <td> index for rows that carry the full set of (integer) statistics.
defensiva_full_row = {
    'TTP': 1,
    'TGP': 2,
    'GC': 3,
    'BLQ': 4,
    'TBLQ': 5,
    'CBLQ': 6,
    'PBLQ': 7,
    'INT': 8,
}
# One DataFrame of team defensive statistics per season that has data.
defensiva_tables = {}
for season in all_tournaments:
    url = of_url % season
    r = requests.get(url)
    scrape_soup = BeautifulSoup(r.text, "lxml")
    table = scrape_soup.find('div', {'class': 'table-containers'})
    if table is None:
        continue
    table_body = table.find('div', {'class': 'scroll'}).find('tbody', {'class':'mt-table-body'})
    rows = table_body.findAll('tr')
    if not rows:
        continue

    # Column name -> list of values, in the column order of the final frame.
    stats = {'Equipo': [], 'TTP': [], 'TGP': [], 'GC': [], 'BLQ': [],
             'TBLQ': [], 'CBLQ': [], 'PBLQ': [], 'INT': []}
    for row in rows:
        cells = [td.text.strip() for td in row.findAll('td')]
        stats['Equipo'].append(cells[0])
        if len(cells) == 2:
            # Two-cell rows only carry team and goals conceded; the rest is missing.
            stats['GC'].append(int(cells[1]))
            for column in ('TTP', 'TGP', 'BLQ', 'TBLQ', 'CBLQ', 'PBLQ', 'INT'):
                stats[column].append(np.nan)
        else:
            for column, position in defensiva_full_row.items():
                stats[column].append(int(cells[position]))
    defensiva_tables[season] = pd.DataFrame(stats)

In [None]:
# Write one CSV per season and table kind, but only for seasons present in both dicts.
for season in all_tournaments:
    if season not in defensiva_tables or season not in ofensiva_tables:
        continue
    csv_name = season +'.csv'
    defensiva_tables[season].to_csv('mt-scraper/defensiva/' + csv_name)
    ofensiva_tables[season].to_csv('mt-scraper/ofensiva/' + csv_name)