### Scraping live betting odds from [flashscore.pl/tenis](https://www.flashscore.pl/tenis/)

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as BS
import re
import requests
import json
from datetime import datetime
from datetime import timedelta
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 40)

In [2]:
# One HTTP session is shared by every request made while scraping
s = requests.Session()

# Request headers sent along with the calls to the flashscore data feed
headers = {
    "Accept":  "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "pl,en-US;q=0.7,en;q=0.3",
    "Connection": "keep-alive",
    "Host": "d.flashscore.pl",
    "Referer": "https://d.flashscore.pl/x/feed/proxy-fetch",
    "TE": "Trailers",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
    "x-fsign": "SW9D1eZo",
}

# Empty result table: one row per match, columns listed in their output order
data = pd.DataFrame(columns = [
    # match identification
    'event', 'match_time', 'player_1', 'player_2',
    # current score of both players and who is serving
    'player_1_score_sets', 'player_1_score_games', 'player_1_score_points',
    'player_2_score_sets', 'player_2_score_games', 'player_2_score_points',
    'serving',
    # player details
    'player_1_nationality', 'player_2_nationality',
    'player_1_rank', 'player_2_rank',
    'player_1_link', 'player_2_link',
    # betting odds, one pair of columns per bookmaker
    'player_1_eFortuna', 'player_2_eFortuna',
    'player_1_STS', 'player_2_STS',
    'player_1_Betclic', 'player_2_Betclic',
    'player_1_Betfan', 'player_2_Betfan',
    'player_1_Pzbuk', 'player_2_Pzbuk',
    'player_1_Lvbet', 'player_2_Lvbet',
    'player_1_Totolotek', 'player_2_Totolotek',
])

# Download the main-page feed with today's matches, including the current scores
main_page = s.get('https://d.flashscore.pl/x/feed/f_2_0_2_pl_1', headers = headers)

# Cut the feed into one chunk per match. (Note: the first chunk is not a match, but it does not matter,
# because it is dropped by the filter below.)
matches = main_page.text.split('¬~AA÷')

# It seems that ended/cancelled/interrupted matches carry the tag '¬AB÷3', the ongoing ones '¬AB÷2',
# and the future ones '¬AB÷1'. Only matches with active/future bets are kept.
matches = [entry for entry in matches if any(tag in entry for tag in ('¬AB÷1', '¬AB÷2'))]

# Seconds to wait for a response, so that a single stalled connection cannot hang the whole scrape
REQUEST_TIMEOUT = 10

# Time zone in which match start times are reported. A fixed "+2 hours" offset is only correct during
# summer time, so a named zone is used instead.
# NOTE(review): Europe/Warsaw is assumed from the site's locale -- confirm.
LOCAL_TZ = 'Europe/Warsaw'

# Bookmaker name as it appears in the odds feed -> suffix of the corresponding result columns
BOOKMAKER_COLUMNS = {
    'eFortuna.pl': 'eFortuna',
    'STS.pl': 'STS',
    'Betclic.pl': 'Betclic',
    'Betfan.pl': 'Betfan',
    'Lvbet.pl': 'Lvbet',
    'Pzbukpl': 'Pzbuk',
    'Totolotek.pl': 'Totolotek',
}


def _split_score(score_match):
    """Split a player's score segment into (sets, games, points).

    `score_match` is the result of `re.search` on the feed; when it is None (match not started)
    the defaults (0, '', '') are returned.
    """
    if not score_match:
        return 0, '', ''
    # Strip the 4-character tags surrounding the segment
    raw = score_match.group()[4:-4]
    # (Possibly remove the part of the score accounting for tie-break points.)
    raw = re.sub(r'¬D.÷\d+', '', raw)
    parts = re.split(r'¬.*?÷', raw)
    return parts[0], ' '.join(map(str, parts[1:-1])), parts[-1]


def _name_and_nationality(pattern, feed_text):
    """Return (name, nationality) read from the main-page feed, using NaN for whatever is missing.

    Nationality is only present for singles, where the feed gives it as 'Name (Nat)'.
    """
    found = re.search(pattern, feed_text)
    if found is None:
        return np.nan, np.nan
    parts = re.split(r' \(', found.group()[4:-4])
    nationality = parts[1][:-1] if len(parts) == 2 else np.nan
    return parts[0], nationality


def _fetch_odds(match_id):
    """Return (bookmakers, player_1_odds, player_2_odds) for the valid end-of-match odds.

    The three lists are parallel. Empty lists are returned when the odds feed cannot be
    downloaded or does not contain the end-of-match section.
    """
    try:
        response = s.get('https://d.flashscore.pl/x/feed/df_od_1_' + match_id,
                         headers = headers, timeout = REQUEST_TIMEOUT)
    except requests.RequestException:
        return [], [], []

    # Retrieve the part of the data including the end-of-match betting odds
    section = re.search('(home-away).*?(Set 1)', response.text)
    if section is None:
        return [], [], []

    # It seems that invalid (crossed-out) odds are those that end with '¬OG÷0', and the valid ones
    # end with '¬OG÷1'.
    entries = re.findall('¬OD÷.*?¬OG÷.', section.group())
    valid_odds = ''.join(entry for entry in entries if entry[-1] == '1')

    # Bookmaker names, stripped of the surrounding tags
    bookmakers = [name[4:-4] for name in re.findall('¬OD.*?¬OPI', valid_odds)]

    # Strip the odds from the surrounding tags, split on the last change of the odds,
    # and take the last (current) value
    player_1_odds = [re.split(r'\[.\]', odds[4:-3])[-1] for odds in re.findall('¬XB÷.*?¬XC', valid_odds)]
    player_2_odds = [re.split(r'\[.\]', odds[4:-3])[-1] for odds in re.findall('¬XC÷.*?¬OG', valid_odds)]
    return bookmakers, player_1_odds, player_2_odds


# Rows are collected in a list and turned into a data frame once, after the loop: `DataFrame.append`
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
rows = []

for match in matches:

    new_row = {}

    # Retrieve the match ID that is used for retrieving detailed information for the match (betting odds)
    match_id = re.search('.*?¬AD÷', match).group()[:-4]
    url = 'https://www.flashscore.pl/mecz/' + match_id + '/#zestawienie-kursow/home-away/koniec-meczu'

    # Retrieve the match start time (a Unix timestamp in the feed) and express it in LOCAL_TZ
    match_time = re.search('¬AD÷.*?¬ADE÷', match).group()[4:-5]
    match_time = pd.Timestamp(int(match_time), unit = 's', tz = LOCAL_TZ)
    new_row['match_time'] = match_time.strftime('%d-%m-%Y %H:%M:%S')

    # Retrieve the current match score from the feed and split it into sets, games and points
    # (if the match has not started, the search finds nothing and defaults are used)
    (new_row['player_1_score_sets'],
     new_row['player_1_score_games'],
     new_row['player_1_score_points']) = _split_score(re.search('¬AG÷.*?¬OA÷', match))
    (new_row['player_2_score_sets'],
     new_row['player_2_score_games'],
     new_row['player_2_score_points']) = _split_score(re.search('¬AH÷.*?¬OB÷', match))

    # Retrieve information on who is serving right now. It seems that '¬WC÷1' signifies that it is the
    # 1st player, and '¬WC÷2' - the 2nd player.
    if '¬WC÷1' in match:
        new_row['serving'] = 'player_1'
    elif '¬WC÷2' in match:
        new_row['serving'] = 'player_2'
    else:
        new_row['serving'] = np.nan

    try:
        html = s.get(url, timeout = REQUEST_TIMEOUT)
        html.raise_for_status()
        match_page = BS(html.content, 'html.parser')
        meta_tags = match_page.head.find_all('meta')

        # Retrieve the event title from the match's page
        new_row['event'] = meta_tags[5]['content']

        # Retrieve players' names
        players = re.split(' - ', meta_tags[4]['content'])
        new_row['player_1'] = players[0]
        new_row['player_2'] = players[1]

        # Retrieve the JSON part containing some details on the players
        script = match_page.find_all('script')[1].string
        details = json.loads(re.search(r'\{.*\}', script).group())

        details_player_1 = details['participantsData']['home']
        details_player_2 = details['participantsData']['away']

        # Retrieve players' nationality
        # In the case of doubles, separate the information by ' / '.
        new_row['player_1_nationality'] = ' / '.join([player['country'] for player in details_player_1])
        new_row['player_2_nationality'] = ' / '.join([player['country'] for player in details_player_2])

        # Retrieve players' rankings
        # In the case of doubles, separate the information by ' / '.
        new_row['player_1_rank'] = ' / '.join([player['rank'][1] if player['rank']!=[] else ''
                                               for player in details_player_1])
        new_row['player_2_rank'] = ' / '.join([player['rank'][1] if player['rank']!=[] else ''
                                               for player in details_player_2])

        # Retrieve links to players' pages on flashscore.pl
        # In the case of doubles, separate the information by ' / '.
        new_row['player_1_link'] = ' / '.join(['https://www.flashscore.pl' + player['detail_link']
                                               for player in details_player_1])
        new_row['player_2_link'] = ' / '.join(['https://www.flashscore.pl' + player['detail_link']
                                               for player in details_player_2])

    except Exception:
        # Deliberately broad: the download or any step of parsing the page layout may fail. `Exception`
        # (rather than a bare `except`) keeps Ctrl-C / kernel interrupts working.
        new_row['event'] = np.nan
        # (For players' names and nationalities we can use another method of retrieving this information,
        # not using the match_page, but the main_page. Nationalities are given there for singles only.)
        new_row['player_1'], new_row['player_1_nationality'] = _name_and_nationality('¬AE÷.*?¬JA÷', match)
        new_row['player_2'], new_row['player_2_nationality'] = _name_and_nationality('¬AF÷.*?¬JB÷', match)

        new_row['player_1_rank'] = np.nan
        new_row['player_2_rank'] = np.nan
        new_row['player_1_link'] = np.nan
        new_row['player_2_link'] = np.nan

    # Assign betting odds for the 7 possible bookmakers, for both players; NaN where a bookmaker
    # offers no valid odds for this match.
    bookmakers, player_1_odds, player_2_odds = _fetch_odds(match_id)
    for feed_name, column in BOOKMAKER_COLUMNS.items():
        position = bookmakers.index(feed_name) if feed_name in bookmakers else None
        for player, odds in (('player_1', player_1_odds), ('player_2', player_2_odds)):
            if position is not None and position < len(odds):
                new_row[player + '_' + column] = odds[position]
            else:
                new_row[player + '_' + column] = np.nan

    rows.append(new_row)

# Build the new rows in the column order of `data`; object dtype keeps the values exactly as scraped
new_rows = pd.DataFrame(rows, columns = data.columns, dtype = object)
data = new_rows if data.empty else pd.concat([data, new_rows], ignore_index = True)

scraping_time = datetime.now()

In [3]:
# Display the scraped table (one row per match)
data

Unnamed: 0,event,match_time,player_1,player_2,player_1_score_sets,player_1_score_games,player_1_score_points,player_2_score_sets,player_2_score_games,player_2_score_points,serving,player_1_nationality,player_2_nationality,player_1_rank,player_2_rank,player_1_link,player_2_link,player_1_eFortuna,player_2_eFortuna,player_1_STS,player_2_STS,player_1_Betclic,player_2_Betclic,player_1_Betfan,player_2_Betfan,player_1_Pzbuk,player_2_Pzbuk,player_1_Lvbet,player_2_Lvbet,player_1_Totolotek,player_2_Totolotek
0,"ATP - SINGIEL: Belgrade 2 (Serbia), ziemna - Ć...",27-05-2021 15:30:00,Andrej Martin,Dusan Lajovic,0,2,0.0,0,3,15.0,player_1,Svk,Srb,119,39,https://www.flashscore.pl/zawodnik/martin-andr...,https://www.flashscore.pl/zawodnik/lajovic-dus...,3.3,1.36,3.25,1.35,3.45,1.32,3.45,1.31,3.45,1.32,3.15,1.31,3.04,1.33
1,"ATP - SINGIEL: Belgrade 2 (Serbia), ziemna - Ć...",27-05-2021 17:00:00,Federico Delbonis,Roberto Carballes Baena,0,,,0,,,,Arg,Esp,52,98,https://www.flashscore.pl/zawodnik/delbonis-fe...,https://www.flashscore.pl/zawodnik/carballes-b...,1.5,2.7,1.45,2.75,1.41,2.95,1.39,3.0,1.42,2.95,1.37,2.85,1.41,2.69
2,ATP - SINGIEL: French Open (Francja) - Kwalifi...,27-05-2021 15:35:00,Taro Daniel,Quentin Halys,0,1,0.0,0,3,0.0,player_2,Jpn,Fra,112,222,https://www.flashscore.pl/zawodnik/daniel-taro...,https://www.flashscore.pl/zawodnik/halys-quent...,1.38,3.1,1.37,3.05,,,1.36,3.15,1.36,3.15,1.37,2.9,1.35,2.94
3,ATP - SINGIEL: French Open (Francja) - Kwalifi...,27-05-2021 15:35:00,Hugo Dellien,Oscar Otte,0,1,0.0,0,2,0.0,player_1,Bol,Ger,124,152,https://www.flashscore.pl/zawodnik/dellien-hug...,https://www.flashscore.pl/zawodnik/otte-oscar/...,2.55,1.52,2.5,1.52,2.5,1.54,2.59,1.5,2.5,1.54,2.5,1.48,2.29,1.55
4,"ATP - SINGIEL: Parma (Włochy), ziemna - Ćwierć...",27-05-2021 16:00:00,Sebastian Korda,Yoshihito Nishioka,0,,,0,,,,Usa,Jpn,63,60,https://www.flashscore.pl/zawodnik/korda-sebas...,https://www.flashscore.pl/zawodnik/nishioka-yo...,1.53,2.65,1.45,2.7,1.4,2.95,1.44,2.8,1.45,2.85,1.42,2.65,1.41,2.69
5,WTA - SINGIEL: French Open (Francja) - Kwalifi...,27-05-2021 14:50:00,Jaqueline Adina Cristian,Anhelina Kalinina,0,4 0,30.0,1,6 3,40.0,player_2,Rou,Ukr,152,139,https://www.flashscore.pl/zawodnik/cristian-ja...,https://www.flashscore.pl/zawodnik/kalinina-an...,3.65,1.29,3.55,1.27,3.3,1.34,3.75,1.27,3.7,1.27,3.5,1.27,3.04,1.33
6,WTA - SINGIEL: French Open (Francja) - Kwalifi...,27-05-2021 15:40:00,Aleksandra Krunic,Olga Goworcowa,0,3,0.0,0,1,0.0,player_1,Srb,Blr,212,138,https://www.flashscore.pl/zawodnik/krunic-alek...,https://www.flashscore.pl/zawodnik/gworcowa-ol...,1.41,2.95,1.4,2.9,1.41,2.95,1.38,3.05,1.39,3.0,1.38,2.85,1.4,2.73
7,"WTA - SINGIEL: Strasbourg (Francja), ziemna - ...",27-05-2021 15:05:00,Barbora Krejcikova,Jekatierina Aleksandrowa,0,6,0.0,0,6,0.0,player_1,Cze,Rus,38,33,https://www.flashscore.pl/zawodnik/krejcikova-...,https://www.flashscore.pl/zawodnik/aleksandrow...,1.65,2.31,1.67,2.15,1.68,2.19,1.61,2.33,1.66,2.3,1.7,2.05,1.7,2.03
8,"WTA - SINGIEL: Strasbourg (Francja), ziemna - ...",27-05-2021 17:30:00,Jule Niemeier,Arantxa Rus,0,,,0,,,,Ger,Ned,216,84,https://www.flashscore.pl/zawodnik/niemeier-ju...,https://www.flashscore.pl/zawodnik/rus-arantxa...,2.02,1.8,1.95,1.8,2.02,1.8,2.05,1.77,2.05,1.83,1.96,1.76,2.0,1.72
9,"ATP - DEBEL: Belgrade 2 (Serbia), ziemna - Pół...",27-05-2021 15:00:00,Ivan Sabanov / Matej Sabanov,Jonathan Erlich / Andriej Wasilewskij,0,3 2,30.0,1,6 1,30.0,player_2,/,/,/ 916,/,https://www.flashscore.pl/zawodnik/sabanov-iva...,https://www.flashscore.pl/zawodnik/erlich-jona...,1.72,2.13,1.7,2.1,1.69,2.1,1.69,2.17,1.8,1.95,,,1.74,1.98


In [4]:
# Inspect the column dtypes of the scraped table
data.dtypes

event                    object
match_time               object
player_1                 object
player_2                 object
player_1_score_sets      object
player_1_score_games     object
player_1_score_points    object
player_2_score_sets      object
player_2_score_games     object
player_2_score_points    object
serving                  object
player_1_nationality     object
player_2_nationality     object
player_1_rank            object
player_2_rank            object
player_1_link            object
player_2_link            object
player_1_eFortuna        object
player_2_eFortuna        object
player_1_STS             object
player_2_STS             object
player_1_Betclic         object
player_2_Betclic         object
player_1_Betfan          object
player_2_Betfan          object
player_1_Pzbuk           object
player_2_Pzbuk           object
player_1_Lvbet           object
player_2_Lvbet           object
player_1_Totolotek       object
player_2_Totolotek       object
dtype: o

In [5]:
# Columns that have to stay textual, picked by hand:
#  - `player_1_score_games` / `player_2_score_games` hold the games won in the subsequent sets,
#    separated by a space, so they are not integers;
#  - `player_1_score_points` / `player_2_score_points` may contain 'A' (advantage);
#  - `player_1_rank` / `player_2_rank` contain an expression 'x / y' for doubles.
character_columns = [
    'event', 'match_time', 'player_1', 'player_2',
    'player_1_score_games', 'player_2_score_games',
    'player_1_score_points', 'player_2_score_points',
    'serving',
    'player_1_nationality', 'player_2_nationality',
    'player_1_rank', 'player_2_rank',
    'player_1_link', 'player_2_link',
]

# Every remaining column is numeric: convert it to its proper (float/integer) type
for numeric_column in data.columns.drop(character_columns):
    data[numeric_column] = pd.to_numeric(data[numeric_column], downcast = 'integer')

In [6]:
# Check the column dtypes again after the numeric conversion
data.dtypes

event                     object
match_time                object
player_1                  object
player_2                  object
player_1_score_sets         int8
player_1_score_games      object
player_1_score_points     object
player_2_score_sets         int8
player_2_score_games      object
player_2_score_points     object
serving                   object
player_1_nationality      object
player_2_nationality      object
player_1_rank             object
player_2_rank             object
player_1_link             object
player_2_link             object
player_1_eFortuna        float64
player_2_eFortuna        float64
player_1_STS             float64
player_2_STS             float64
player_1_Betclic         float64
player_2_Betclic         float64
player_1_Betfan          float64
player_2_Betfan          float64
player_1_Pzbuk           float64
player_2_Pzbuk           float64
player_1_Lvbet           float64
player_2_Lvbet           float64
player_1_Totolotek       float64
player_2_T

In [7]:
# Export the data, naming the file after the time scraping was finished.
# The formatted stamp gets its own name, so that `scraping_time` stays a datetime and this cell
# can be re-run without failing on `str.strftime`.
scraping_stamp = scraping_time.strftime("%d%m%y_%H%M%S")
data.to_csv('data_' + scraping_stamp + '.csv', index = False)

In [None]:
# Note: the retrieved match score would not be correct for matches that have already finished. However, since we
# are only interested in matches that one can still bet on, the finished matches are filtered out.