### Scraping live betting odds from [flashscore.pl/tenis](https://www.flashscore.pl/tenis/)

In [30]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as BS
import re
import requests
import json
from datetime import datetime
from datetime import timedelta
# Raise the notebook display limits so that wide/long scraped frames are not truncated
pd.options.display.max_rows = 200
pd.options.display.max_columns = 40

In [31]:
# Prepare an empty data frame. The columns are: general match information, the score of both
# players, the serving player, per-player details and finally one (player_1, player_2) pair of
# odds columns for every tracked bookmaker.
match_columns = ['event', 'match_time', 'player_1', 'player_2']
score_columns = ['player_' + str(number) + '_score_' + part
                 for number in (1, 2)
                 for part in ('sets', 'games', 'points')]
player_columns = ['player_' + str(number) + '_' + field
                  for field in ('nationality', 'rank', 'link')
                  for number in (1, 2)]
odds_columns = ['player_' + str(number) + '_' + bookmaker
                for bookmaker in ('eFortuna', 'STS', 'Betclic', 'Betfan',
                                  'Pzbuk', 'Lvbet', 'Totolotek')
                for number in (1, 2)]

data = pd.DataFrame(columns = match_columns + score_columns + ['serving']
                              + player_columns + odds_columns)

# Retrieve the data file behind the main page for today's matches, including current scores.
# The headers below are sent with every request to the feed host (d.flashscore.pl).
# NOTE(review): "x-fsign" is presumably a signature the feed expects and may change over time - verify.
# NOTE(review): "Accept-Encoding" advertises "br"; requests can only decode Brotli responses when a
# Brotli package is installed - confirm it is available in this environment.
headers = {"Accept":  "*/*",
           "Accept-Encoding": "gzip, deflate, br",
           "Accept-Language": "pl,en-US;q=0.7,en;q=0.3",
           "Connection": "keep-alive",
           "Host": "d.flashscore.pl",
           "Referer": "https://d.flashscore.pl/x/feed/proxy-fetch",
           "TE": "Trailers",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
           "x-fsign": "SW9D1eZo"
          }

# One session is reused for all the requests made by the notebook
s = requests.Session()

# A timeout keeps the notebook from hanging forever on a stalled connection (requests has no
# default timeout), and the status check surfaces HTTP errors here instead of letting an error
# page be silently parsed into an empty list of matches.
main_page = s.get('https://d.flashscore.pl/x/feed/f_2_0_2_pl_1',
                  headers = headers,
                  timeout = 30)
main_page.raise_for_status()

# Cut the feed into per-match records. (Note: the chunk before the first separator is not a match,
# but it does not matter, because it is discarded by the status filter below.)
match_separator = '¬~AA÷'
matches = main_page.text.split(match_separator)

# It seems that the ended/cancelled/interrupted matches contain the tag '¬AB÷3', the current ones - '¬AB÷2',
# and the future ones - '¬AB÷1'. Only the matches for which there are active/future bets are kept.
open_status_tags = ('¬AB÷1', '¬AB÷2')
matches = [record for record in matches
           if any(tag in record for tag in open_status_tags)]

def _split_score(score_match):
    """Turn a regex match on one player's score fragment into (sets, games, points).

    `score_match` is the result of `re.search` on a match record, or None when no score
    fragment was found (match not started); in that case the score defaults to 0 sets
    and empty games/points.
    """
    if score_match is None:
        return 0, '', ''
    # Strip the 4-character tags enclosing the fragment
    score = score_match.group()[4:-4]
    # (Possibly remove the part of the score accounting for tie-break points.)
    score = re.sub(r'¬D.÷\d+', '', score)
    fields = re.split(r'¬.*?÷', score)
    return fields[0], ' '.join(map(str, fields[1:-1])), fields[-1]


def _player_from_feed(record, pattern):
    """Return (name, nationality) of a player read from the main-page record.

    The fragment has the form 'Name (Nat)' for singles; for doubles no nationality is
    given, so NaN is returned for it. If the fragment is missing altogether, both values
    are NaN instead of raising (this runs inside an exception handler).
    """
    found = re.search(pattern, record)
    if found is None:
        return np.nan, np.nan
    parts = re.split(r' \(', found.group()[4:-4])
    nationality = parts[1][:-1] if len(parts) == 2 else np.nan
    return parts[0], nationality


def _parse_final_odds(odds_feed):
    """Return {bookmaker name: (player 1 odds, player 2 odds)} with the valid end-of-match odds.

    Every odds entry is parsed on its own, so that an entry lacking one of the fields cannot
    shift the odds of the following bookmakers. For a bookmaker listed more than once the first
    entry wins. An empty dict is returned if the feed has no end-of-match section.
    """
    # Retrieve the part of data including end of match betting odds
    section = re.search(r'(home-away).*?(Set 1)', odds_feed)
    if section is None:
        return {}

    odds = {}
    for entry in re.findall(r'¬OD÷.*?¬OG÷.', section.group()):
        # It seems that invalid (crossed-out) odds are those that end with '¬OG÷0',
        # and the valid ones end with '¬OG÷1'.
        if entry[-1] != '1':
            continue
        name = re.search(r'¬OD.*?¬OPI', entry)
        first = re.search(r'¬XB÷.*?¬XC', entry)
        second = re.search(r'¬XC÷.*?¬OG', entry)
        if name is None or first is None or second is None:
            continue
        # Strip the enclosing tags, split on the last change of the odds and take the last value
        odds.setdefault(name.group()[4:-4],
                        (re.split(r'\[.\]', first.group()[4:-3])[-1],
                         re.split(r'\[.\]', second.group()[4:-3])[-1]))
    return odds


# The 7 tracked bookmakers (as named in the odds feed) mapped to the suffix of their columns.
# (Regex is used to produce an appropriate form of a column name, e.g. 'STS.pl' -> 'STS'.)
bookmaker_columns = {name: re.search(r'\w*(?=[\.p])', name).group()
                     for name in ['eFortuna.pl', 'STS.pl', 'Betclic.pl', 'Betfan.pl',
                                  'Lvbet.pl', 'Pzbukpl', 'Totolotek.pl']}

# Seconds to wait for a response before giving up on a single request
request_timeout = 30

# Rows are collected in a list and the frame is built once at the end: `DataFrame.append` was
# removed in pandas 2.0 and growing a frame row by row is quadratic anyway.
rows = []

for match in matches:

    new_row = {}

    # Retrieve match ID that will be used for retrieving detailed information for the match (betting odds)
    match_id = re.search(r'.*?¬AD÷', match).group()[:-4]
    url = 'https://www.flashscore.pl/mecz/' + match_id + '/#zestawienie-kursow/home-away/koniec-meczu'

    # Retrieve match time (a Unix timestamp in the feed) and add 2 hours (time zone difference).
    # Epoch arithmetic gives the same naive UTC value as the deprecated `datetime.utcfromtimestamp`.
    # NOTE(review): the fixed +2 h offset presumably targets Polish summer time; it would be off by
    # one hour during winter time - confirm and consider a proper time-zone conversion.
    start_epoch = int(re.search(r'¬AD÷.*?¬ADE÷', match).group()[4:-5])
    match_time = datetime(1970, 1, 1) + timedelta(seconds = start_epoch, hours = 2)
    new_row['match_time'] = match_time.strftime('%d-%m-%Y %H:%M:%S')

    # Retrieve current match score from the record and split it into sets, games and points
    for player, score_pattern in (('player_1', r'¬AG÷.*?¬OA÷'), ('player_2', r'¬AH÷.*?¬OB÷')):
        sets, games, points = _split_score(re.search(score_pattern, match))
        new_row[player + '_score_sets'] = sets
        new_row[player + '_score_games'] = games
        new_row[player + '_score_points'] = points

    # Retrieve information on who is serving right now. It seems that '¬WC÷1' signifies that it is the 1st player,
    # and '¬WC÷2' - the 2nd player.
    if '¬WC÷1' in match:
        new_row['serving'] = 'player_1'
    elif '¬WC÷2' in match:
        new_row['serving'] = 'player_2'
    else:
        new_row['serving'] = np.nan

    try:
        html = s.get(url, timeout = request_timeout)
        html.raise_for_status()
        match_page = BS(html.content, 'html.parser')
        meta_tags = match_page.head.find_all('meta')

        # Retrieve the event title from the match's site
        new_row['event'] = meta_tags[5]['content']

        # Retrieve players' names
        players = re.split(' - ', meta_tags[4]['content'])
        new_row['player_1'] = players[0]
        new_row['player_2'] = players[1]

        # Retrieve a JSON part containing some details on the players
        script = match_page.find_all('script')[1].string
        details = json.loads(re.search(r'\{.*\}', script).group())

        details_player_1 = details['participantsData']['home']
        details_player_2 = details['participantsData']['away']

        # Retrieve players' nationality
        # In the case of doubles, separate the information by ' / '.
        new_row['player_1_nationality'] = ' / '.join([player['country'] for player in details_player_1])
        new_row['player_2_nationality'] = ' / '.join([player['country'] for player in details_player_2])

        # Retrieve players' rankings
        # In the case of doubles, separate the information by ' / '.
        new_row['player_1_rank'] = ' / '.join([player['rank'][1] if player['rank']!=[] else ''
                                               for player in details_player_1])
        new_row['player_2_rank'] = ' / '.join([player['rank'][1] if player['rank']!=[] else ''
                                               for player in details_player_2])

        # Retrieve links to players' pages on flashscore.pl
        # In the case of doubles, separate the information by ' / '.
        new_row['player_1_link'] = ' / '.join(['https://www.flashscore.pl' + player['detail_link']
                                               for player in details_player_1])
        new_row['player_2_link'] = ' / '.join(['https://www.flashscore.pl' + player['detail_link']
                                               for player in details_player_2])

    except Exception:
        # Best effort: the match page could not be retrieved or parsed. `Exception` (rather than a
        # bare `except`) keeps Ctrl-C / kernel interrupts working.
        new_row['event'] = np.nan
        # (For players' names and nationalities we can use another method of retrieving this information,
        # not using the match_page, but the main_page. Singles have the nationality on the main page,
        # doubles do not.)
        new_row['player_1'], new_row['player_1_nationality'] = _player_from_feed(match, r'¬AE÷.*?¬JA÷')
        new_row['player_2'], new_row['player_2_nationality'] = _player_from_feed(match, r'¬AF÷.*?¬JB÷')

        new_row['player_1_rank'] = np.nan
        new_row['player_2_rank'] = np.nan
        new_row['player_1_link'] = np.nan
        new_row['player_2_link'] = np.nan

    try:
        # Retrieve a file including betting odds from the match's site
        odds_response = s.get('https://d.flashscore.pl/x/feed/df_od_1_' + match_id,
                              headers = headers, timeout = request_timeout)
        final_odds = _parse_final_odds(odds_response.text)
    except Exception:
        # Best effort: without the odds feed every bookmaker column of this match stays empty
        final_odds = {}

    # Assign betting odds for the 7 possible bookmakers, for both players (NaN when not offered)
    for bookmaker, column in bookmaker_columns.items():
        odds_player_1, odds_player_2 = final_odds.get(bookmaker, (np.nan, np.nan))
        new_row['player_1_' + column] = odds_player_1
        new_row['player_2_' + column] = odds_player_2

    rows.append(new_row)

# Build the frame in one go, keeping the prepared column order and the generic object dtype
# (numeric columns are converted to their proper types later on).
data = pd.DataFrame(rows, columns = data.columns, dtype = object)

scraping_time = datetime.now()

In [32]:
# Display the scraped frame (one row per match kept by the status filter)
data

Unnamed: 0,event,match_time,player_1,player_2,player_1_score_sets,player_1_score_games,player_1_score_points,player_2_score_sets,player_2_score_games,player_2_score_points,serving,player_1_nationality,player_2_nationality,player_1_rank,player_2_rank,player_1_link,player_2_link,player_1_eFortuna,player_2_eFortuna,player_1_STS,player_2_STS,player_1_Betclic,player_2_Betclic,player_1_Betfan,player_2_Betfan,player_1_Pzbuk,player_2_Pzbuk,player_1_Lvbet,player_2_Lvbet,player_1_Totolotek,player_2_Totolotek
0,"ATP - SINGIEL: Madryt (Hiszpania), ziemna - 1/...",06-05-2021 15:30:00,John Isner,Andriej Rublow,1,7 2,0.0,0,6 4,0.0,player_1,Usa,Rus,39,7,https://www.flashscore.pl/zawodnik/isner-john/...,https://www.flashscore.pl/zawodnik/rublow-andr...,3.1,1.31,3.55,1.3,,,3.7,1.28,3.6,1.29,3.4,1.28,3.71,1.25
1,"ATP - SINGIEL: Madryt (Hiszpania), ziemna - 1/...",06-05-2021 17:00:00,Casper Ruud,Stefanos Tsitsipas,0,,,0,,,,Nor,Gre,22,5,https://www.flashscore.pl/zawodnik/ruud-casper...,https://www.flashscore.pl/zawodnik/tsitsipas-s...,4.4,1.23,4.0,1.25,,,4.55,1.2,4.45,1.21,4.15,1.2,3.9,1.23
2,"ATP - SINGIEL: Madryt (Hiszpania), ziemna - 1/...",06-05-2021 18:30:00,Matteo Berrettini,Federico Delbonis,0,,,0,,,,Ita,Arg,10,77,https://www.flashscore.pl/zawodnik/berrettini-...,https://www.flashscore.pl/zawodnik/delbonis-fe...,1.3,3.15,1.25,4.0,1.23,4.2,1.23,4.25,1.21,4.5,1.2,4.15,1.25,3.71
3,"ATP - SINGIEL: Madryt (Hiszpania), ziemna - 1/...",06-05-2021 19:00:00,Daniel Evans,Alexander Zverev,0,,,0,,,,Gbr,Ger,26,6,https://www.flashscore.pl/zawodnik/evans-danie...,https://www.flashscore.pl/zawodnik/zverev-alex...,4.1,1.26,4.35,1.22,4.5,1.21,4.7,1.19,4.65,1.19,4.3,1.19,4.23,1.2
4,"WTA - SINGIEL: Madryt (Hiszpania), ziemna - Pó...",06-05-2021 20:30:00,Aryna Sabalenka,Anastazja Pawluczenkowa,0,,,0,,,,Blr,Rus,7,41,https://www.flashscore.pl/zawodnik/sabalenka-a...,https://www.flashscore.pl/zawodnik/pawluczenko...,1.23,4.7,1.22,4.35,1.21,4.5,1.19,4.65,1.21,4.6,1.21,4.05,1.2,4.04
5,"ATP - DEBEL: Madryt (Hiszpania), ziemna - 1/8-...",06-05-2021 16:45:00,Alexander Bublik / Cristian Garin,Sander Gille / Joran Vliegen,0,,,0,,,,/,/,44 / 25,/,https://www.flashscore.pl/zawodnik/bublik-alex...,https://www.flashscore.pl/zawodnik/gille-sande...,2.85,1.43,2.75,1.42,,,2.93,1.41,2.8,1.39,2.95,1.36,2.94,1.35
6,"ATP - DEBEL: Madryt (Hiszpania), ziemna - 1/8-...",06-05-2021 16:45:00,Raven Klaasen / Ben McLachlan,Pierre-Hugues Herbert / Nicolas Mahut,0,,,0,,,,/,/,/,85 / 246,https://www.flashscore.pl/zawodnik/klaasen-rav...,https://www.flashscore.pl/zawodnik/herbert-pie...,3.25,1.35,3.2,1.32,3.2,1.32,3.35,1.33,3.1,1.32,3.05,1.34,3.41,1.27
7,CHALLENGER MĘŻCZYŹNI - SINGIEL: Biella 5 (Włoc...,06-05-2021 14:40:00,Jay Clarke,Guido Andreozzi,1,6 3 2,30.0,1,3 6 2,30.0,player_1,Gbr,Arg,233,221,https://www.flashscore.pl/zawodnik/clarke-jay/...,https://www.flashscore.pl/zawodnik/andreozzi-g...,2.65,1.48,2.55,1.45,2.55,1.45,2.62,1.47,2.5,1.52,2.4,1.47,2.35,1.5
8,CHALLENGER MĘŻCZYŹNI - SINGIEL: Biella 5 (Włoc...,06-05-2021 15:20:00,Joao Menezes,Juan Pablo Varillas,0,4 2,0.0,1,6 2,30.0,player_2,Bra,Per,199,150,https://www.flashscore.pl/zawodnik/menezes-joa...,https://www.flashscore.pl/zawodnik/varillas-ju...,2.85,1.43,2.75,1.4,2.75,1.4,2.81,1.42,2.9,1.39,2.75,1.37,2.65,1.4
9,CHALLENGER MĘŻCZYŹNI - SINGIEL: Biella 5 (Włoc...,06-05-2021 17:00:00,Leonardo Mayer,Alexandre Muller,0,,,0,,,,Arg,Fra,160,202,https://www.flashscore.pl/zawodnik/mayer-leona...,https://www.flashscore.pl/zawodnik/muller-alex...,1.66,2.24,1.62,2.15,,,1.65,2.21,1.64,2.2,1.61,2.14,1.6,2.14


In [33]:
# Inspect the column dtypes of the freshly scraped frame
data.dtypes

event                    object
match_time               object
player_1                 object
player_2                 object
player_1_score_sets      object
player_1_score_games     object
player_1_score_points    object
player_2_score_sets      object
player_2_score_games     object
player_2_score_points    object
serving                  object
player_1_nationality     object
player_2_nationality     object
player_1_rank            object
player_2_rank            object
player_1_link            object
player_2_link            object
player_1_eFortuna        object
player_2_eFortuna        object
player_1_STS             object
player_2_STS             object
player_1_Betclic         object
player_2_Betclic         object
player_1_Betfan          object
player_2_Betfan          object
player_1_Pzbuk           object
player_2_Pzbuk           object
player_1_Lvbet           object
player_2_Lvbet           object
player_1_Totolotek       object
player_2_Totolotek       object
dtype: object

In [35]:
# Hand-picked columns that must stay textual:
# - `player_1_score_games` / `player_2_score_games` hold the games won in the subsequent sets,
#   separated by a space, so they are not integers;
# - `player_1_score_points` / `player_2_score_points` may contain 'A' (advantage);
# - `player_1_rank` / `player_2_rank` contain an expression 'x / y' for doubles.
character_columns = ['event', 'match_time', 'player_1', 'player_2', 'player_1_score_games', 'player_2_score_games',
                    'player_1_score_points', 'player_2_score_points', 'serving', 'player_1_nationality',
                    'player_2_nationality', 'player_1_rank', 'player_2_rank', 'player_1_link', 'player_2_link']

# Every remaining column is numeric (float/integer): convert it to its proper type,
# using the smallest integer type wherever the values allow it
for numeric_column in data.columns.drop(character_columns):
    data[numeric_column] = pd.to_numeric(data[numeric_column], downcast = 'integer')

In [36]:
# Check the column dtypes again after the numeric conversion
data.dtypes

event                     object
match_time                object
player_1                  object
player_2                  object
player_1_score_sets         int8
player_1_score_games      object
player_1_score_points     object
player_2_score_sets         int8
player_2_score_games      object
player_2_score_points     object
serving                   object
player_1_nationality      object
player_2_nationality      object
player_1_rank             object
player_2_rank             object
player_1_link             object
player_2_link             object
player_1_eFortuna        float64
player_2_eFortuna        float64
player_1_STS             float64
player_2_STS             float64
player_1_Betclic         float64
player_2_Betclic         float64
player_1_Betfan          float64
player_2_Betfan          float64
player_1_Pzbuk           float64
player_2_Pzbuk           float64
player_1_Lvbet           float64
player_2_Lvbet           float64
player_1_Totolotek       float64
player_2_Totolotek       float64
dtype: object

In [37]:
# Export the data, naming the file after the time scraping was finished.
# The formatted stamp gets its own name: `scraping_time` stays a datetime, so this cell can be
# re-run without raising AttributeError (a str has no `strftime`).
scraping_stamp = scraping_time.strftime("%d%m%y_%H%M%S")
data.to_csv('data_' + scraping_stamp + '.csv', index = False)

In [None]:
# Note: the retrieved match result would not be correct for matches that have already finished. However, since we
# are only interested in matches that one can still bet on, the finished matches are filtered out beforehand.