### Scraping live betting odds from [flashscore.pl/tenis](https://www.flashscore.pl/tenis/)

In [19]:
import numpy as np
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup as BS
import re
import requests
import json
from datetime import datetime
from datetime import timedelta
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 30)

In [20]:
# Prepare a data frame
data = pd.DataFrame(columns = ['event', 'match_time', 'player_1', 'player_2', 
                               'player_1_score_sets', 'player_1_score_games', 'player_1_score_points',
                               'player_2_score_sets', 'player_2_score_games', 'player_2_score_points', 
                               'serving',
                               'player_1_nationality', 'player_2_nationality', 
                               'player_1_rank', 'player_2_rank',
                               'player_1_link', 'player_2_link',
                               'player_1_eFortuna', 'player_2_eFortuna',
                               'player_1_STS', 'player_2_STS',
                               'player_1_Betclic', 'player_2_Betclic',
                               'player_1_Betfan', 'player_2_Betfan',
                               'player_1_Pzbuk', 'player_2_Pzbuk',
                               'player_1_Lvbet', 'player_2_Lvbet',
                               'player_1_Totolotek', 'player_2_Totolotek'
                              ])

# Retrieve data file for the main page for today's matches, including current scores
headers = {"Accept":  "*/*",
           "Accept-Encoding": "gzip, deflate, br",
           "Accept-Language": "pl,en-US;q=0.7,en;q=0.3",
           "Connection": "keep-alive",
           "Host": "d.flashscore.pl",
           "Referer": "https://d.flashscore.pl/x/feed/proxy-fetch",
           "TE": "Trailers",
           "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
           "x-fsign": "SW9D1eZo"
          }

s = requests.Session()

main_page = s.get('https://d.flashscore.pl/x/feed/f_2_0_2_pl_1', 
                        headers = headers)

# Split the data for the matches (Note: the 1st element of `matches` will not be a match, but we do not worry, 
# because this element will be filtered out in the next step.)
matches = re.split('¬~AA÷', main_page.text)

# It seems that that the ended/cancelled/interrupted matches contain the tag '¬AB÷3', the current ones - '¬AB÷2', 
# and the future ones - '¬AB÷1'. We will keep only those matches, for which there are active/future bets.
matches = [match for match in matches if '¬AB÷1' in match or '¬AB÷2' in match]

for match in matches:
    
    new_row = {}
    
    # Retrieve match ID that will be used for retriving detailed information for the match (betting odds)
    match_id = re.search('.*?¬AD÷', match).group()[:-4]
    url = 'https://www.flashscore.pl/mecz/' + match_id + '/#zestawienie-kursow/home-away/koniec-meczu'
    
    # Retrieve match time
    # Add 2 hours (time zone difference).
    match_time = re.search('¬AD÷.*?¬ADE÷', match).group()[4:-5]
    match_time = datetime.utcfromtimestamp(int(match_time))
    match_time = match_time + timedelta(hours = 2)
    new_row['match_time'] = match_time.strftime('%d-%m-%Y %H:%M:%S')
    
    # Retrieve current match score from the file (if match not started, NoType object is found)
    player_1_score = re.search('¬AG÷.*?¬OA÷', match)
    player_2_score = re.search('¬AH÷.*?¬OB÷', match)
    
    # Strip the score from unnecesary symbols and split it into sets, games and points
    if player_1_score:
        player_1_score = player_1_score.group()[4:-4]
        # (Possibly remove the part of the score accounting for tie-break points.)
        player_1_score = re.sub('¬D.÷\d+', '', player_1_score)
        player_1_score = re.split('¬.*?÷', player_1_score)
        new_row['player_1_score_sets'] = player_1_score[0]
        new_row['player_1_score_games'] = ' '.join(map(str, player_1_score[1:-1]))
        new_row['player_1_score_points'] = player_1_score[-1]

    else:
        new_row['player_1_score_sets'] = 0
        new_row['player_1_score_games'] = ''
        new_row['player_1_score_points'] = ''

    if player_2_score:
        player_2_score = player_2_score.group()[4:-4]
        player_2_score = re.sub('¬D.÷\d+', '', player_2_score)
        player_2_score = re.split('¬.*?÷', player_2_score)
        new_row['player_2_score_sets'] = player_2_score[0]
        new_row['player_2_score_games'] = ' '.join(map(str, player_2_score[1:-1]))
        new_row['player_2_score_points'] = player_2_score[-1]

    else:
        new_row['player_2_score_sets'] = 0
        new_row['player_2_score_games'] = ''
        new_row['player_2_score_points'] = ''
        
    # Retrieve information on who is serving right now. It seems that '¬WC÷1' signifies that it is the 1st player,
    # and '¬WC÷2' - the 2nd player.
    if '¬WC÷1' in match:
        new_row['serving'] = 'player_1'
    elif '¬WC÷2' in match:
        new_row['serving'] = 'player_2'
    else:
        new_row['serving'] = np.nan
        
    try:
        html = s.get(url)
        match_page = BS(html.content, 'html.parser')
        
        # Retrieve the event title from the the match's site
        new_row['event'] = match_page.head.find_all('meta')[5]['content']
        
        # Retrieve players' names
        players = match_page.head.find_all('meta')[4]['content']
        players = re.split(' - ', players)
        new_row['player_1'] = players[0]
        new_row['player_2'] = players[1]
        
        # Retrieve a JS part containing some details on the players
        script = match_page.find_all('script')[1].string[22:-4]
        details = json.loads(script)
        
        details_player_1 = details['participantsData']['home']
        details_player_2 = details['participantsData']['away']
        
        # Retrieve players' nationality
        # In the case of doubles, seprate the information by ' / '.
        new_row['player_1_nationality'] = ' / '.join([player['country'] for player in details_player_1])
        new_row['player_2_nationality'] = ' / '.join([player['country'] for player in details_player_2])

        # Retrieve players' rankings
        # In the case of doubles, seprate the information by ' / '.
        new_row['player_1_rank'] = ' / '.join([player['rank'][1] if player['rank']!=[] else '' for player \
                                               in details_player_1])
        new_row['player_2_rank'] = ' / '.join([player['rank'][1] if player['rank']!=[] else '' for player \
                                               in details_player_2])
        
        # Retrieve links to players' pages on flashscore.pl
        # In the case of doubles, seprate the information by ' / '.
        new_row['player_1_link'] = ' / '.join(['https://www.flashscore.pl' + player['detail_link'] for player \
                                               in details_player_1]) 
        new_row['player_2_link'] = ' / '.join(['https://www.flashscore.pl' + player['detail_link'] for player \
                                               in details_player_2])

        
    except:
        new_row['event'] = np.nan
        # (For players' names we could use another method of retrieving this information, not using the match_page).
        new_row['player_1'] = re.search('¬AE÷.*?¬JA÷', match).group()[4:-4]
        new_row['player_2'] = re.search('¬AF÷.*?¬JB÷', match).group()[4:-4]
        new_row['player_1_nationality'] = np.nan
        new_row['player_2_nationality'] = np.nan
        new_row['player_1_rank'] = np.nan
        new_row['player_2_rank'] = np.nan
        new_row['player_1_link'] = np.nan
        new_row['player_2_link'] = np.nan
        
        
    try:
        # Retrieve a file including betting odds from the match's site
        match_odds = s.get('https://d.flashscore.pl/x/feed/df_od_1_' + match_id, headers = headers)

        # Retrieve the part of data including end of match betting odds
        match_odds = re.search('(home-away).*?(Set 1)', match_odds.text).group()
        
        # It seems that invalid (crossed-out) odds are those that end up with '¬OG÷0', and the valid ones
        # end up with '¬OG÷1'.
        match_odds = re.findall('¬OD÷.*?¬OG÷.', match_odds)
        valid_odds = ''.join([o for o in match_odds if o[-1]=='1'])

        # Retrieve the lists of the bookmakers' names and the odds
        bookmakers = re.findall('¬OD.*?¬OPI', valid_odds)
        player_1_odds = re.findall('¬XB÷.*?¬XC', valid_odds) 
        player_2_odds = re.findall('¬XC÷.*?¬OG', valid_odds)
        
        # Strip the bookmakers names from unnecessary symbols
        bookmakers = [i[4:-4] for i in bookmakers]
        
        # Strip the odds data from unnecessary symbols, split on the last change of the odds, 
        # and take the last value
        player_1_odds = [re.split('\[.\]', i[4:-3])[-1] for i in player_1_odds]
        player_2_odds = [re.split('\[.\]', i[4:-3])[-1] for i in player_2_odds]
        
        # Assign betting odds for 7 possible bookmakers, for both players.
        # (Regex is used to produce an appropriate form of a variable (column name).)
        for bookmaker in ['eFortuna.pl', 'STS.pl', 'Betclic.pl', 'Betfan.pl', 
                          'Lvbet.pl', 'Pzbukpl', 'Totolotek.pl']:
            
            try:
                new_row['player_1_' + re.search('\w*(?=[\.p])', bookmaker).group()] = \
                player_1_odds[bookmakers.index(bookmaker)]
            except:
                new_row['player_1_' + re.search('\w*(?=[\.p])', bookmaker).group()] = np.nan

            try:
                new_row['player_2_' + re.search('\w*(?=[\.p])', bookmaker).group()] = \
                player_2_odds[bookmakers.index(bookmaker)]
            except:
                new_row['player_2_' + re.search('\w*(?=[\.p])', bookmaker).group()] = np.nan
            
    except:
        
        for bookmaker in ['eFortuna.pl', 'STS.pl', 'Betclic.pl', 'Betfan.pl', 
                          'Lvbet.pl', 'Pzbukpl', 'Totolotek.pl']:
            
            new_row['player_1_' + re.search('\w*(?=[\.p])', bookmaker).group()] = np.nan
            new_row['player_2_' + re.search('\w*(?=[\.p])', bookmaker).group()] = np.nan
            
    data = data.append(new_row, ignore_index = True)

scraping_time = datetime.now()

In [21]:
data

Unnamed: 0,event,match_time,player_1,player_2,player_1_score_sets,player_1_score_games,player_1_score_points,player_2_score_sets,player_2_score_games,player_2_score_points,serving,player_1_nationality,player_2_nationality,player_1_rank,player_2_rank,...,player_2_link,player_1_eFortuna,player_2_eFortuna,player_1_STS,player_2_STS,player_1_Betclic,player_2_Betclic,player_1_Betfan,player_2_Betfan,player_1_Pzbuk,player_2_Pzbuk,player_1_Lvbet,player_2_Lvbet,player_1_Totolotek,player_2_Totolotek
0,"ATP - SINGIEL: Estoril (Portugalia), ziemna - ...",29-04-2021 11:30:00,Pedro Martinez,Cameron Norrie,0,,,0,,,,Esp,Gbr,100,50,...,https://www.flashscore.pl/zawodnik/norrie-came...,2.29,1.65,2.2,1.65,2.29,1.63,2.33,1.6,2.35,1.62,2.2,1.58,2.11,1.65
1,"ATP - SINGIEL: Estoril (Portugalia), ziemna - ...",29-04-2021 14:00:00,Richard Gasquet,Cristian Garin,0,,,0,,,,Fra,Chi,52,22,...,https://www.flashscore.pl/zawodnik/garin-crist...,3.55,1.31,3.4,1.3,3.55,1.3,3.65,1.28,3.6,1.29,3.3,1.29,3.41,1.27
2,"ATP - SINGIEL: Estoril (Portugalia), ziemna - ...",29-04-2021 15:30:00,Denis Shapovalov,Corentin Moutet,0,,,0,,,,Can,Fra,14,73,...,https://www.flashscore.pl/zawodnik/moutet-core...,1.45,2.8,1.45,2.7,1.45,2.8,1.43,2.84,1.45,2.8,1.42,2.65,1.44,2.59
3,"ATP - SINGIEL: Estoril (Portugalia), ziemna - ...",29-04-2021 17:00:00,Pierre-Hugues Herbert,Albert Ramos-Vinolas,0,,,0,,,,Fra,Esp,86,46,...,https://www.flashscore.pl/zawodnik/ramos-vinol...,2.75,1.46,2.7,1.45,2.75,1.46,2.81,1.44,2.75,1.45,2.65,1.42,2.62,1.43
4,"ATP - SINGIEL: Monachium (Niemcy), ziemna - 1/...",29-04-2021 11:00:00,Filip Krajinovic,Yannick Hanfmann,0,,,0,,,,Srb,Ger,36,98,...,https://www.flashscore.pl/zawodnik/hanfmann-ya...,1.48,2.75,1.47,2.65,1.46,2.75,1.44,2.8,1.45,2.75,1.43,2.6,1.43,2.62
5,"ATP - SINGIEL: Monachium (Niemcy), ziemna - 1/...",29-04-2021 12:30:00,Federico Coria,Norbert Gombos,0,,,0,,,,Arg,Svk,90,95,...,https://www.flashscore.pl/zawodnik/gombos-norb...,2.03,1.8,2.0,1.75,2.07,1.76,2.1,1.73,2.05,1.8,1.98,1.74,1.96,1.75
6,"ATP - SINGIEL: Monachium (Niemcy), ziemna - 1/...",29-04-2021 12:30:00,Dominik Koepfer,Jan-Lennard Struff,0,,,0,,,,Ger,Ger,54,44,...,https://www.flashscore.pl/zawodnik/struff-jan-...,2.06,1.66,2.25,1.65,2.3,1.62,2.35,1.6,2.25,1.66,2.2,1.6,2.25,1.57
7,"ATP - SINGIEL: Monachium (Niemcy), ziemna - 1/...",29-04-2021 14:00:00,Nikoloz Basilaszwili,Daniel Elahi Galan Riveros,0,,,0,,,,Geo,Col,35,115,...,https://www.flashscore.pl/zawodnik/galan-river...,1.8,2.04,1.8,1.95,1.79,2.04,1.76,2.07,1.8,2.05,1.75,1.97,1.78,1.93
8,"WTA - SINGIEL: Madryt (Hiszpania), ziemna - 1/...",29-04-2021 11:00:00,Victoria Jimenez Kasintseva,Kiki Bertens,0,,,0,,,,And,Ned,901,10,...,https://www.flashscore.pl/zawodnik/bertens-kik...,8.8,1.07,8.8,1.06,9.6,1.06,10.5,1.04,10.25,1.05,8.2,1.05,7.77,1.05
9,"WTA - SINGIEL: Madryt (Hiszpania), ziemna - 1/...",29-04-2021 11:00:00,Angelique Kerber,Marketa Vondrousova,0,,,0,,,,Ger,Cze,26,21,...,https://www.flashscore.pl/zawodnik/vondrousova...,2.9,1.42,2.8,1.42,2.9,1.42,2.97,1.4,2.8,1.46,2.7,1.41,2.65,1.42


In [22]:
data.dtypes

event                    object
match_time               object
player_1                 object
player_2                 object
player_1_score_sets      object
player_1_score_games     object
player_1_score_points    object
player_2_score_sets      object
player_2_score_games     object
player_2_score_points    object
serving                  object
player_1_nationality     object
player_2_nationality     object
player_1_rank            object
player_2_rank            object
player_1_link            object
player_2_link            object
player_1_eFortuna        object
player_2_eFortuna        object
player_1_STS             object
player_2_STS             object
player_1_Betclic         object
player_2_Betclic         object
player_1_Betfan          object
player_2_Betfan          object
player_1_Pzbuk           object
player_2_Pzbuk           object
player_1_Lvbet           object
player_2_Lvbet           object
player_1_Totolotek       object
player_2_Totolotek       object
dtype: o

In [23]:
# Hand-pick non-numeric columns
character_columns = ['event', 'match_time', 'player_1', 'player_2', 'player_1_score_games', 'player_2_score_games',
                    'player_1_score_points', 'player_2_score_points', 'player_1_nationality',
                    'player_2_nationality', 'player_1_rank', 'player_2_rank', 'player_1_link', 'player_2_link']

# Note that `player_1_score_games` (and `player_2_score_games`) are not integers, because it contains games won
# in the subsequent sets, separated by a space, and `player_1_score_points` (and `player_2_score_points`) is not
# an integer column either, because it may contain 'A' (advantage).
# `player_1_rank` and `player_2_rank` are also not integers, since for doubles they contain an expression 'x / y'.

# Convert numberic (float/integer) columns to their proper types
for col in data.drop(character_columns, axis = 1).columns:
    data[col] = pd.to_numeric(data[col], downcast = 'integer')

In [24]:
data.dtypes

event                     object
match_time                object
player_1                  object
player_2                  object
player_1_score_sets         int8
player_1_score_games      object
player_1_score_points     object
player_2_score_sets         int8
player_2_score_games      object
player_2_score_points     object
serving                  float64
player_1_nationality      object
player_2_nationality      object
player_1_rank             object
player_2_rank             object
player_1_link             object
player_2_link             object
player_1_eFortuna        float64
player_2_eFortuna        float64
player_1_STS             float64
player_2_STS             float64
player_1_Betclic         float64
player_2_Betclic         float64
player_1_Betfan          float64
player_2_Betfan          float64
player_1_Pzbuk           float64
player_2_Pzbuk           float64
player_1_Lvbet           float64
player_2_Lvbet           float64
player_1_Totolotek       float64
player_2_T

In [25]:
# Export the data, naming the file after the time scraping was finished
scraping_time = scraping_time.strftime("%d%m%y_%H%M%S")
data.to_csv('data_' + scraping_time + '.csv', index = False)

In [None]:
# Note: the retrieved match result would not be correct for the matches that are finished. However, since we are 
# interested in the matches that one can bet, we filter out the ended matches. 

In [None]:
# Co nie jest jeszcze do końca dobrze:
# - nationality nie jest łapane w deblach (stąd ' / '). Nie występuje po prostu w htmlu spotkania w przypadku debli
# (tak jak w przypadku singli występuje). Można by wchodzić na strony poszczególnych zawodników i pobierać 
# narodowość stamtąd, ale to by isottnie wydłużyło czas scrapowania, a nationality nie jest kluczową zmienną.

In [None]:
# Co powinno być w końcowym pliku:

# - ogólny rozkład kursów, statystyki makro (max kursy, min kursy, rozkład kursów, rozkład w zależności
# od rożnych zmiennych - np. event, debel/singiel, różnica rankingu, bukmacher; rozkład "opłacalności"* zakładów)
# - najlepsze zakłady według 2 kryteriów opłacalności (ile, na kogo, u jakiego bukmachera)
# - wykres trade-off: expected_return i z jaką stratą by się wiązał w prypadku najlepszego zakładu
# (dla expected return 1%, 2%, 3%, 4%, 5%, 10%, 15%, 20%, 25%, 30%, 40%, 50%)

# *opłacalność - albo max. zysk z wygranej faworyta w przypadku ustalonej straty w przypadku wygranej 2. strony,
# albo min. strata w przypadku przegranej faworyta przy ustalonym zysku w przypadku wygranej faworyta.

# Jako parametry raportu:
# - możliwość zadeklarowania kwoty
# - możliwość zadeklarowania min. poziomu zysku w przypadku powodzenia
# - możliwość zadeklarowania max. poziomu straty w przypadu braku powodzenia
# - możliwość wyboru, czy mecz ma być przyszły, czy już w trakcie

# Inne hipotezy:
# Czy im bliższe kursy, tym lepsza opłacalność? Czy może odwrotnie? (Dodać zm. proc. różnica kursów i wyplotować
# kontra "opłacalność")
# Faworyt bukmachera to ten, kto ma niższy kurs. Faworyt obstawiającego to ten, który jeśli wygra, to obstawiający
# wygra >100%. Czy niezgodność faworytów bukmachera i obstawiającego wiąże się z lepszą "opłacalnością" zakładu?
# (Myślę, że tak, ale można policzyć różnice i wyplotować)

# Inne pomysły:
# - analiza "opłacalności" (zysk/ryzyko) zakładów dla konkretnych bukmacherów, bez zakładów między różnymi 
# bukmacherami) - o ile gorzej jest, kiedy trzymamy się tylko jednego bukmachera (wykres, np. linie trade-off dla
# wszystkich bukmacherów z osobna i dla przypadku, kiedy można ich mieszać)