In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import time

In [2]:
from utils.odds import calc_probs

In [3]:
def get_soup(url):
    # Configure Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run Chrome in headless mode
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    chrome_options.add_argument("--start-maximized")  # Start in maximized mode
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # Hide automation
    chrome_options.add_argument('--disable-http2')

     # Add headers via DesiredCapabilities
    caps = DesiredCapabilities.CHROME
    caps["goog:loggingPrefs"] = {"performance": "ALL"}  # Optional: Log network events
    caps["pageLoadStrategy"] = "eager"  # Speeds up loading by waiting less
    chrome_options.headless = False
    # Create a new instance of the Chrome WebDriver
    driver = webdriver.Chrome(options=chrome_options)
    # Navigate to the URL
    print("Preparing soup ...")
    driver.get(url)
    # Retrieve page source code 
    page_source = driver.page_source
    # Parse the page source using BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')
    driver.quit()
    return soup 

In [4]:
def extract_snapshot(soup):
    # Get all "live" Event <a><a> elements
    event_tags = []
    try:
        live_events = soup.select_one('div.Program-styles-module-desktop')
        event_tags = live_events.select('a.EventRow-styles-module-event-row')
    except:
        pass
    print("Tasting soup ...")
    # Extract text content from each anchor tag
    # match time: xth minute
    # event name: "team A_versus_team B"
    # timestamp: 
    # odds: 
    # probabilities:
    # score:
    #columns = ['primary_key', 'name_team_A', 'name_team_B', 'gametime', 'score_team_A', 'score_team_B', 'odd_win_team_A', 'odd_draw', 'odd_win_team_B']
    data = []
    #print(event_tags)
    
    if len(event_tags) != 0:
        for event in event_tags:
            #print("test")
            try:
                team_names= event.select('span.EventTeams-styles-module-team-title')
                #print(team_names)
                score_tags = event.select_one('div.EventScores-styles-module-scores')
                score_list = [char for char in [score.get_text(strip=True) for score in  score_tags][-1]]
                team_names_list = [name.get_text(strip=True) for name in  team_names]
                event_id = str.replace(team_names_list[0] + "_versus_" + team_names_list[1], " ", "_")
                # Get current date and time
                current_datetime = datetime.datetime.now()
                current_date = datetime.date.today()
                current_time = datetime.datetime.now().time().strftime("%H:%M:%S")
                # Format the current datetime as a string
                datetime_string = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
                snapshot_id = event_id + "_time_" + datetime_string 
                #if len(event.select('div.EventDateTime-styles-module-live-date'))> 0:
                game_time_tag = event.select_one('div.EventDateTime-styles-module-info-cell-live')
                #print(game_time_tag)
                game_time = [game_time.get_text(strip=True) for game_time in  game_time_tag][0]
                #else
                odd_tags = event.select('div.EventOddGroup-styles-module-odd-group')
                odds_list = [1.0 if x == "" else float(x.replace(",",".")) for x in [odd.get_text(strip=True) for odd in  odd_tags[0]]]
                #print(odds_list)
                probs_list = calc_probs(odds_list)
                #print(probs_list)
                
                row_data = {
                    'primary_key': snapshot_id,
                    'match_key': event_id,
                    'date': current_date,
                    'time_of_day': current_time,
                    'name_team_A': team_names_list[0],
                    'name_team_B': team_names_list[1],
                    'gametime': game_time,
                    'score_team_A': score_list[0],
                    'score_team_B': score_list[1],
                    'odd_win_team_A': odds_list[0],
                    'odd_draw': odds_list[1],
                    'odd_win_team_B': odds_list[2],
                    'prob_win_team_A': probs_list[0],
                    'prob_draw': probs_list[1],
                    'prob_win_team_B': probs_list[2],
                    }
                #print(row_data)
                
                data.append(row_data)
                
            except:
                pass

    df = pd.DataFrame(data)
    return df

In [5]:
url = "https://sports.tipico.de/de/live/default"

soup = get_soup(url)

scraped_data = []

# Define the duration for which you want the script to run (in seconds)
duration =   30 # Run for 1 hour (adjust as needed)

# Start time
start_time = time.time()

# Main loop to scrape data
while time.time() - start_time < duration:
    # Your scraping logic here
    # For example, navigate to a webpage and scrape its content
    soup = get_soup(url)
    # Scraping code...
    # Store the scraped data in a dictionary or list
    snapshot_data = extract_snapshot(soup)
    scraped_data.append(snapshot_data)
    # Sleep for a certain interval before the next iteration
    time.sleep(10)  # Sleep for 10 seconds between iterations

# Close the web driver

# Convert the scraped data to a pandas DataFrame
#df = pd.DataFrame(scraped_data)

op_df = pd.concat(scraped_data, ignore_index=True)

print(op_df)

# Save the DataFrame to a CSV file

op_df.to_csv('scraped_data.csv', index=False)




Preparing soup ...
Preparing soup ...
Tasting soup ...
Preparing soup ...
Tasting soup ...


KeyboardInterrupt: 

In [36]:
print(op_df)

[Empty DataFrame
Columns: []
Index: [], Empty DataFrame
Columns: []
Index: []]
