# Data Retrieval

## Game scores


In this part, I retrieve game data from the official website of the French basketball league using Selenium. Note that all Selenium web elements in the code are prefixed with `se_`.

In [43]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

In [53]:
GAME_LIST_URL = "https://www.lnb.fr/fr/pro-a/calendrier-proa-25.html"

# Periods recorded for each game, in playing order: four quarters followed by
# at most three overtimes (further overtimes are not stored).
PERIODS = ['qt_1', 'qt_2', 'qt_3', 'qt_4', 'ot_1', 'ot_2', 'ot_3']
NB_QUARTERS = 4

# Columns of the resulting table / CSV file, in output order.
GAME_COLUMNS = [
    'date',
    'home_team',
    'home_score',
    'away_team',
    'away_score',
    'qt_1_home_score',
    'qt_1_away_score',
    'qt_2_home_score',
    'qt_2_away_score',
    'qt_3_home_score',
    'qt_3_away_score',
    'qt_4_home_score',
    'qt_4_away_score',
    'ot_1_home_score',
    'ot_1_away_score',
    'ot_2_home_score',
    'ot_2_away_score',
    'ot_3_home_score',
    'ot_3_away_score',
]


def _per_period(cumulative):
    """Turn a list of running totals into the points scored in each period."""
    return [
        total - previous
        for previous, total in zip([0] + cumulative[:-1], cumulative)
    ]


# One dict per played game; the DataFrame is built once at the end instead of
# growing it row by row (DataFrame.append no longer exists in pandas >= 2.0).
game_rows = []

# Instantiating the browser
driver = webdriver.Firefox()
try:
    driver.get(GAME_LIST_URL)

    # Retrieving game data: the calendar is split in one tab per month.
    se_months = driver.find_element(
        By.XPATH, "/html/body/div[2]/section/nav/ul"
    ).find_elements(By.TAG_NAME, "li")

    for se_month in se_months:
        # Clicking the tab displays the games of that month.
        se_month.click()
        month = se_month.get_attribute('data-month')
        se_games = driver.find_elements(
            By.XPATH,
            f"/html/body/div[2]/section/section/section[@data-month='{month}']/div",
        )

        for se_game in se_games:
            # Game score
            game_score = se_game.find_element(
                By.XPATH, ".//div[@class='game-result']"
            ).text.split()
            home_score = int(game_score[0])
            away_score = int(game_score[2])

            # Games not played yet are displayed but as 0-0.
            if home_score <= 0 or away_score <= 0:
                continue

            # Game date
            game_date = se_game.find_element(By.XPATH, "./div[1]/span").text
            game_date = pd.to_datetime(game_date[5:15], format="%d.%m.%Y")

            # Teams
            home_team = se_game.find_element(
                By.XPATH, ".//div[@class='team'][1]//span[@class='team-name']"
            ).text
            away_team = se_game.find_element(
                By.XPATH, ".//div[@class='team'][2]//span[@class='team-name']"
            ).text

            # Qts scores: the page shows running totals at the end of each
            # quarter, two quarters per <span>.
            # NOTE(review): the token positions differ between the two spans
            # (0/2/4/6 vs 1/3/5/7); presumably the second span starts with an
            # extra separator token -- confirm against the page markup.
            qt_1_2_scores = se_game.find_element(
                By.XPATH, ".//div[@class='game-quarters']/div/span[1]"
            ).text.split()
            qt_3_4_scores = se_game.find_element(
                By.XPATH, ".//div[@class='game-quarters']/div/span[2]"
            ).text.split()
            home_cumulative = [
                int(qt_1_2_scores[0]),
                int(qt_1_2_scores[4]),
                int(qt_3_4_scores[1]),
                int(qt_3_4_scores[5]),
            ]
            away_cumulative = [
                int(qt_1_2_scores[2]),
                int(qt_1_2_scores[6]),
                int(qt_3_4_scores[3]),
                int(qt_3_4_scores[7]),
            ]

            # OT: find_elements returns an empty list when the game had no
            # overtime, so no exception handling is needed here.
            se_overtimes = se_game.find_elements(
                By.XPATH, ".//div[@class='score-result']/span[@class='over']"
            )
            for se_overtime in se_overtimes[:len(PERIODS) - NB_QUARTERS]:
                ot = se_overtime.text.split()
                home_cumulative.append(int(ot[0]))
                away_cumulative.append(int(ot[2]))

            # Save game. Overtime periods that were not played are simply
            # absent from the row and end up as missing values in the table.
            game_row = {
                'date': game_date,
                'home_team': home_team,
                'home_score': home_score,
                'away_team': away_team,
                'away_score': away_score,
            }
            period_scores = zip(
                PERIODS,
                _per_period(home_cumulative),
                _per_period(away_cumulative),
            )
            for period, home_points, away_points in period_scores:
                game_row[f'{period}_home_score'] = home_points
                game_row[f'{period}_away_score'] = away_points
            game_rows.append(game_row)
finally:
    # Always release the browser and its driver process, even when scraping
    # fails half-way through.
    driver.quit()

games = pd.DataFrame(game_rows, columns=GAME_COLUMNS)

# Overtime columns are mostly empty: the nullable integer dtype keeps the
# scores as integers instead of floats with NaN.
games = games.astype({
    column: 'Int64' for column in GAME_COLUMNS if column.startswith('ot_')
})

games.to_csv('data/game_scores.csv', index=False, encoding="utf-8")