# Data Acquisition - Scraping OP.gg

### Import Libraries

In [3]:
from bs4 import BeautifulSoup
import requests
from itertools import product
import json
import os
import time
import re

import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException


### Static Variables

In [2]:
TIERS = ['all', 'challenger',
        'master',
        'diamond',
        'emerald',
        'platinum',
        'gold',
        'silver',
        'bronze',
        'iron']
POSITIONS = ['top', 'jungle', 'mid','adc', 'support']

ROOT_PATH_DATA = 'datasets'

ROOT_URL = 'op.gg/champions?'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

### Starting Browser

In [3]:
# Configura le opzioni del browser
chrome_options = Options()
#chrome_options.add_argument("--headless")  # Esegui il browser in modalità headless (senza interfaccia grafica)
chrome_options.add_argument("--start-maximized")
# Avvia il browser
driver = webdriver.Chrome(options=chrome_options)

In [4]:
def get_champion_root_url(champion_name, lane):
    return f'https://www.op.gg/champions/{champion_name}/build/{lane}?'

In [5]:
def get_complete_url(root_url, **params):
    if not root_url.startswith(('http://', 'https://')):
        root_url = 'http://' + root_url  # Aggiunge il protocollo se non è presente

    # Costruisce i parametri dell'URL
    param_string = '&'.join([f'{key}={value}' for key, value in params.items()])

    return f'{root_url}{param_string}'

### Scraping Functions

In [6]:
def get_soup_object(url):
        global driver    
        try:
            # Esegui la richiesta HTTP con Selenium
            driver.set_page_load_timeout(15)
            print("Chiamata Driver")
            driver.get(url)
            
            print("Cerco elemento con i counters")
            # Attendi fino a 10 secondi per il caricamento completo della pagina
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@class='css-1kwzo20 e9r595k2']"))
            )
            
            print("Trovati gli elementi")       
            # Ottieni il contenuto HTML dopo il completamento del caricamento
            html_content = driver.page_source
            
            print("Ricavo gli html")
            # Utilizza BeautifulSoup per analizzare il contenuto HTML
            soup = BeautifulSoup(html_content, "html.parser")
            
            print("Richiesta SUCCESSO:", url)
            return soup
        except TimeoutException:
            print("Timeout durante il caricamento della pagina:", url)
            return None
        except WebDriverException as e:
        # Cattura un errore HTTP 429 (Too Many Requests)
            print(f"ERRORE WEBDRIVE {str(e)}. Tentativo: ")
            return None
        
        except Exception as e:
            print("Richiesta FAILED: ", url)
            print("Errore durante l'esecuzione di Selenium:", e)
            return None


In [9]:
def trova_target_champion(href):
    # Definisci il pattern regex per trovare la parola dopo "target_champion"
    pattern = r'target_champion=([a-zA-Z0-9_]+)'

    # Cerca il pattern nella stringa URL
    match = re.search(pattern, href)

    # Restituisci la parola dopo "target_champion" se trovata, altrimenti None
    if match:
        return match.group(1)
    else:
        return None

In [10]:
def pulisci_stringa(input_string):
    # Rimuovi caratteri speciali ('', punto e spazio vuoto) e converti tutto in minuscolo
    cleaned_string = re.sub(r"['. ]", "", input_string.lower())
    return cleaned_string

In [11]:
def get_rows_elements(soup): 
    grade = None
    try:
        grade = soup.find('div', class_ ='tier-icon').img['alt']
    except:
        print('Grade non trovato')



    counters_soup_list = soup.aside.findAll('div', recursive=False)[1].findAll('ul')[0].findAll('li')
    counter = []
    win_rate_weak = []
    games_played_weak = []

    for counter_soup in counters_soup_list:
        counter.append(trova_target_champion(counter_soup.a['href']))
        win_rate_weak.append(counter_soup.div.text.replace('%', ''))
        games_played_weak.append(re.search(r'(\d+(,\d+)?)', counter_soup.find('div', class_ ='play').get_text()).group(1).replace(',', ''))


    strong_soup_list = soup.aside.findAll('div', recursive=False)[1].findAll('ul')[1].findAll('li')
    strong = []
    win_rate_strong = []
    games_played_strong = []

    for strong_soup in strong_soup_list:
        strong.append(trova_target_champion(strong_soup.a['href']))
        win_rate_strong.append(strong_soup.div.text.replace('%', ''))
        games_played_strong.append(re.search(r'(\d+(,\d+)?)', strong_soup.find('div', class_ ='play').get_text()).group(1).replace(',', ''))

    #qualche check
    if len(strong) < 5:
        print("Problemi con STRONG: ", strong)
    
    if (len(counter)) < 5:
        print("PRoblemi con WEAK: ", counter)

    dict_with_elements = {
        'grade': grade,
        'counter': counter,
        'win_rate_weak': win_rate_weak,
        'games_played_weak': games_played_weak,
        'strong': strong,
        'win_rate_strong': win_rate_strong,
        'games_played_strong': games_played_strong
    }

    return dict_with_elements

In [12]:
def scrape_counter_champion(champion_name, tier, lane):
    global driver
    print("==============================================================")
    print(f'{champion_name} - {tier} - {lane}')
    champion_name_cleaned = pulisci_stringa(champion_name)
    root_champion = get_champion_root_url(champion_name_cleaned, lane)
    complete_url = get_complete_url(root_champion,tier = tier)

    n_attempts = 0

    while n_attempts < 3:
        n_attempts+=1
        soup = get_soup_object(complete_url)
        if soup:
            n_attempts = 3
        else:
            driver.quit()  
            time.sleep(60)
            driver = webdriver.Chrome(options=chrome_options)
            
            
            

    if soup:
        dict_with_elements = get_rows_elements(soup)
        
        dict_with_elements['champion_name'] = champion_name_cleaned
        dict_with_elements['tier'] = tier
        dict_with_elements['lane'] = lane
    else:
        print("PROBLEMI DI CONNESSIONE ")
        return None

    return dict_with_elements
    

### Load Lolalytics Dataframe

In [13]:
champions_lolalytics = pd.read_csv('datasets/champions_lolalytics.csv')
champions_lolalytics

Unnamed: 0,rank,champion_name,tier,lane,win_rate,pick_rate,ban_rate,pbi,games_played,tier_rank
0,1,Jax,S+,top,50.20,10.21,19.44,3,1737071,all
1,2,Fiora,S+,top,51.00,4.89,7.75,5,832573,all
2,3,Poppy,S,top,50.71,1.13,2.30,1,192796,all
3,4,Singed,S,top,52.80,1.53,0.48,4,260712,all
4,5,Zac,S,top,52.52,0.50,1.29,1,84558,all
...,...,...,...,...,...,...,...,...,...,...
3651,100,Anivia,D,support,36.99,0.43,0.79,-3,3393,iron
3652,101,Lulu,D-,support,40.67,2.45,0.99,-9,19551,iron
3653,102,Yuumi,D-,support,38.84,7.27,3.04,-40,57948,iron
3654,103,Sivir,D-,support,23.92,0.11,0.93,-2,899,iron


Wukong and Renata are 2 champions who are written differently in the URL

In [14]:
mapping_lane = {
    'middle': 'mid',
    'bottom': 'adc'
}
champions_lolalytics['lane'] = champions_lolalytics['lane'].replace(mapping_lane)

mapping_champion = {
    'Wukong': 'Monkey king',
    'Renata Glasc': 'renata'
}

champions_lolalytics['champion_name'] = champions_lolalytics['champion_name'].replace(mapping_champion)
champions_lolalytics['champion_name'].unique()

array(['Jax', 'Fiora', 'Poppy', 'Singed', 'Zac', 'Ornn', 'Cassiopeia',
       'Illaoi', 'Olaf', 'Rengar', 'Camille', 'Akshan', 'Vayne', 'Nasus',
       'Quinn', 'Urgot', 'Kled', 'Garen', 'Swain', 'Kayle', 'Malphite',
       'Sylas', 'Tahm Kench', 'Maokai', 'Trundle', 'Teemo', 'Tryndamere',
       'Warwick', 'Riven', 'Fiddlesticks', 'Dr. Mundo', 'Neeko', 'Udyr',
       'Zed', 'Shen', 'Aatrox', 'Mordekaiser', 'Azir', 'Aurelion Sol',
       'Jayce', 'Varus', 'Darius', "Cho'Gath", "K'Sante", 'Lillia',
       'Irelia', 'Qiyana', 'Yorick', 'Anivia', 'Kassadin', 'LeBlanc',
       'Naafiri', 'Vladimir', 'Viktor', 'Heimerdinger', 'Galio',
       'Renekton', 'Karthus', 'Kennen', 'Gwen', 'Annie', 'Ryze',
       "Rek'Sai", 'Sett', 'Gragas', 'Gnar', 'Akali', 'Rumble', 'Volibear',
       'Rammus', 'Sion', 'Malzahar', 'Yasuo', 'Yone', 'Graves', 'Kayn',
       'Pantheon', 'Lissandra', 'Ahri', 'Gangplank', 'Kalista', 'Ivern',
       'Shaco', 'Monkey king', 'Karma', 'Briar', 'Shyvana', 'Master Yi',
    

### Scraping Execution
Execution time: 330 minutes

In [15]:
num_rows = len(champions_lolalytics)

rows_appended = 0
list_temp = []
rows_failed = []
start_time = time.time()

for index, row in champions_lolalytics.iterrows():
    
    iter_start_time = time.time()
    scraped_info_dict = scrape_counter_champion(row['champion_name'], row['tier_rank'], row['lane'])
    
    if scraped_info_dict:
        print("Appending_dict success: ", scraped_info_dict)
        list_temp.append(scraped_info_dict)
    else:
        row_failed = {
            'champion_name': row['champion_name'],
            'tier_rank': row['tier_rank'],
            'lane': row['lane']
        }
        print("Appending row failed: ",row_failed)
        rows_failed.append(row_failed)
    
    rows_appended+=1
    percentage = round(rows_appended / num_rows * 100, 2)
    with open('list_temp2.json', 'w') as file:
        json.dump(list_temp, file)
    print("Percentage: ", percentage)
    print("Current index: ", index)

    iter_end_time = time.time()

    # Calcola e stampa il tempo trascorso nell'iterazione
    iter_time = iter_end_time - iter_start_time
    print(f"Tempo impiegato nell'iterazione {index}: {iter_time} secondi")



Jax - all - top
Chiamata Driver


Cerco elemento con i counters
Trovati gli elementi
Ricavo gli html
Richiesta SUCCESSO: https://www.op.gg/champions/jax/build/top?tier=all
Appending_dict success:  {'grade': '1', 'counter': ['garen', 'illaoi', 'drmundo', 'singed', 'zac'], 'win_rate_weak': ['44.01', '44.53', '44.85', '44.86', '45.8'], 'games_played_weak': ['100862', '50803', '28349', '14508', '5380'], 'strong': ['irelia', 'yasuo', 'yone', 'volibear', 'warwick'], 'win_rate_strong': ['55.01', '54.88', '54.02', '53.73', '52.95'], 'games_played_strong': ['33167', '19581', '89243', '33202', '9495'], 'champion_name': 'jax', 'tier': 'all', 'lane': 'top'}
Percentage:  0.03
Current index:  0
Tempo impiegato nell'iterazione 0: 2.896667957305908 secondi
Fiora - all - top
Chiamata Driver
Cerco elemento con i counters
Trovati gli elementi
Ricavo gli html
Richiesta SUCCESSO: https://www.op.gg/champions/fiora/build/top?tier=all
Appending_dict success:  {'grade': '2', 'counter': ['warwick', 'kayle', 'malphite', 'monkeyking', 'illaoi'], 

In [16]:
print("Tempo per SCRAPING: ", start_time)

Tempo per SCRAPING:  1703855988.9901698


Save rows for which scraping has failed

In [17]:
with open('rows_failed.json', 'w') as file:
    json.dump(rows_failed, file)

0 rows Failed

### Check

In [4]:
with open('list_temp2.json', 'r') as file:
    list_temp = json.load(file)

list_temp

[{'grade': '1',
  'counter': ['garen', 'illaoi', 'drmundo', 'singed', 'zac'],
  'win_rate_weak': ['44.01', '44.53', '44.85', '44.86', '45.8'],
  'games_played_weak': ['100862', '50803', '28349', '14508', '5380'],
  'strong': ['irelia', 'yasuo', 'yone', 'volibear', 'warwick'],
  'win_rate_strong': ['55.01', '54.88', '54.02', '53.73', '52.95'],
  'games_played_strong': ['33167', '19581', '89243', '33202', '9495'],
  'champion_name': 'jax',
  'tier': 'all',
  'lane': 'top'},
 {'grade': '2',
  'counter': ['warwick', 'kayle', 'malphite', 'monkeyking', 'illaoi'],
  'win_rate_weak': ['46.23', '46.78', '47.01', '47.43', '47.72'],
  'games_played_weak': ['7564', '12532', '27335', '2686', '20090'],
  'strong': ['ksante', 'akali', 'gwen', 'sion', 'sylas'],
  'win_rate_strong': ['55.97', '54.75', '54.5', '54.46', '54.18'],
  'games_played_strong': ['17031', '12587', '14770', '16710', '3721'],
  'champion_name': 'fiora',
  'tier': 'all',
  'lane': 'top'},
 {'grade': '4',
  'counter': ['singed', 'ka

In [5]:
df_list_temp = pd.DataFrame(list_temp)
df_list_temp

Unnamed: 0,grade,counter,win_rate_weak,games_played_weak,strong,win_rate_strong,games_played_strong,champion_name,tier,lane
0,1,"[garen, illaoi, drmundo, singed, zac]","[44.01, 44.53, 44.85, 44.86, 45.8]","[100862, 50803, 28349, 14508, 5380]","[irelia, yasuo, yone, volibear, warwick]","[55.01, 54.88, 54.02, 53.73, 52.95]","[33167, 19581, 89243, 33202, 9495]",jax,all,top
1,2,"[warwick, kayle, malphite, monkeyking, illaoi]","[46.23, 46.78, 47.01, 47.43, 47.72]","[7564, 12532, 27335, 2686, 20090]","[ksante, akali, gwen, sion, sylas]","[55.97, 54.75, 54.5, 54.46, 54.18]","[17031, 12587, 14770, 16710, 3721]",fiora,all,top
2,4,"[singed, kayle, olaf, garen, drmundo]","[45.86, 46.13, 46.16, 46.34, 46.54]","[1378, 3061, 2684, 6752, 1981]","[sylas, gragas, jayce, yasuo, akali]","[57.19, 55.65, 55.36, 54.8, 54.49]","[932, 859, 4801, 2208, 2841]",poppy,all,top
3,2,"[vayne, kayle, urgot, drmundo, trundle]","[46.07, 46.59, 48.82, 49.12, 49.55]","[1973, 5666, 3380, 4127, 12864]","[gragas, yasuo, gwen, akali, zac]","[59.3, 58.36, 57.44, 57.33, 57.23]","[1135, 2971, 4641, 3035, 692]",singed,all,top
4,4,"[singed, vayne, volibear, mordekaiser, kennen]","[42.77, 47.64, 47.9, 48.39, 48.92]","[692, 594, 1885, 2145, 646]","[sylas, malphite, pantheon, tryndamere, yasuo]","[58.61, 57.48, 57.23, 57.13, 56.78]","[633, 3304, 858, 1248, 1032]",zac,all,top
...,...,...,...,...,...,...,...,...,...,...
3651,,"[blitzcrank, nautilus, seraphine, senna, morgana]","[28.21, 33.33, 36.36, 36.76, 41.77]","[39, 51, 55, 68, 79]","[yuumi, brand]","[71.05, 51.35]","[38, 37]",anivia,iron,support
3652,5,"[bard, maokai, veigar, velkoz, brand]","[33.33, 34, 36.36, 36.73, 36.84]","[36, 50, 33, 49, 171]","[milio, renata, pantheon, malphite, soraka]","[57.14, 53.85, 53.03, 52.38, 52.1]","[35, 39, 66, 63, 119]",lulu,iron,support
3653,5,"[shaco, tahmkench, maokai, teemo, rakan]","[31.82, 33.33, 33.83, 35.94, 37.7]","[88, 90, 133, 217, 427]","[hwei, janna]","[53.15, 52.7]","[286, 148]",yuumi,iron,support
3654,,[],[],[],[],[],[],sivir,iron,support


In [6]:
df_list_temp = df_list_temp.drop_duplicates(subset=['champion_name', 'tier', 'lane'])
df_list_temp

Unnamed: 0,grade,counter,win_rate_weak,games_played_weak,strong,win_rate_strong,games_played_strong,champion_name,tier,lane
0,1,"[garen, illaoi, drmundo, singed, zac]","[44.01, 44.53, 44.85, 44.86, 45.8]","[100862, 50803, 28349, 14508, 5380]","[irelia, yasuo, yone, volibear, warwick]","[55.01, 54.88, 54.02, 53.73, 52.95]","[33167, 19581, 89243, 33202, 9495]",jax,all,top
1,2,"[warwick, kayle, malphite, monkeyking, illaoi]","[46.23, 46.78, 47.01, 47.43, 47.72]","[7564, 12532, 27335, 2686, 20090]","[ksante, akali, gwen, sion, sylas]","[55.97, 54.75, 54.5, 54.46, 54.18]","[17031, 12587, 14770, 16710, 3721]",fiora,all,top
2,4,"[singed, kayle, olaf, garen, drmundo]","[45.86, 46.13, 46.16, 46.34, 46.54]","[1378, 3061, 2684, 6752, 1981]","[sylas, gragas, jayce, yasuo, akali]","[57.19, 55.65, 55.36, 54.8, 54.49]","[932, 859, 4801, 2208, 2841]",poppy,all,top
3,2,"[vayne, kayle, urgot, drmundo, trundle]","[46.07, 46.59, 48.82, 49.12, 49.55]","[1973, 5666, 3380, 4127, 12864]","[gragas, yasuo, gwen, akali, zac]","[59.3, 58.36, 57.44, 57.33, 57.23]","[1135, 2971, 4641, 3035, 692]",singed,all,top
4,4,"[singed, vayne, volibear, mordekaiser, kennen]","[42.77, 47.64, 47.9, 48.39, 48.92]","[692, 594, 1885, 2145, 646]","[sylas, malphite, pantheon, tryndamere, yasuo]","[58.61, 57.48, 57.23, 57.13, 56.78]","[633, 3304, 858, 1248, 1032]",zac,all,top
...,...,...,...,...,...,...,...,...,...,...
3651,,"[blitzcrank, nautilus, seraphine, senna, morgana]","[28.21, 33.33, 36.36, 36.76, 41.77]","[39, 51, 55, 68, 79]","[yuumi, brand]","[71.05, 51.35]","[38, 37]",anivia,iron,support
3652,5,"[bard, maokai, veigar, velkoz, brand]","[33.33, 34, 36.36, 36.73, 36.84]","[36, 50, 33, 49, 171]","[milio, renata, pantheon, malphite, soraka]","[57.14, 53.85, 53.03, 52.38, 52.1]","[35, 39, 66, 63, 119]",lulu,iron,support
3653,5,"[shaco, tahmkench, maokai, teemo, rakan]","[31.82, 33.33, 33.83, 35.94, 37.7]","[88, 90, 133, 217, 427]","[hwei, janna]","[53.15, 52.7]","[286, 148]",yuumi,iron,support
3654,,[],[],[],[],[],[],sivir,iron,support


### Data Storage

In [7]:

df_list_temp.to_csv('datasets/counters2.csv', index=False)