# Data Extraction

In [1]:
import requests
from urllib.parse import urlencode
import settings
import pandas as pd
from bs4 import BeautifulSoup
import urllib.parse
import time

In [2]:
def summ_puuid_extraction(n):
    summoner_dict = {}  # Initialize the dictionary outside the loop
    
    for i in range(1, n + 1):
        url = f'https://www.op.gg/leaderboards/tier?region=lan&type=ladder&page={i}&tier=diamond'
        response = requests.get(url)
        # Print the response for each page
        if response.status_code == 200:
            print(f"Successfully fetched the page {i}")
        else:
            print(f"Failed to fetch the page {i}, status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        links = soup.find_all('a')  # Find all <a> tags, which are links
        urls = [link.get('href') for link in links if link.get('href')]

        # Extract summoner URLs and decode them
        summoner_urls = [url for url in urls if '/summoners/lan/' in url]
        summoner_names = [url.split('/summoners/lan/')[1] for url in summoner_urls]
        summoner_names = [urllib.parse.unquote(name) for name in summoner_names]

        # Populate the dictionary with summoner names and taglines
        for name in summoner_names:
            if '-' in name:
                summoner, tagline = name.rsplit('-', 1)  # Split at the last occurrence of '-'
                summoner_dict[summoner.strip()] = tagline.strip()
            else:
                summoner_dict[name.strip()] = "LAN"  # Default tagline for names without taglines

    return summoner_dict


In [3]:
summoner_dict=summ_puuid_extraction(2)

Successfully fetched the page 1
Successfully fetched the page 2


In [4]:
summoner_dict

{'McLovin Musulmán': '6382',
 'Víner33': 'LAN',
 'mind control': 'geass',
 'Lunelle': 'Sejin',
 'xDarkrastarx': 'LAN1',
 'Renyi': 'R10',
 'jjk': 'LAN',
 'Tekiku': 'LAN',
 'DannyDk': 'LAN',
 'Catcissa': 'owo',
 'Vanitas': 'PSM',
 'FIosd': 'LAN',
 'Alex': 'Chall',
 'Vísion': 'LAN',
 'Virick': '1108',
 'Fluttershy': 'Kpop',
 'FlexAle': '12345',
 'MaJo Eulog': '2021',
 'TwTv 21souls': '2121',
 'Blue Änd Red': 'LAN',
 'Melgem': 'CLN',
 'JoyBoyƒ': '1998',
 'Nolan Grayson': 'Sweg',
 'Ioser': 'LAN',
 'D4niel': 'LAN',
 'VI7': 'LAN',
 'Thankfhort': 'LAN',
 'JarchisParchis': 'LAN',
 'Stitch': 'ali69',
 'Taquit0 on Fire': 'LAN',
 'Halloween': 'LAN',
 'Boing de Consome': 'LAN',
 'Phoebe Buffay': 'sunxi',
 'Chu 可愛くてごめん': '可愛い',
 'Voltaic Dux': 'SMURF',
 'kupperto': 'LAN',
 'SirChubby': 'LAN',
 'CatWell': 'Dry',
 'Enxyy': 'LAN',
 'DLuXe': 'LAN',
 'Prowled': 'Crow',
 'MordeKaiserion': 'MDKSR',
 'Tonantzin': 'LAN',
 'Juancho Rois': '1002',
 'Grey': 'luv',
 'Mr Boulsack': '4704',
 'Dacrax': 'LAN',
 'te 

In [5]:
print(len(summoner_dict))

200


The next block is focused in adding the PUUID into the existing dictionary and obtaining the match history ids. 

In [6]:
def get_puuids(data_dict):

    puuid_list = []  # Initialize an empty list to store puuid values
    request_count_per_second = 0  # Track the number of requests made
    request_count_per_2min = 0
    start_time = time.time()  # Record the start time for rate-limiting checks

    for summoner, tagline in data_dict.items():
        # Rate limiting: Ensure no more than 20 requests per second
        if request_count_per_second >= 20:
            time_elapsed = time.time() - start_time
            if time_elapsed < 1:  # Less than a second has passed
                time.sleep(1 - time_elapsed)  # Wait for the remaining time
            start_time = time.time()  # Reset the start time
            request_count_per_second = 0  # Reset the request counter

        # Rate limiting: Ensure no more than 100 requests every 2 minutes
        if request_count_per_2min >= 100:
            time_elapsed = time.time() - start_time
            if time_elapsed < 120:  # Less than 2 minutes have passed
                time.sleep(120 - time_elapsed)  # Wait for the remaining time
            start_time = time.time()  # Reset the start time
            request_count_per_2min = 0  # Reset the request counter

        try:
            # Construct the URL using the summoner name and tagline
            url = f'https://americas.api.riotgames.com/riot/account/v1/accounts/by-riot-id/{summoner}/{tagline}?api_key={settings.API_KEY}'
            
            # Fetch data from the URL
            response = requests.get(url)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
            
            # Extract 'puuid' from the response JSON
            fetched_data = response.json()
            puuid = fetched_data.get("puuid")
            
            # Add the 'puuid' to the list if it exists
            if puuid:
                puuid_list.append(puuid)
            
            request_count_per_second += 1  # Increment the request count
            request_count_per_2min += 1  # Increment the request count

        except requests.RequestException as e:
            # Extract the error number
            error_message = str(e).split()
            error_num = error_message[0] if len(error_message) > 0 else "Unknown"

            print(f"Error fetching data for {summoner} (tagline:{tagline}): {error_num}")

            # If error number is 429, pause for 2 minutes
            if error_num == "429":
                print("Rate limit reached. Pausing for 2 minutes...")
                time.sleep(120)  # Wait for 2 minutes

    return puuid_list


In [8]:
dict_prueba={'LK7%20Mr%20Fuentes':'LAN'}
lista_prueba= get_puuids(dict_prueba)
lista_prueba

['qIc0d3YQSvxrrWC4XXSvK0MLy9-A0X2LYewXkpORTsunZnEjVVNx9HyV-pUYYtyRaOe_d2XKTgaHbw']

In [9]:
puuids= get_puuids(summoner_dict)

Error fetching data for TwTv 21souls (tagline:2121): 404
Error fetching data for Grey (tagline:luv): 404
Error fetching data for VoiD Axel Rous (tagline:LAN): 429
Rate limit reached. Pausing for 2 minutes...
Error fetching data for 404TopNotFound (tagline:ERROR): 404
Error fetching data for TW JH Ars Magnus (tagline:LAN): 404


In [10]:
def get_matches(puuid_list):

    match_list = []  # Initialize an empty list to store match id values
    request_count_per_second = 0  # Track the number of requests made
    request_count_per_2min = 0
    start_time = time.time()  # Record the start time for rate-limiting checks

    for puuid in puuid_list:
        # Rate limiting: Ensure no more than 20 requests per second
        if request_count_per_second >= 20:
            time_elapsed = time.time() - start_time
            if time_elapsed < 1:  # Less than a second has passed
                time.sleep(1 - time_elapsed)  # Wait for the remaining time
            start_time = time.time()  # Reset the start time
            request_count_per_second = 0  # Reset the request counter

        # Rate limiting: Ensure no more than 100 requests every 2 minutes
        if request_count_per_2min >= 100:
            time_elapsed = time.time() - start_time
            if time_elapsed < 120:  # Less than 2 minutes have passed
                time.sleep(120 - time_elapsed)  # Wait for the remaining time
            start_time = time.time()  # Reset the start time
            request_count_per_2min = 0  # Reset the request counter


        try:
            # Construct the URL using the summoner name and tagline
            url = f'https://americas.api.riotgames.com/lol/match/v5/matches/by-puuid/{puuid}/ids?start=0&count=20&api_key={settings.API_KEY}'
            
            # Fetch data from the URL
            response = requests.get(url)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
            
            # Extract 'puuid' from the response JSON
            match_ids = response.json()
            
            
            # Add the 'puuid' to the list if it exists
            if match_ids:
                match_list.extend(match_ids)
            
            request_count_per_second += 1  # Increment the request count
            request_count_per_2min += 1 # Increment the request count
        except requests.RequestException as e:

              # Extract the error number
            error_message = str(e).split()
            error_num = error_message[0] if len(error_message) > 0 else "Unknown"

            print(f"Error fetching data for puuid {puuid} : {error_num}")

            # If error number is 429, pause for 2 minutes
            if error_num == "429":
                print("Rate limit reached. Pausing for 2 minutes...")
                time.sleep(120)  # Wait for 2 minutes


    return match_list

In [11]:
list_match_prueba= get_matches(lista_prueba)
list_match_prueba

['LA1_1578542376',
 'LA1_1578427839',
 'LA1_1578419356',
 'LA1_1578274283',
 'LA1_1578252563',
 'LA1_1578236621',
 'LA1_1577941878',
 'LA1_1577929393',
 'LA1_1577360135',
 'LA1_1577355054',
 'LA1_1577277049',
 'LA1_1577255380',
 'LA1_1577175838',
 'LA1_1577160459',
 'LA1_1577012840',
 'LA1_1577009130',
 'LA1_1576997416',
 'LA1_1576979172',
 'LA1_1576959633',
 'LA1_1576639985']

In [12]:
match_list=get_matches(puuids)

Error fetching data for puuid 2MeQ-yF32YFa57gorj4T45pDGe_58bw5NpmzCO4DXX41IQD8pvMuBb3FxtQOumR0k_oI7fi6t5vlZw : 429
Rate limit reached. Pausing for 2 minutes...


In [15]:
len(match_list)

3880

In [16]:
def get_matches_info(match_id_list):

    df=pd.DataFrame()
    request_count_per_second = 0  # Track the number of requests made
    request_count_per_2min = 0
    start_time = time.time()  # Record the start time for rate-limiting checks

    for match in match_id_list:
        # Rate limiting: Ensure no more than 20 requests per second
        if request_count_per_second >= 20:
            time_elapsed = time.time() - start_time
            if time_elapsed < 1:  # Less than a second has passed
                time.sleep(1 - time_elapsed)  # Wait for the remaining time
            start_time = time.time()  # Reset the start time
            request_count_per_second = 0  # Reset the request counter

        # Rate limiting: Ensure no more than 100 requests every 2 minutes
        if request_count_per_2min >= 100:
            time_elapsed = time.time() - start_time
            if time_elapsed < 120:  # Less than 2 minutes have passed
                time.sleep(120 - time_elapsed)  # Wait for the remaining time
            start_time = time.time()  # Reset the start time
            request_count_per_2min = 0  # Reset the request counter

        try:
            # Construct the URL using the summoner name and tagline
            url = f' https://americas.api.riotgames.com/lol/match/v5/matches/{match}?api_key={settings.API_KEY}'
            
            # Fetch data from the URL
            response = requests.get(url)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx and 5xx)
            
            # Extract 'puuid' from the response JSON
            match_dict = response.json()
            match_info = match_dict['info']
            queu_Id=match_dict['info']['queueId']
            participants_df = pd.json_normalize(match_info['participants'])

            # Merge the team and participant details into the main DataFrame
            df = pd.concat([df, participants_df])  
            df ['queu_Id']= queu_Id


            request_count_per_second += 1  # Increment the request count
            request_count_per_2min += 1  # Increment the request count
        except requests.RequestException as e:
            
            # Extract the error number
            error_message = str(e).split()
            error_num = error_message[0] if len(error_message) > 0 else "Unknown"

            print(f"Error fetching data for match {match} : {error_num}")

            # If error number is 429, pause for 2 minutes
            if error_num == "429":
                print("Rate limit reached. Pausing for 2 minutes...")
                time.sleep(120)  # Wait for 2 minutes

    return df

In [17]:
df_prueba=get_matches_info(list_match_prueba)
df_prueba.info()

Error fetching data for match LA1_1577941878 : 429
Rate limit reached. Pausing for 2 minutes...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 190 entries, 0 to 9
Columns: 289 entries, allInPings to challenges.hadAfkTeammate
dtypes: bool(9), float64(38), int64(230), object(12)
memory usage: 418.8+ KB


In [None]:
match_list_cut=match_list[:1000]

In [19]:
print(len(match_list_cut))

1000


In [20]:
raw_df= get_matches_info(match_list_cut)


In [21]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9
Columns: 290 entries, allInPings to challenges.hadAfkTeammate
dtypes: bool(9), float64(39), int64(230), object(12)
memory usage: 21.6+ MB


In [22]:
raw_df.to_csv('./raw.csv')