In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Scraping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

import csv
import time

In [2]:
# Load environment variables from the local .env file.
load_dotenv('../.env.local')  # Adjust the relative path according to your directory structure

# Get the project path from environment variables
project_path = os.getenv('MY_PROJECT_PATH')
if project_path is None:
    # os.getenv returns None for a missing variable; fail here with a clear
    # message instead of handing None to pandas.
    raise ValueError(
        "MY_PROJECT_PATH is not set; define it in ../.env.local or in the environment"
    )

# Read CSV into DataFrame
df = pd.read_csv(project_path)

In [3]:
# Preview the first rows of the loaded DataFrame as a quick sanity check.
df.head()

Unnamed: 0.1,Unnamed: 0,Rank,Champion Name,Tier,Win rate,Pick Rate,Ban Rate,Matches,gameDuration,allInPings,...,turretTakedowns_y,turretsTakenWithRiftHerald,twentyMinionsIn3SecondsCount,twoWardsOneSweeperCount,unseenRecalls,visionScoreAdvantageLaneOpponent,visionScorePerMinute,wardTakedowns,wardTakedownsBefore20M,wardsGuarded
0,0,1,Swain,S,53.87,12.0,1.7,19216,1711,0,...,7,0,4,0,0,0.731238,1.166228,4,2,1
1,1,1,Swain,S,53.87,12.0,1.7,19216,1987,0,...,1,0,9,0,0,-0.407494,0.594141,1,1,0
2,2,1,Swain,S,53.87,12.0,1.7,19216,1495,0,...,1,0,8,0,0,-0.78205,0.388326,1,0,0
3,3,1,Swain,S,53.87,12.0,1.7,19216,1528,1,...,1,5,6,0,0,-0.055465,0.637202,4,3,0
4,4,1,Swain,S,53.87,12.0,1.7,19216,1155,0,...,0,0,1,0,0,-0.218048,1.941728,1,1,0


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures
import numpy as np  # For NaN
import time

def format_name_and_region(name, region):
    """Return ``(name, region)`` with all spaces removed and letters lower-cased.

    Both values are normalised the same way; the result is a 2-tuple in the
    same order as the arguments.
    """
    def _squash(text):
        # Drop every space character, then lower-case what remains.
        return text.replace(" ", "").lower()

    normalised_name = _squash(name)
    normalised_region = _squash(region)
    return normalised_name, normalised_region

def scrape_data(summonerName, platformId, index, total_rows,
                chrome_binary="C:\\Users\\nicol\\Downloads\\chrome-win64\\chrome-win64\\chrome.exe",
                chromedriver_path="C:\\Users\\nicol\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe",
                page_timeout=10):
    """Scrape the per-champion stats table from a u.gg profile page.

    A headless Chrome instance is started for the call, the champion-stats
    page is loaded, and every ``div.rt-tr-group`` row is parsed into one
    record.

    Args:
        summonerName: Summoner name; spaces are removed and it is lower-cased
            before being placed in the URL.
        platformId: Platform/region identifier, normalised the same way.
        index: Row label of the caller's DataFrame, used only in log messages.
        total_rows: Total number of rows being processed, used only in log
            messages and the time estimate.
        chrome_binary: Path to the Chrome executable to launch.
        chromedriver_path: Path to the chromedriver executable.
        page_timeout: Seconds to wait for the stats rows to appear.

    Returns:
        A DataFrame with columns Champion, WinsLoses, Winrate, KDA,
        KillsDeathsAssists, LP, MaxKills, MaxDeaths, CS, Damage and Gold
        (cells that cannot be located are NaN). An empty DataFrame is returned
        when no rows are found or when any exception occurs; the exception is
        printed rather than raised.
    """
    start_time = time.time()

    # CSS selectors relative to one table row; built once because they do not
    # vary per row. The key order defines the output column order.
    selectors = {
        'Champion': "div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > span:nth-child(2)",
        'WinsLoses': "div:nth-child(1) > div:nth-child(3) > div:nth-child(1) > span:nth-child(3)",
        'Winrate': "div:nth-child(1) > div:nth-child(3) > div:nth-child(1) > strong:nth-child(1)",
        'KDA': "div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > div:nth-child(1) > strong:nth-child(1)",
        'KillsDeathsAssists': "div:nth-child(1) > div:nth-child(4) > div:nth-child(1) > span:nth-child(2)",
        'LP': "div:nth-child(1) > div:nth-child(5) > span:nth-child(1)",
        'MaxKills': "div:nth-child(1) > div:nth-child(6) > span:nth-child(1)",
        'MaxDeaths': "div:nth-child(1) > div:nth-child(7)",
        'CS': "div:nth-child(1) > div:nth-child(8) > span:nth-child(1)",
        'Damage': "div:nth-child(1) > div:nth-child(9) > span:nth-child(1)",
        'Gold': "div:nth-child(1) > div:nth-child(10) > span:nth-child(1)"
    }

    try:
        summonerName, platformId = format_name_and_region(summonerName, platformId)
        url = f"https://u.gg/lol/profile/{platformId}/{summonerName}/champion-stats?"

        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.binary_location = chrome_binary

        service = Service(chromedriver_path)
        driver = webdriver.Chrome(service=service, options=options)
        try:
            driver.get(url)
            wait = WebDriverWait(driver, page_timeout)
            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.rt-tr-group")))
            page_source = driver.page_source
        finally:
            # Always shut the browser down, including on a wait timeout or a
            # navigation error, so failed tasks do not leave Chrome running.
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')
        rows = soup.select("div.rt-tr-group")

        if len(rows) == 0:
            print(f"No data for row {index + 1}")
            return pd.DataFrame()

        data = []
        for row in rows:
            row_data = []
            for selector in selectors.values():
                element = row.select_one(selector)
                # A missing cell becomes NaN so every record has the same width.
                row_data.append(element.get_text(strip=True) if element else np.nan)
            data.append(row_data)

        df_individual = pd.DataFrame(data, columns=list(selectors))

        print(f"Successfully scraped data for row {index + 1}/{total_rows}")

        # NOTE: this multiplies this single task's duration by the number of
        # rows after `index`; it does not account for tasks running in
        # parallel or completing out of order, so treat it as a rough figure.
        elapsed_time = time.time() - start_time
        remaining_time = elapsed_time * (total_rows - (index + 1))
        print(f"Estimated remaining time: {remaining_time:.2f} seconds")

        return df_individual

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller continue.
        print(f"Error in row {index + 1}: {e}")
        return pd.DataFrame()

# Create an empty DataFrame to store the combined data
df_combined = pd.DataFrame()

# Per-task results are collected here and concatenated once at the end;
# calling pd.concat inside the loop would re-copy the accumulated frame on
# every completed task.
scraped_frames = []

# Total number of rows
total_rows = len(df)

# Using ThreadPoolExecutor for concurrency
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    futures = []
    # Only two columns are needed, so iterate them directly with the index
    # labels instead of materialising a Series per row via iterrows().
    for index, summonerName, platformId in zip(df.index, df['summonerName'], df['platformId']):
        futures.append(executor.submit(scrape_data, summonerName, platformId, index, total_rows))

    for future in concurrent.futures.as_completed(futures):
        df_individual = future.result()
        # scrape_data returns an empty frame on failure; skip those.
        if not df_individual.empty:
            scraped_frames.append(df_individual)

# pd.concat raises on an empty list, so keep the empty frame in that case.
if scraped_frames:
    df_combined = pd.concat(scraped_frames, ignore_index=True)

Successfully scraped data for row 3/252379
Estimated remaining time: 4727176.32 seconds
Successfully scraped data for row 10/252379
Estimated remaining time: 4861907.64 seconds
Successfully scraped data for row 18/252379
Estimated remaining time: 4900412.44 seconds
Successfully scraped data for row 11/252379
Estimated remaining time: 5062998.16 seconds
Successfully scraped data for row 5/252379
Estimated remaining time: 5831064.99 seconds
Successfully scraped data for row 2/252379
Estimated remaining time: 5855903.97 seconds
Successfully scraped data for row 20/252379
Estimated remaining time: 5900878.22 seconds
Successfully scraped data for row 9/252379
Estimated remaining time: 6353393.74 seconds
Successfully scraped data for row 14/252379
Estimated remaining time: 6370467.57 seconds
Successfully scraped data for row 13/252379
Estimated remaining time: 6911077.75 seconds
Successfully scraped data for row 17/252379
Estimated remaining time: 6950041.29 seconds
Successfully scraped data