# Mostly code shared with Scraper.ipynb, slightly modified to scrape only one tournament and predict future matches

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from io import StringIO
from typing import Optional, Union

Constants for webscraping

In [3]:
# Output folder for the CSV written at the end of the notebook (machine-specific absolute path)
folderpath = "G:\\School\\PersonalSVN\\S24\\CS372\\FinalProject\\"

# CSS class names / class prefixes that get_matches uses to pick match data out of the matches page
matches_info_block = "wf-module-item match-item"
matches_info_vs_block = "match-item-vs"
matches_info_team_name = "match-item-vs-team-name"
matches_info_team_score = "match-item-vs-team-score"
matches_info_series_name = "match-item-event-series"

# Stats page: class prefix of the series divs and the attribute holding each series id (used by get_series)
stats_series_class = "wf-tag-btn noselect series-id-"
stats_series_id = "data-subseries-id"

# Match page: classes that turn_name_into_abbreviations uses to find the team abbreviations
abbreviations_td_class = "mod-player"
abbreviations_class = "ge-text-light"

# Event page whose matches and stats are scraped
current_event_link = "https://www.vlr.gg/event/2004/champions-tour-2024-americas-stage-1"


In [4]:
def get_links(event_link: str) -> tuple[str, str]:
  """Derive the stats-page and matches-page links from an event link.

  The event link is split at its first "/event/" segment; "stats/" or
  "matches/" is inserted right after that segment, and the matches link
  additionally gets "?series_id=all" appended.

  Args:
    event_link: Link containing "/event/", e.g.
      "https://www.vlr.gg/event/2004/some-event".

  Returns:
    A (stats_link, matches_link) tuple.

  Raises:
    ValueError: If event_link does not contain "/event/".
  """
  marker = "/event/"
  head, found, tail = event_link.partition(marker)
  if not found:
    # Without this check a missing marker would silently yield nonsense links
    raise ValueError(f"Not an event link (no '{marker}' segment): {event_link!r}")

  prefix = head + marker
  stats_link = prefix + "stats/" + tail
  matches_link = prefix + "matches/" + tail + "?series_id=all"

  return stats_link, matches_link
  
def get_soups(stats_link: str, match_link: str) -> tuple[BeautifulSoup, BeautifulSoup]:
  """Download the stats page and the matches page and parse both.

  The stats page is requested and parsed first, then the matches page.

  Returns:
    A (stats soup, matches soup) tuple, each parsed with 'html.parser'.
  """
  parsed_pages = []
  for url in (stats_link, match_link):
    response = requests.get(url)
    parsed_pages.append(BeautifulSoup(response.content, 'html.parser'))

  stats_soup, matches_soup = parsed_pages
  return stats_soup, matches_soup

In [5]:
def get_matches(matchesSoup: BeautifulSoup) -> pd.DataFrame:
  """Extract one row per match from the parsed matches page.

  Args:
    matchesSoup: Parsed matches page of an event.

  Returns:
    DataFrame with columns "date", "team1", "team2", "winner", "series"
    and "code", one row per match link, numbered 0..n-1 in page order.
    "winner" is 1 when team1's score is higher and 2 otherwise, so a match
    whose two score cells are equal (including equal non-numeric
    placeholders) is reported as 2.
  """
  columns = ["date", "team1", "team2", "winner", "series", "code"]
  rows = []

  # All match info blocks
  match_info_blocks = matchesSoup.find_all("a", class_=lambda x: x and x.startswith(matches_info_block))
  for link in match_info_blocks:
    # The date label is the closest preceding "wf-label mod-large" div
    date = link.parent.find_previous("div", class_="wf-label mod-large").text.strip()
    # Drop the day of the week, then anything after the first line
    date = ", ".join(date.split(", ")[1:])
    date = date.split("\n")[0].strip()

    vs = link.find("div", class_=matches_info_vs_block)

    team_names = vs.find_all("div", class_=matches_info_team_name)
    team1 = team_names[0].text.strip()
    team2 = team_names[1].text.strip()

    scores = vs.find_all("div", class_=lambda x: x and x.startswith(matches_info_team_score))
    score1 = scores[0].text.strip()
    score2 = scores[1].text.strip()

    series = link.find("div", class_=lambda x: x and x.startswith(matches_info_series_name)).text.strip()

    # First path segment of the href; used later to open the match page
    code = link["href"].split("/")[1]

    # Compare numerically when possible: as strings, "13" > "2" is False.
    # Non-numeric score text falls back to plain string comparison.
    if score1.isdecimal() and score2.isdecimal():
      team1_won = int(score1) > int(score2)
    else:
      team1_won = score1 > score2

    rows.append({
      "date": date,
      "team1": team1,
      "team2": team2,
      "winner": 1 if team1_won else 2,
      "series": series,
      "code": code,
    })

  # Build the frame once instead of concatenating inside the loop
  return pd.DataFrame(rows, columns=columns)

In [6]:
def get_series(statsSoup: BeautifulSoup, matches: pd.Series) -> pd.DataFrame:
  """List the series found on the stats page, ordered like the matches.

  Args:
    statsSoup: Parsed stats page of an event.
    matches: Series names of the scraped matches; the order of their first
      appearance defines the sort order of the result.

  Returns:
    DataFrame with columns "name" (ordered categorical) and "id", sorted by
    "name". A name that does not occur in `matches` becomes NaN and is
    sorted last.
  """
  # Divs whose class starts with the series prefix carry a series name and its id
  divs = statsSoup.find_all("div", class_=lambda x: x and x.startswith(stats_series_class))

  records = [
    {"name": div.text.strip(), "id": div[stats_series_id]}
    for div in divs
  ]
  series = pd.DataFrame(records, columns=["name", "id"])

  series["name"] = pd.Categorical(series["name"], categories=matches.unique(), ordered=True)
  return series.sort_values("name")

In [7]:
def get_different_cutoffs_soups(stats_link: str, series: pd.DataFrame) -> dict[str, BeautifulSoup]:
  """Fetch one stats page per series, each excluding that series and all after it.

  For the row at position i, the request goes to
  stats_link + "?exclude=" + the ids from position i to the end joined by ".".

  Returns:
    Dict mapping each series name to the parsed page fetched for it.
  """
  ids = series["id"]
  soups_by_series = {}
  for position, series_name in enumerate(series["name"]):
    excluded = ".".join(ids.iloc[position:])
    response = requests.get(stats_link + "?exclude=" + excluded)
    soups_by_series[series_name] = BeautifulSoup(response.content, 'html.parser')

  return soups_by_series

In [8]:
def get_stats(soup: BeautifulSoup):
  """Turn the stats table of a parsed stats page into a DataFrame.

  The table with class "wf-table mod-stats mod-scroll" is read cell by cell.
  The "Player" cell is split at the newline into "Player" and "Team", those
  two columns are moved to the front, and the "Agents" column is removed.
  With no body rows, an empty "Team" column is added instead of splitting.
  """
  stats_table = soup.find("table", {"class": "wf-table mod-stats mod-scroll"})

  header_cells = stats_table.find("thead").find_all("th")
  body_rows = stats_table.find("tbody").find_all("tr")

  # One list of stripped cell texts per table row
  cell_texts = [
    [cell.text.strip() for cell in body_row.find_all("td")]
    for body_row in body_rows
  ]
  header_names = [header.text for header in header_cells]
  frame = pd.DataFrame(cell_texts, columns=header_names)

  if not len(frame):
    # Nothing to split, but the column must still exist for the reorder below
    frame["Team"] = ""
  else:
    frame[["Player", "Team"]] = frame["Player"].str.split("\n", expand=True)

  leading = ["Player", "Team"]
  trailing = [name for name in frame.columns if name not in leading]
  frame = frame[leading + trailing]

  return frame.drop(columns="Agents")

def get_series_stats(cutoff_soups: dict[str, BeautifulSoup]) -> dict[str, pd.DataFrame]:
  """Parse the stats table of every cutoff page, skipping empty tables.

  Args:
    cutoff_soups: Parsed stats pages keyed by series name.

  Returns:
    Dict with the same keys mapping to the parsed stats DataFrame; keys
    whose table has no rows are left out.
  """
  stats = {}
  for key, soup in cutoff_soups.items():
    stat = get_stats(soup)
    if len(stat) > 0:
      stats[key] = stat
  return stats

In [9]:
def clean_stat(stat: pd.DataFrame) -> pd.DataFrame:
  """Convert the text cells of a stats table into numbers where possible.

  Every column after the first two is inspected through its first non-missing
  value: percentage columns ("75%") lose the "%" and become floats, plain
  non-negative numbers ("1.20", "210") become floats, anything else is left
  as text. Empty strings become NaN. A "CL" column of the form
  "success/attempts" is replaced by float columns "CL Success" and
  "CL Attempts" at the same position.

  Args:
    stat: Stats table whose first two columns are identifying text columns.

  Returns:
    A new, cleaned DataFrame; the input is not modified.
  """
  # Work on strings throughout, with empty cells marked as missing
  stat = stat.astype(str)
  stat = stat.replace("", float("NaN"))

  for column in stat.columns[2:]:
    present = stat[column].dropna()
    if present.empty:
      # Nothing to infer a format from
      continue

    # Sample per column, so a table without any fully populated row still works
    sample = present.iloc[0]
    if "%" in sample:
      stat[column] = stat[column].str.replace("%", "", regex=False).astype(float)
    elif sample.replace(".", "", 1).isdigit():
      stat[column] = stat[column].astype(float)

  # Split CL into success and attempts
  if "CL" in stat.columns:
    # reindex guarantees both halves exist even if no cell contains a "/"
    parts = stat["CL"].str.split("/", expand=True).reindex(columns=[0, 1])
    position = stat.columns.get_loc("CL") + 1
    stat.insert(position, "CL Success", parts[0].astype(float))
    stat.insert(position + 1, "CL Attempts", parts[1].astype(float))
    stat = stat.drop("CL", axis=1)

  return stat

def clean_stats(stats: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
  """Run clean_stat on every table, replacing the entries of `stats` in place.

  Returns:
    The same dict object that was passed in.
  """
  for series_name, frame in list(stats.items()):
    stats[series_name] = clean_stat(frame)
  return stats

In [10]:
def turn_name_into_abbreviations(matches: pd.DataFrame) -> pd.DataFrame:
  """Add "team1_abrev" and "team2_abrev" columns looked up from match pages.

  A match page is only requested when at least one of its two teams has no
  abbreviation recorded yet. The columns are added to `matches` itself, which
  is also returned.
  """
  known = {}
  for _, row in matches.iterrows():
    first_team = row["team1"]
    second_team = row["team2"]
    if first_team in known and second_team in known:
      continue

    response = requests.get("https://www.vlr.gg/" + row["code"])
    page_soup = BeautifulSoup(response.content, 'html.parser')

    # All player cells on the page; the 6th from the end is used for team1 and the last for team2
    player_cells = page_soup.find_all("td", class_=abbreviations_td_class)
    first_cell = player_cells[-6]
    second_cell = player_cells[-1]

    first_abbreviation = first_cell.find('div', class_=abbreviations_class).text.strip()
    second_abbreviation = second_cell.find('div', class_=abbreviations_class).text.strip()

    known[first_team] = first_abbreviation
    known[second_team] = second_abbreviation

  # Map the full names to the collected abbreviations
  for source, target in (("team1", "team1_abrev"), ("team2", "team2_abrev")):
    matches[target] = matches[source].map(known)

  return matches


In [11]:
def get_team_stats(team: str, stats: pd.DataFrame) -> pd.DataFrame:
  """Return the rows of `stats` whose "Team" value equals `team`."""
  belongs_to_team = stats["Team"] == team
  return stats[belongs_to_team]

def add_team_stats_to_matches(matches: pd.DataFrame, stats: dict[str, pd.DataFrame]) -> pd.DataFrame:
  """Attach both teams' player stats, serialised as CSV text, to each match.

  A match is dropped when its series has no entry in `stats` or when either
  team has no rows in that series' stats table.

  Args:
    matches: Match rows with "series", "team1_abrev" and "team2_abrev"
      columns. It is not modified.
    stats: Stats tables keyed by series name; each needs a "Team" column.

  Returns:
    A new DataFrame with the surviving matches (labelled by their 0-based
    position in `matches`) plus "team1_stats" and "team2_stats" columns.
  """
  # Renumber on a copy so the caller's frame keeps its own index
  matches = matches.reset_index(drop=True)

  kept = []
  team1_csv = []
  team2_csv = []
  for index, match_ in matches.iterrows():
    # A missing series simply drops the row, whatever its name
    stat = stats.get(match_["series"])
    if stat is None:
      continue

    team1_stats = stat[stat["Team"] == match_["team1_abrev"]]
    team2_stats = stat[stat["Team"] == match_["team2_abrev"]]

    # If either team is not in the stats remove the row
    if len(team1_stats) == 0 or len(team2_stats) == 0:
      continue

    kept.append(index)
    team1_csv.append(team1_stats.to_csv(index=False))
    team2_csv.append(team2_stats.to_csv(index=False))

  result = matches.loc[kept].copy()
  result["team1_stats"] = team1_csv
  result["team2_stats"] = team2_csv
  return result

In [12]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _get_single_event_stats(event_link: str) -> pd.DataFrame:
  """Scrape one event and build its match table; cached per link."""
  print("Starting", event_link)
  stats_link, matches_link = get_links(event_link)
  stats_soup, matches_soup = get_soups(stats_link, matches_link)

  matches = get_matches(matches_soup)

  series = get_series(stats_soup, matches["series"])

  # Extra "Week 3" row with id "0", so a stats page is also fetched under that
  # name. NOTE(review): presumably id 0 matches no real series and therefore
  # excludes nothing - confirm against the site.
  series = pd.concat([series, pd.DataFrame({"name": ["Week 3"], "id": ["0"]})], ignore_index=True)
  cutoff_soups = get_different_cutoffs_soups(stats_link, series)

  stats = get_series_stats(cutoff_soups)

  stats = clean_stats(stats)
  matches = turn_name_into_abbreviations(matches)
  return add_team_stats_to_matches(matches, stats)


def get_event_stats(event_link: Union[str, list[str]]) -> Optional[pd.DataFrame]:
  """Build the match table (with team stats) for one event or several.

  Args:
    event_link: A single event link, or a list/tuple of event links.

  Returns:
    For a single link, that event's match table. For a list/tuple, the
    tables of all links concatenated with a fresh 0..n-1 index, or None
    when the list is empty.
  """
  # Lists are handled here, outside the cache, because lru_cache requires
  # hashable arguments and would reject a list before the body ever ran.
  if isinstance(event_link, (list, tuple)):
    if len(event_link) == 0:
      return None

    frames = []
    total_rows = 0
    for link in event_link:
      frame = get_event_stats(link)
      frames.append(frame)
      total_rows += len(frame)
      print("Done with", link)
      print("Length of event_stats:", total_rows)
    return pd.concat(frames, ignore_index=True)

  # Hand out a copy so callers cannot alter the cached table
  return _get_single_event_stats(event_link).copy()


# Keep the cache controls reachable under the public name
get_event_stats.cache_clear = _get_single_event_stats.cache_clear
get_event_stats.cache_info = _get_single_event_stats.cache_info


# [link to player statistics](https://www.vlr.gg/event/stats/2004/champions-tour-2024-americas-stage-1)

In [13]:
# Build the match table for the configured event and show it
event_stats = get_event_stats(current_event_link)
display(event_stats)

Starting https://www.vlr.gg/event/2004/champions-tour-2024-americas-stage-1


Unnamed: 0,date,team1,team2,winner,series,code,team1_abrev,team2_abrev,team1_stats,team2_stats
5,"April 13, 2024",G2 Esports,100 Thieves,2,Week 2,314628,G2,100T,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
6,"April 13, 2024",Sentinels,Leviatán,2,Week 2,314629,SEN,LEV,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
8,"April 14, 2024",NRG Esports,FURIA,1,Week 2,314631,NRG,FUR,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
9,"April 15, 2024",Cloud9,Evil Geniuses,1,Week 2,314632,C9,EG,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
10,"April 20, 2024",Sentinels,MIBR,2,Week 3,314634,SEN,MIBR,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
11,"April 20, 2024",Cloud9,LOUD,2,Week 3,314635,C9,LOUD,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
12,"April 21, 2024",KRÜ Esports,Evil Geniuses,2,Week 3,314636,KRÜ,EG,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
13,"April 21, 2024",NRG Esports,Leviatán,2,Week 3,314637,NRG,LEV,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
14,"April 22, 2024",G2 Esports,FURIA,2,Week 3,314638,G2,FUR,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."
15,"April 22, 2024",Cloud9,100 Thieves,2,Week 3,314639,C9,100T,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP..."


In [14]:
def get_match_link(match_code) -> str:
  """Return the vlr.gg link of the match page for `match_code`."""
  return "https://www.vlr.gg/{}".format(match_code)


In [15]:
def get_head_to_head_score(match_row: pd.Series, soup: BeautifulSoup) -> tuple[int, int]:
  """Score both teams from the head-to-head list on a match page.

  Each listed encounter gets a weight of 12 - (days before match_row["date"]) / 30.
  Encounters with a weight of zero or less are ignored. For the rest, the
  score difference times the weight is credited to whichever side had the
  higher score. The totals are floats whenever anything was credited.

  Returns:
    A (team1 total, team2 total) tuple.
  """
  score_blocks = soup.find_all("div", class_="match-h2h-matches-score")
  date_blocks = soup.find_all("div", class_="match-h2h-matches-date")

  # Score difference (first span minus second span) for each encounter
  differentials = []
  for score_block in score_blocks:
    spans = score_block.find_all("span")
    first_value = int(spans[0].text.strip())
    second_value = int(spans[1].text.strip())
    differentials.append(first_value - second_value)

  match_date = pd.to_datetime(match_row["date"])

  # Dates are written with "/" separators; swap them for "-" before parsing
  weights = []
  for date_block in date_blocks:
    encounter_date = pd.to_datetime(date_block.text.replace("/", "-"))
    elapsed_days = (match_date - encounter_date).days
    weights.append(12 - (elapsed_days / 30))

  team1_total = 0
  team2_total = 0
  for position, weight in enumerate(weights):
    if weight <= 0:
      continue

    differential = differentials[position]
    if differential > 0:
      team1_total += differential * weight
    elif differential < 0:
      team2_total += abs(differential) * weight

  return team1_total, team2_total

def get_previous_matches_score(soup: BeautifulSoup) -> tuple[int, int]:
  """Count recent wins per team from the match-history links on a match page.

  The links are taken from the first div styled "display: flex;". The first
  five are counted for team 1 and all remaining ones for team 2; a link counts
  as a win when "mod-win" is among its classes.
  """
  history_block = soup.find("div", style="display: flex;")
  history_links = history_block.find_all("a", class_=lambda x: x and x.startswith("match-histories-item"))

  wins = ["mod-win" in history_link["class"] for history_link in history_links]

  return sum(wins[:5]), sum(wins[5:])

def add_previous_games_scores(data: pd.DataFrame) -> pd.DataFrame:
  """Add recent-form and head-to-head columns by visiting every match page.

  Args:
    data: Match rows with "code" and "date" columns. It is not modified.

  Returns:
    A copy of `data` with integer columns "team1_previous_score" and
    "team2_previous_score", and float columns "team1_hth" and "team2_hth".
  """
  new_data = data.copy()
  new_data["team1_previous_score"] = 0
  new_data["team2_previous_score"] = 0

  # Head-to-head totals are fractional, so the columns start out as floats.
  # Writing floats into an integer column relies on silent upcasting, which
  # recent pandas versions deprecate.
  new_data["team1_hth"] = 0.0
  new_data["team2_hth"] = 0.0

  for index, row in new_data.iterrows():
    # Progress indicator, overwritten in place
    print(index, end="\r")
    code = row["code"]
    soup = BeautifulSoup(requests.get(get_match_link(code)).content, 'html.parser')

    t1_score, t2_score = get_previous_matches_score(soup)
    new_data.at[index, "team1_previous_score"] = t1_score
    new_data.at[index, "team2_previous_score"] = t2_score

    t1_hth, t2_hth = get_head_to_head_score(row, soup)
    new_data.at[index, "team1_hth"] = t1_hth
    new_data.at[index, "team2_hth"] = t2_hth

  return new_data

In [16]:
# Add recent-form and head-to-head columns (requests each match page once)
final_stats = add_previous_games_scores(event_stats)

19

# [Link to match stats](https://www.vlr.gg/314634/sentinels-vs-mibr-champions-tour-2024-americas-stage-1-w3)

In [17]:
# Preview the first 10 rows of the final table
display(final_stats.head(10))

Unnamed: 0,date,team1,team2,winner,series,code,team1_abrev,team2_abrev,team1_stats,team2_stats,team1_previous_score,team2_previous_score,team1_hth,team2_hth
5,"April 13, 2024",G2 Esports,100 Thieves,2,Week 2,314628,G2,100T,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",3,0,0.0,0.0
6,"April 13, 2024",Sentinels,Leviatán,2,Week 2,314629,SEN,LEV,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",4,2,10.4,6.0
8,"April 14, 2024",NRG Esports,FURIA,1,Week 2,314631,NRG,FUR,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",3,0,23.066667,0.0
9,"April 15, 2024",Cloud9,Evil Geniuses,1,Week 2,314632,C9,EG,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",2,3,0.0,2.2
10,"April 20, 2024",Sentinels,MIBR,2,Week 3,314634,SEN,MIBR,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",3,3,20.4,0.0
11,"April 20, 2024",Cloud9,LOUD,2,Week 3,314635,C9,LOUD,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",3,2,0.0,0.0
12,"April 21, 2024",KRÜ Esports,Evil Geniuses,2,Week 3,314636,KRÜ,EG,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",2,2,0.0,0.0
13,"April 21, 2024",NRG Esports,Leviatán,2,Week 3,314637,NRG,LEV,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",4,2,0.0,0.0
14,"April 22, 2024",G2 Esports,FURIA,2,Week 3,314638,G2,FUR,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",3,0,0.0,0.0
15,"April 22, 2024",Cloud9,100 Thieves,2,Week 3,314639,C9,100T,"Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...","Player,Team,Rnd,R,ACS,K:D,KAST,ADR,KPR,APR,FKP...",3,1,8.666667,0.0


In [18]:
# Save the final table as CSV in the project folder, without the row index
final_stats.to_csv(folderpath + "final_test_set.csv", index=False)