# Data Collection

### Riot - Data Dragon

In [17]:
import requests
import json
import os

# Data Dragon URL for latest version
VERSIONS_URL = "https://ddragon.leagueoflegends.com/api/versions.json"
BASE_URL = "https://ddragon.leagueoflegends.com/cdn"

# Get latest game version
def get_latest_version():
    res = requests.get(VERSIONS_URL)
    res.raise_for_status()
    versions = res.json()
    return versions[0]

# Load champion data
def get_champion_data(version):
    champ_list_url = f"{BASE_URL}/{version}/data/en_US/champion.json"
    res = requests.get(champ_list_url)
    res.raise_for_status()
    champions = res.json()["data"]
    return champions

# Get full details for each champion
def get_detailed_champion_data(version, champ_id):
    detail_url = f"{BASE_URL}/{version}/data/en_US/champion/{champ_id}.json"
    res = requests.get(detail_url)
    res.raise_for_status()
    return res.json()["data"][champ_id]

# Extract relevant text and save
def build_champion_text_dataset():
    version = get_latest_version()
    champions = get_champion_data(version)
    
    all_champions = []
    
    for champ_key in champions:
        champ_id = champions[champ_key]["id"]
        detail = get_detailed_champion_data(version, champ_id)
        
        name = detail["name"]
        title = detail["title"]
        blurb = detail["blurb"]
        
        passive = detail["passive"]["description"]
        spells = [spell["description"] for spell in detail["spells"]]
        
        full_text = f"{name}, {title}. {blurb} Passive: {passive} " + " ".join([f"Spell: {s}" for s in spells])
        
        all_champions.append({
            "name": name,
            "title": title,
            "blurb": blurb,
            "passive": passive,
            "spells": spells,
            "full_text": full_text
        })
    
    # Save as JSON
    with open("riot_champion_data.json", "w", encoding="utf-8") as f:
        json.dump(all_champions, f, indent=2, ensure_ascii=False)

    print(f"Saved data for {len(all_champions)} champions.")

# Run it
build_champion_text_dataset()


Saved data for 170 champions.


### LoL Wiki

In [18]:
import requests
from bs4 import BeautifulSoup
import json
import time

def scrape_lol_wiki(champion_name):
    url_name = champion_name.replace(" ", "_")
    url = f"https://leagueoflegends.fandom.com/wiki/{url_name}"
    headers = {"User-Agent": "Mozilla/5.0"}
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except Exception as e:
        print(f"Failed to fetch {champion_name}: {e}")
        return None
    
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Get title from page header
    title_el = soup.find("h1", {"class": "page-header__title"})
    title = title_el.text.strip() if title_el else ""

    # Find roles
    role_box = soup.find("div", {"class": "pi-data-value pi-font"})
    roles = role_box.text.strip() if role_box else ""

    # Ability descriptions (grab first few paragraphs under abilities section)
    ability_section = soup.find("span", {"id": "Abilities"})
    if ability_section:
        ability_texts = []
        for tag in ability_section.find_parent().find_next_siblings("p", limit=6):
            ability_texts.append(tag.get_text().strip())
        abilities = " ".join(ability_texts)
    else:
        abilities = ""

    return {
        "champion": champion_name,
        "title": title,
        "roles": roles,
        "abilities": abilities
    }

def save_all_wiki_data_from_riot_json(riot_filename="riot_champion_data.json"):
    # Load all champion names from Riot JSON
    try:
        with open(riot_filename, "r", encoding="utf-8") as f:
            riot_data = json.load(f)
        champion_names = [entry["name"] for entry in riot_data]
    except Exception as e:
        print(f"Failed to load {riot_filename}: {e}")
        return

    all_data = []
    for i, name in enumerate(champion_names):
        #print(f"Scraping {name} ({i+1}/{len(champion_names)})...")         #uncomment to verify progress
        data = scrape_lol_wiki(name)
        if data:
            all_data.append(data)
            #print("Sample entry:", data)       #uncomment to verify progress
        time.sleep(1.5)  # Prevent overloading the site

    with open("wiki_champion_data.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)
    print(f"\nSaved data for {len(all_data)} champions to wiki_champion_data.json.")

# Run it
save_all_wiki_data_from_riot_json()

# Printing Ahri's data as an example
with open("wiki_champion_data.json", "r", encoding="utf-8") as f:
    wiki_data = json.load(f)
    for champion in wiki_data:
        if champion["champion"] == "Ahri":
            print(json.dumps(champion, indent=2, ensure_ascii=False))
            break

KeyboardInterrupt: 

### Mobafire

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import re

# Known problematic Mobafire champion URL fixes
MOBALFIRE_NAME_FIXES = {
    "Aurelion Sol": "aurelion-sol-130",
    "Bel'Veth": "belveth",
    "Cho'Gath": "chogath",
    "Dr. Mundo": "dr-mundo-26",
    "Jarvan IV": "jarvan-iv-71",
    "Kai'Sa": "kaisa",
    "Kha'Zix": "khazix",
    "Kog'Maw": "kogmaw",
    "K'Sante": "ksante",
    "Lee Sin": "lee-sin-73",
    "Master Yi": "master-yi-3",
    "Miss Fortune": "miss-fortune-59",
    "Nunu & Willump": "nunu-amp-willump-12",
    "Rek'Sai": "reksai",
    "Renata Glasc": "renata-glasc-175",
    "Tahm Kench": "tahm-kench-126",
    "Twisted Fate": "twisted-fate-28",
    "Vel'Koz": "velkoz",
    "Xin Zhao": "xin-zhao-55",
    "Wukong": "wukong-80",
}

# Format fallback names and fix special characters
def format_mobafire_name(name):
    if name in MOBALFIRE_NAME_FIXES:
        return MOBALFIRE_NAME_FIXES[name]

    fallback = name.lower().replace(" ", "-")
    fallback = fallback.replace("'", "").replace(".", "").replace("&", "and")
    fallback = re.sub(r"[^\w\-]", "", fallback)
    return fallback

# Scrape a single champion's Mobafire guide description
def scrape_mobafire_description(champion_name):
    base_name = format_mobafire_name(champion_name)
    listing_url = f"https://www.mobafire.com/league-of-legends/champion/{base_name}"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        listing_response = requests.get(listing_url, headers=headers)
        listing_response.raise_for_status()
    except Exception as e:
        print(f"Failed to fetch {champion_name} listing page: {e}")
        return {"champion": champion_name, "mobafire_description": "Error loading page"}

    listing_soup = BeautifulSoup(listing_response.text, "html.parser")

    guide_link_tag = listing_soup.find("a", class_="browse-list__guide-title")
    if not guide_link_tag or "href" not in guide_link_tag.attrs:
        return {"champion": champion_name, "mobafire_description": "No guide found"}

    guide_url = "https://www.mobafire.com" + guide_link_tag["href"]

    try:
        guide_response = requests.get(guide_url, headers=headers)
        guide_response.raise_for_status()
    except Exception as e:
        print(f"Failed to fetch guide for {champion_name}: {e}")
        return {"champion": champion_name, "mobafire_description": "Error loading guide"}

    guide_soup = BeautifulSoup(guide_response.text, "html.parser")

    content_div = guide_soup.find("div", class_="view-guide__full")
    if not content_div:
        return {"champion": champion_name, "mobafire_description": "No content found"}

    text = content_div.get_text(separator=" ", strip=True)
    return {
        "champion": champion_name,
        "mobafire_description": text[:1000]  # Limit to first 1000 chars
    }

# Load champion names and run the full scrape
def save_all_mobafire_data_from_riot_json(riot_filename="riot_champion_data.json"):
    try:
        with open(riot_filename, "r", encoding="utf-8") as f:
            riot_data = json.load(f)
        champion_names = [entry["name"] for entry in riot_data]
    except Exception as e:
        print(f"Failed to load {riot_filename}: {e}")
        return

    all_data = []
    for i, name in enumerate(champion_names):
        #print(f"Scraping Mobafire guide for {name} ({i+1}/{len(champion_names)})...")
        data = scrape_mobafire_description(name)
        all_data.append(data)
        time.sleep(2)  # Be kind to Mobafire servers

    with open("mobafire_champion_data.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)

    print(f"\nSaved data for {len(all_data)} champions to mobafire_champion_data.json.")

# Run the scraper
if __name__ == '__main__':
    save_all_mobafire_data_from_riot_json()



Saved data for 170 champions to mobafire_champion_data.json.


### Merge 3 sources

In [None]:
import json

def load_json(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)

# Load all sources
riot_data = load_json("riot_champion_data.json")
wiki_data = load_json("wiki_champion_data.json")
mobafire_data = load_json("mobafire_champion_data.json")

# Convert wiki and mobafire to dicts for faster lookup
wiki_lookup = {entry["champion"]: entry for entry in wiki_data}
mobafire_lookup = {entry["champion"]: entry for entry in mobafire_data}

# Merge data
compiled = []

for entry in riot_data:
    name = entry["name"]
    riot_text = entry.get("full_text", "")
    wiki_text = wiki_lookup.get(name, {}).get("abilities", "")
    mobafire_text = mobafire_lookup.get(name, {}).get("mobafire_description", "")

    combined_text = f"{riot_text}\n\nWIKI:\n{wiki_text}\n\nMOBAFIRE:\n{mobafire_text}"

    compiled.append({
        "champion": name,
        "combined_text": combined_text
    })

# Save the final compiled profile
with open("compiled_champion_profiles.json", "w", encoding="utf-8") as f:
    json.dump(compiled, f, indent=2, ensure_ascii=False)

print(f"Saved {len(compiled)} champions to compiled_champion_profiles.json.")


Saved 170 champions to compiled_champion_profiles.json.


# Data Cleaning and Preprocessing

In [2]:
# Cleaning the champion data json
import re
import json
from sentence_transformers import SentenceTransformer

# Load raw compiled text
with open("compiled_champion_profiles.json", "r", encoding="utf-8") as f:
    champion_data = json.load(f)

# Clean text
def clean_combined_text(combined_text):
    text = combined_text.lower().strip()
    # Remove special characters and digits
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text)
    # Remove html-like tags
    text = re.sub(r'<[^>]+>', '', text)
    # Replace escaped newlines
    text = re.sub(r'\\n|\\', ' ', text)
    # Rollapse whitespace
    text = re.sub(r'\s+', ' ', text)

# Process and store cleaned data
cleaned_data = []
for entry in champion_data:
    cleaned_text = clean_combined_text(entry["combined_text"])
    cleaned_data.append({
        "champion": entry["champion"],
        "cleaned_text": cleaned_text
    })

# Save to new file
with open("cleaned_champion_profiles.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

print(f" Saved cleaned data for {len(cleaned_data)} champions to cleaned_champion_profiles.json.")


  from .autonotebook import tqdm as notebook_tqdm


 Saved cleaned data for 170 champions to cleaned_champion_profiles.json.


In [15]:
# Load cleaned data for embedding
with open("cleaned_champion_profiles.json", "r", encoding="utf-8") as f:
    cleaned_data = json.load(f)

champions = []
texts = []

for entry in cleaned_data:
    if entry["cleaned_text"]:  # filters out None or empty strings
        champions.append(entry["champion"])
        texts.append(entry["cleaned_text"])


# Create SBERT model and generate embeddings ---
SBERT_model = SentenceTransformer('all-MiniLM-L6-v2')
combined_text_embeddings = SBERT_model.encode(texts, convert_to_tensor=True)

print("SBERT embeddings generated.")

SBERT embeddings generated.


In [16]:
print("champions: ", champions)
print("texts: ", texts)

champions:  []
texts:  []


I changed the function because the original version would be reloading the SBERT model every time a user entered something, which would slow things down unnecessarily. It would also be embedding the wrong variable by mistake. In the updated version, it will only clean and embed the actual user input, using the model that we already loaded at the start of the script. This will make everything faster and work the way we intended.

In [6]:

# function for cleaning AND embedding the user input
def clean_and_embed_user_input(user_input, model):
    # lowercase and strip
    text = user_input.lower().strip()
    # remove special characters and digits
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # remove extra spaces
    text = re.sub(r"\s+", " ", text)
    # remove html-like tags
    text = re.sub(r'<[^>]+>', '', text)
    # replace escaped newlines
    text = re.sub(r'\\n|\\', ' ', text)
    # collapse whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return model.encode(text, convert_to_tensor=True)

In [7]:
# sample user input
user_input = "I want to play a champion with high mobility and burst damage."
# send user input through clean and embed function
user_input_embedding = clean_and_embed_user_input(user_input, SBERT_model)

# Text Comparisons

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# convert user input embedding to numpy array
#user_input_embedding = user_input_embedding.cpu().numpy()
# reshape to 2D array for cosine similarity calculation
user_input_embedding = user_input_embedding.reshape(1, -1)

# convert champion embeddings to numpy array
#combined_text_embeddings = combined_text_embeddings.cpu().numpy()

# calculate cosine similarity between the champion descriptions and the user input
similarities = cosine_similarity(user_input_embedding, combined_text_embeddings)

ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [14]:
print("Champion Texts:", texts)

Champion Texts: []


In [None]:
# get the top k (start with 5) most similar champions
top_k = 5
top_k_indices = similarities.argsort()[0][-top_k:][::-1]
top_k_champions = []

for i in top_k_indices:
    print(f"Champion: {champions[i]}, Similarity: {similarities[0][i]}")
    top_k_champions.append(champions[i])


We will need more text preprocessing:
- tokenization
- truncation to fixed length
- vocab and word embeddings (gates and chatgpt reccomend GloVe and Word2Vec)

# Model Architecture
- started this for if we decide to build a CNN (tabling for now)

In [None]:
# creating a CNN using PyTorch for sentence similarity
import torch
import torch.nn as nn
import torch.nn.functional as F

class CHARMModel(nn.Module):
    # starting with model from Coding Tutorial 11 since it works with text
    def __init__(self, vocab_size, output_classes, hidden_dim=512):
        super(CHARMModel, self).__init__()
        # IF WE PASS TOKEN IDS (AND NOT WORD VECTORS) INTO THE MODEL, THEN INCLUDE THE FOLLOWING LINE 
        # self.embedding = nn.Embedding(vocab_size, 300)  # Assuming 300-dimensional word vectors

        # removed image processing part since we are only working with text
        
        # Question processing part (fully connected)
        self.fc_question1 = nn.Linear(vocab_size, 512)
        self.fc_question2 = nn.Linear(512, hidden_dim)

        # removing concatenation of image and text features since we only have text

        # final classification layer
        self.fc_output = nn.Linear(hidden_dim, output_classes)

    def forward(self, question):
        # removed image part

        # Question part (Fully Connected)
        x_question = F.relu(self.fc_question1(question))
        x_question = F.relu(self.fc_question2(x_question))

        # removing fusion between image and text features since we only have text

        # Apply fusion layer
        x_fusion = F.relu(self.fc_fusion(x_fusion))

        # Output classification
        output = self.fc_output(x_fusion)
        return output