#### Scrapping LOTR Fandom Wiki to get the list of characters

In [1]:
import requests
from bs4 import BeautifulSoup

# Base URL of the LOTR wiki
base_url = 'https://lotr.fandom.com'

# Function to get character names from a category page URL
def get_character_names(category_url):
    character_names = []
    url = base_url + category_url

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all elements with class category-page__members or category-page__trending-page
    divs = soup.find_all('div', {'class': ['category-page__members', 'category-page__trending-page']})
    for div in divs:
        # Find all character links within the div
        for link in div.find_all('a', href=True):
            href = link['href']
            if href.startswith('/wiki/') and ':' not in href:  # Ensure link is a wiki character page
                character_name = link.text.strip()
                if character_name:
                    character_names.append(character_name)

    return character_names

# Main script
def main():
    categories = [
        '/wiki/Category:Major_characters_(The_Lord_of_the_Rings)',
        '/wiki/Category:Minor_characters_(The_Lord_of_the_Rings)',
        '/wiki/Category:The_Lord_of_the_Rings_characters'
    ]

    all_character_names = []
    for category in categories:
        character_names = get_character_names(category)
        #print(character_names)
        all_character_names += character_names
    
    return list(set(all_character_names))
all_character_names = main()


In [2]:
print(all_character_names)
len(all_character_names)

['Angbor', 'Denethor II', 'Rose Cotton', 'Goodman Maggot', 'Radagast', 'Harding', 'Lagduf', 'Fredegar Bolger', 'Éomer', 'Elfhelm', 'Lotho Sackville-Baggins', 'Boromir', 'Fastred (Pelennor Fields)', 'Wilcombe (Jolly) Cotton', 'Mauhúr', 'Gollum', 'Glóin', 'Bilbo Baggins', 'Frodo Baggins', 'Bruno Bracegirdle', 'Peregrin Took', 'Celeborn', 'Angelica Baggins', 'Hirgon', 'Erkenbrand', 'Hirluin', 'Aragorn II', 'Bregalad', 'Legolas', 'Gamling', 'Grimbeorn', 'Galadriel', 'Otho Sackville-Baggins', 'Mouth of Sauron', 'Odo Proudfoot', 'Lobelia Sackville-Baggins', 'Théoden', 'Baranor (Gondor)', 'Ceorl', 'Grishnákh', 'Húrin (Warden of the Keys)', 'Shelob', 'Glorfindel', 'Horn', "Durin's Bane", 'Bergil', 'Ghân-buri-Ghân', 'Bill Ferny', 'Halbarad', 'Daddy Twofoot', 'Haldir (Lothlórien)', 'Déagol', 'Lindir', 'Meneldor', 'Adelard Took', 'Lugdush', 'Widow Rumble', 'Imrahil', 'Samwise Gamgee', 'Uglúk', 'Radbug', 'Círdan', 'Elladan and Elrohir', 'Orophin', 'Gléowine', 'Éothain', 'Gimli', 'Beechbone', 'Duil

152

#### Extracting the list of aliases for the above characters from the wiki

In [3]:
import requests
from bs4 import BeautifulSoup

# Base URL of the LOTR wiki
base_url = 'https://lotr.fandom.com/wiki/'

# Function to generate URL for a character
def generate_character_url(character_name):
    # Replace spaces with underscores for URL compatibility
    character_name_url = character_name.replace(' ', '_')
    return base_url + character_name_url

# Function to extract "Other names" from a character page
def get_other_names(character_url):
    response = requests.get(character_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    other_names = []
    for div in soup.find_all('div', {'class': 'pi-item'}):
        header = div.find('h3')
        if header and 'Other names' in header.text:
            other_names_list = div.find('div', {'class': 'pi-data-value'})
            if other_names_list:
                other_names = [name.strip() for name in other_names_list.text.split(',')]
                break  # We found the "Other names" section, no need to continue
    
    return other_names

# Main script
def main(character_names):
    all_other_names = {}

    for character_name in character_names:
        url = generate_character_url(character_name)
        other_names = get_other_names(url)
        if other_names:
            all_other_names[character_name] = other_names

    # Print the results
    for character, names in all_other_names.items():
        print(f"{character}: {', '.join(names)}")

    return all_other_names

# Run the script
aliases = main(all_character_names)


Angbor: Angbor the Fearless
Goodman Maggot: Farmer Maggot, Muddyfeet
Radagast: Aiwendil, Hrávandil, Tender of Beasts, Bird Friend, "the Bird-tamer", "the Simple", "the Fool"
Fredegar Bolger: Fatty
Éomer: Éomer Éadig
Lotho Sackville-Baggins: Little Pimple, Pimple, The Boss, The Chief
Gollum: Trahald, Slinker, Stinker, Shelob's Sneak
Bilbo Baggins: Mr. Invisible Baggins, Bilbo Took (see more)
Frodo Baggins: Frodo of the Nine Fingers, Nine-Fingered Frodo, Maura Labingi(Westron name), Mr. Underhill, Sneaky hobbit (by Gollum)
Peregrin Took: Pippin, Razanur Tûk, (Westron name)Ernil i Pheriannath, Thain Peregrin I, Fool of a Took
Celeborn: Teleporno[1][2] (Quenya)
Hirluin: Hirluin the Fair, Hirluin of the Green Hills
Aragorn II: Estel, Thorongil, Elessar / Edhelharn, Telcontar, Envinyatar, Strider, The Dúnadan, Wingfoot
Bregalad: Quickbeam
Legolas: Greenleaf[1]
Grimbeorn: Grimbeorn the Old
Galadriel: Alatáriel, Artanis, Nerwen
Lobelia Sackville-Baggins: Lobelia Bracegirdle, Mistress Lobelia
T

##### These two outputs were manually cleaned to create cleaned_name.csv and character_alaises.txt respectively

#### Extracting the list of races for the above characters from the wiki

In [9]:
import requests
from bs4 import BeautifulSoup

# Example function to scrape a category page
def scrape_category(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        characters = []
        for link in soup.select('a.category-page__member-link'):
            char_name = link.text
            char_url = link['href']
            characters.append((char_name, "https://lotr.fandom.com"+char_url))
        return characters
    else:
        return None

# Example function to get character race
def get_character_race(char_url):
    response = requests.get(char_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Implement the logic to find the race of the character
        # This will depend on the structure of the character page
        ele = soup.find(text='Race')
        if ele:
            race = ele.find_next('div').text
            return race
        else:
            return "Not found"
    else:
        return None

# URLs to scrape
urls = [
    'https://lotr.fandom.com/wiki/Category:Major_characters_(The_Lord_of_the_Rings)',
    'https://lotr.fandom.com/wiki/Category:Minor_characters_(The_Lord_of_the_Rings)',
    'https://lotr.fandom.com/wiki/Category:The_Lord_of_the_Rings_characters'
]
races = {}
for url in urls:
    characters = scrape_category(url)
    #print(characters)
    for char_name, char_url in characters:
        race = get_character_race(char_url)
        if race != "Not found":
            races[char_name] = race

  ele = soup.find(text='Race')


In [10]:
races

{'Aragorn II': 'Men',
 'Arwen': 'Half-elves (chose to be mortal)',
 'Frodo Baggins': 'Men (Hobbits)',
 'Boromir': 'Men',
 'Meriadoc Brandybuck': 'Hobbit',
 'Celeborn': 'Elves',
 'Denethor II': 'Men',
 'Elrond': 'Half-elves (chose to be immortal)',
 'Éomer': 'Men',
 'Éowyn': 'Men',
 'Faramir': 'Men',
 'Galadriel': 'Elves',
 'Samwise Gamgee': 'Hobbits',
 'Gandalf': 'Ainur',
 'Gimli': 'Dwarves',
 'Glorfindel': 'Elves',
 'Gollum': 'Hobbits',
 'Gwaihir': 'Great Eagles',
 'Legolas': 'Elves',
 'Saruman': 'Ainur',
 'Sauron': 'Ainur (Maiar);Maia of Morgoth (initially of Aulë)',
 'Théoden': 'Men',
 'Peregrin Took': 'Hobbits',
 'Treebeard': 'Ents, Onodrim',
 'Witch-king of Angmar': 'Man (Wraith)',
 'Gríma': 'Men',
 'Anborn': 'Men',
 'Angbor': 'Men',
 'Angelica Baggins': 'Hobbit',
 'Willie Banks': 'Hobbits',
 'Baranor (Gondor)': 'Men',
 'Beechbone': 'Ents',
 'Beorn': 'Men',
 'Bergil': 'Men',
 'Bob': 'Hobbits',
 'Folco Boffin': 'Hobbits',
 'Fredegar Bolger': 'Hobbits',
 'Bruno Bracegirdle': 'Hobbit

In [14]:
with open('data/cleaned_name.csv', 'r', encoding='utf-8') as myfile:
    names=myfile.read()
characters=names.split('\n')
characters

['Anborn',
 'Angbor',
 'Aragorn',
 'Arod',
 'Arwen',
 'Asfaloth',
 'Angelica Baggins',
 'Dora Baggins',
 'Frodo',
 'Willie Banks',
 'Baranor',
 'Beechbone',
 'Beorn',
 'Beregond',
 'Bergil',
 'Bilbo',
 'Bob',
 'Folco Boffin',
 'Fredegar Bolger',
 'Tom Bombadil',
 'Boromir',
 'Bruno Bracegirdle',
 'Melilot Brandybuck',
 'Meriadoc Brandybuck',
 'Rorimac Brandybuck',
 'Bregalad',
 'Milo Burrows',
 'Barliman Butterbur',
 'Celeborn',
 'Ceorl',
 'Bowman Cotton',
 'Carl Cotton',
 'Rosie Cotton',
 'Tolman Cotton',
 'Wilcombe Cotton',
 'Cirdan',
 'Damrod',
 'Denethor',
 'Derufin',
 'Dervorin',
 'Duilin',
 'Duinhir',
 'Durin’s Bane',
 'Dain',
 'Deagol',
 'Deorwine',
 'Dunhere',
 'Elfhelm',
 'Elladan and Elrohir',
 'Elrond',
 'Eomer',
 'Eowyn',
 'Erestor',
 'Erkenbrand',
 'Faramir',
 'Fastred',
 'Bill Ferny',
 'Fimbrethil',
 'Finglas',
 'Firefoot',
 'Fladrif',
 'Forlong',
 'Galadriel',
 'Galdor',
 'Hamfast Gamgee',
 'Sam',
 'Gamling',
 'Gandalf',
 'Ghan-buri-Ghan',
 'Gildor Inglorion',
 'Gimli',


In [15]:
character_races = {}
for character in characters:
    if character in races:
        character_races[character] = races[character]
    else:
        character_races[character] = "Not Found"

In [16]:
character_races

{'Anborn': 'Men',
 'Angbor': 'Men',
 'Aragorn': 'Not Found',
 'Arod': 'Not Found',
 'Arwen': 'Half-elves (chose to be mortal)',
 'Asfaloth': 'Not Found',
 'Angelica Baggins': 'Hobbit',
 'Dora Baggins': 'Not Found',
 'Frodo': 'Not Found',
 'Willie Banks': 'Hobbits',
 'Baranor': 'Not Found',
 'Beechbone': 'Ents',
 'Beorn': 'Men',
 'Beregond': 'Not Found',
 'Bergil': 'Men',
 'Bilbo': 'Not Found',
 'Bob': 'Hobbits',
 'Folco Boffin': 'Hobbits',
 'Fredegar Bolger': 'Hobbits',
 'Tom Bombadil': 'Unknown',
 'Boromir': 'Men',
 'Bruno Bracegirdle': 'Hobbits',
 'Melilot Brandybuck': 'Hobbits',
 'Meriadoc Brandybuck': 'Hobbit',
 'Rorimac Brandybuck': 'Hobbits',
 'Bregalad': 'Ents',
 'Milo Burrows': 'Hobbits',
 'Barliman Butterbur': 'Men',
 'Celeborn': 'Elves',
 'Ceorl': 'Men',
 'Bowman Cotton': 'Hobbits',
 'Carl Cotton': 'Hobbits',
 'Rosie Cotton': 'Not Found',
 'Tolman Cotton': 'Hobbits',
 'Wilcombe Cotton': 'Not Found',
 'Cirdan': 'Not Found',
 'Damrod': 'Men',
 'Denethor': 'Not Found',
 'Derufin