# Imports

In [165]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import pandas as pd
import numpy as np
import re
import csv

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [166]:
site = 'https://game8.co/games/Last-of-Us-2/archives/290290' # walkthrough page
response = requests.get(site)
response

<Response [200]>

In [167]:
soup = BeautifulSoup(response.text, 'html.parser')

# Get images function

In [4]:
def get_filtered_image_links(soup, target_names):
    images = []
    img_tags = soup.find_all('img')
    
    for img in img_tags:
        img_url = img.get('data-src') or img.get('src')
        
        if img_url:
            if img_url.startswith('//'):
                img_url = 'https:' + img_url
            elif img_url.startswith('/'):
                img_url = 'https://game8.co' + img_url

            alt_text = img.get('alt', '').lower()
            
            if any(target_name.lower() in alt_text for target_name in target_names):
                images.append({
                    'url': img_url,
                    'alt': alt_text
                })
    
    return images

# Story

## Data of chapter number and chapter name

In [5]:
chapternumber = 'https://game8.co/games/Last-of-Us-2' # main information page
response = requests.get(chapternumber)
chapternumber_soup = BeautifulSoup(response.text , 'html.parser')
response

<Response [200]>

In [6]:
find_names = [
    'Jackson', 'Seattle Day 1', 'Seattle Day 2', 'Seattle Day 3', 'The Park', 
    'The Farm', 'Santa Barbara'
]

filtered_images = get_filtered_image_links(soup, find_names)

chapter_images = {}
for image in filtered_images:
    for chapter in find_names:
        if chapter.lower() in image['alt']:
            chapter_images[chapter] = image['url']

print(chapter_images)

{'Jackson': 'https://img.game8.co/3252676/016d74f47a69207695a8bd98faa9df04.jpeg/show', 'Seattle Day 1': 'https://img.game8.co/3252681/208e3bd72b0c4ce49f7c2ee308853d6a.jpeg/show', 'Seattle Day 2': 'https://img.game8.co/3252682/541861fadb8610c141286de7d2f98d0c.jpeg/show', 'Seattle Day 3': 'https://img.game8.co/3255016/3fb74e357dc25e5e5368f6cfb994bd54.jpeg/show', 'The Park': 'https://img.game8.co/3252680/68d10ee83ce0e2bad1b133e3aaef1019.jpeg/show', 'The Farm': 'https://img.game8.co/3252775/c195c6ace9970eb25b97e3e7dc47d8a8.png/show', 'Santa Barbara': 'https://img.game8.co/3252774/2a7f34405d54ee3b0e1ebfdecb2831d5.png/show'}


In [7]:
chapternumber = 'https://game8.co/games/Last-of-Us-2' # main information page
response = requests.get(chapternumber)
chapternumber_soup = BeautifulSoup(response.text , 'html.parser')
response

<Response [200]>

In [8]:
chapter_data = []
table = chapternumber_soup.find('table', class_='a-table')

if table:
    rows = table.find_all('tr')
    for row in rows:
        chapter_header = row.find('th').text.strip()
        chapter_data_row = row.find('td').text.strip()
        chapter_name = chapter_data_row
        chapter_image_url = chapter_images.get(chapter_name, None)

        chapter_data.append({
            'Chapter Number': chapter_header,
            'Chapter Name': chapter_name,
            'Chapter Image': chapter_image_url
        })

df_chapters = pd.DataFrame(chapter_data)
df_chapters.head() 

Unnamed: 0,Chapter Number,Chapter Name,Chapter Image
0,Chapter 1,Jackson,https://img.game8.co/3252676/016d74f47a6920769...
1,Chapter 2,Seattle Day 1,https://img.game8.co/3252681/208e3bd72b0c4ce49...
2,Chapter 3,Seattle Day 2,https://img.game8.co/3252682/541861fadb8610c14...
3,Chapter 4,Seattle Day 3,https://img.game8.co/3255016/3fb74e357dc25e5e5...
4,Chapter 5,The Park,https://img.game8.co/3252680/68d10ee83ce0e2bad...


In [9]:
df_chapters.to_csv('../Data/How many chapters.csv', index=False)

## Chapter main objectives

In [10]:
chapter_data = []
table = chapternumber_soup.find('table', class_='a-table')

if table:
    rows = table.find_all('tr')
    for row in rows:
        chapter_header = row.find('th').text.strip()
        chapter_data_row = row.find('td')
        chapter_name = chapter_data_row.text.strip()
        chapter_link_ = chapter_data_row.find('a', class_='a-link')
        chapter_link = chapter_link_['href'] if chapter_link_ else None
        chapter_data.append({'Chapter Number': chapter_header, 'Chapter Name': chapter_name, 'Chapter Link': chapter_link})

df_chapters = pd.DataFrame(chapter_data)
df_chapters.head()

Unnamed: 0,Chapter Number,Chapter Name,Chapter Link
0,Chapter 1,Jackson,/games/Last-of-Us-2/archives/290691
1,Chapter 2,Seattle Day 1,/games/Last-of-Us-2/archives/290692
2,Chapter 3,Seattle Day 2,/games/Last-of-Us-2/archives/290693
3,Chapter 4,Seattle Day 3,/games/Last-of-Us-2/archives/290694
4,Chapter 5,The Park,/games/Last-of-Us-2/archives/290695


In [11]:
chapter_data_with_links = []
base_url = "https://game8.co"

for index, row in df_chapters.iterrows():
    chapter_number = row['Chapter Number']
    chapter_name = row['Chapter Name']
    ch_link = row['Chapter Link']
    if ch_link:
        full_url = base_url + ch_link
        chapter_data_with_links.append({'Chapter Number': chapter_number, 'Chapter Name': chapter_name, 'Chapter Link': full_url})

df_chapters_with_links = pd.DataFrame(chapter_data_with_links)
df_chapters_with_links.head()

Unnamed: 0,Chapter Number,Chapter Name,Chapter Link
0,Chapter 1,Jackson,https://game8.co/games/Last-of-Us-2/archives/2...
1,Chapter 2,Seattle Day 1,https://game8.co/games/Last-of-Us-2/archives/2...
2,Chapter 3,Seattle Day 2,https://game8.co/games/Last-of-Us-2/archives/2...
3,Chapter 4,Seattle Day 3,https://game8.co/games/Last-of-Us-2/archives/2...
4,Chapter 5,The Park,https://game8.co/games/Last-of-Us-2/archives/2...


In [12]:
chapter_soups = []
for url in df_chapters_with_links['Chapter Link']:
    response = requests.get(url)
    chapter_soup = BeautifulSoup(response.text, 'html.parser')
    chapter_soups.append(chapter_soup)

In [13]:
parsed_data = []

for chapter_index, chapter_html in enumerate(chapter_soups):
    chapter_info = df_chapters_with_links.loc[chapter_index]
    chapter_table = chapter_html.find('table', class_='a-table a-table')
    
    if chapter_table:
        table_rows = chapter_table.find_all('tr')
        
        for row in table_rows[1:]:
            row_elements = row.find_all(['th', 'td'])
            
            if len(row_elements) >= 2:
                section_title = row_elements[0].get_text(strip=True)
                primary_goal = row_elements[1].get_text(strip=True)
                
                parsed_data.append({
                    'Chapter Number': chapter_info['Chapter Number'],
                    'Chapter Name': chapter_info['Chapter Name'],
                    'Chapter Section': section_title,
                    'Main Objective': primary_goal
                })

In [15]:
df_final_objectives = pd.DataFrame(parsed_data)
df_final_objectives.head()

Unnamed: 0,Chapter Number,Chapter Name,Chapter Section,Main Objective
0,Chapter 1,Jackson,Prologue,・Ride into town on horse・Play guitar as Joel
1,Chapter 1,Jackson,Waking Up,・Explore the town・Have a snowball fight
2,Chapter 1,Jackson,The Overlook,・Fight through the infected while learning combat
3,Chapter 1,Jackson,Patrol,"・Explore the patrol area, eliminating any infe..."
4,Chapter 1,Jackson,The Horde,"・Run from the infected, following behind Joel ..."


In [16]:
df_final_objectives.to_csv('../Data/Chapter name with chapter section with main objectives.csv', index=False)

## Chapter section walkthrough

In [17]:
section_links_data = []
base_url = "https://game8.co"

for _, chapter_row in df_chapters_with_links.iterrows():
    chapter_name = chapter_row['Chapter Name']
    chapter_url = chapter_row['Chapter Link']

    if not chapter_url:
        continue

    try:
        response = requests.get(chapter_url, timeout=10)
        response.raise_for_status() 
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {chapter_url}: {e}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    content_table = soup.find('table', class_='a-table a-table')

    if not content_table:
        print(f"No relevant table found on {chapter_url}")
        continue

    for table_row in content_table.find_all('tr')[1:]:
        
        first_cell = table_row.find(['th', 'td'])
        if not first_cell:
            continue

        section_title = first_cell.get_text(strip=True)
        link_element = first_cell.find('a', class_='a-link')
        
        section_full_link = None
        if link_element and 'href' in link_element.attrs:
            relative_url = link_element['href']
            section_full_link = base_url + relative_url

        section_links_data.append({
            'Chapter Name': chapter_name,
            'Chapter Section': section_title,
            'Section Link': section_full_link
        })


In [18]:
df_chapter_section_links = pd.DataFrame(section_links_data)
df_chapter_section_links.head()

Unnamed: 0,Chapter Name,Chapter Section,Section Link
0,Jackson,Prologue,https://game8.co/games/Last-of-Us-2/archives/2...
1,Jackson,Waking Up,https://game8.co/games/Last-of-Us-2/archives/2...
2,Jackson,The Overlook,https://game8.co/games/Last-of-Us-2/archives/2...
3,Jackson,Patrol,https://game8.co/games/Last-of-Us-2/archives/2...
4,Jackson,The Horde,https://game8.co/games/Last-of-Us-2/archives/2...


In [21]:
all_section_data = []

def fetch_page_content(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching content from {url}: {e}")
        return None

In [22]:
def extract_walkthrough_steps(soup_object):
    walkthrough_steps = []
    spans = soup_object.find_all('span', style="font-size:120%;")
    
    if spans:
        for i, span in enumerate(spans):
            step_description = span.get_text(strip=True)
            if step_description:
                walkthrough_steps.append(f"{i + 1}. {step_description}")
    
    return "\n".join(walkthrough_steps) if walkthrough_steps else "No walkthrough found"

In [23]:
def extract_tips_and_strategies(soup_object):
    mission_tips = []
    tip_headers = soup_object.find_all('h3', id=re.compile(r'hm_\d+'))

    for header in tip_headers:
        tips_text = []
        for sibling in header.next_siblings:
            if sibling.name in ['h2', 'h3']:
                break
            
            if sibling.name == 'p' and 'a-paragraph' in sibling.get('class', []):
                paragraph_text = ""
                for content in sibling.contents:
                    if isinstance(content, str):
                        paragraph_text += content.strip()
                    elif content.name in ['b', 'span', 'a']:
                        paragraph_text += content.get_text(strip=True)

                if paragraph_text:
                    tips_text.append(paragraph_text)
                    
        if tips_text:
            mission_tips.append(f"**{header.get_text(strip=True)}**\n" + "\n".join(tips_text))
            
    return "\n\n".join(mission_tips) if mission_tips else "No mission tips and strategy found"

In [24]:
def extract_section_image(soup_object):
    archive_wrapper = soup_object.find('div', class_='archive-style-wrapper')
    if archive_wrapper:
        img_tag = archive_wrapper.find('p', class_='a-paragraph').find('img') if archive_wrapper.find('p', class_='a-paragraph') else None
        if img_tag and 'data-src' in img_tag.attrs:
            return img_tag['data-src']
            
    return "No image found"

In [25]:
for _, row_data in df_chapter_section_links.iterrows():
    section_url = row_data['Section Link']
    section_name = row_data['Chapter Section']
    chapter_name = row_data['Chapter Name']

    if not section_url:
        print(f"Skipping empty link for section: {section_name}")
        continue

    section_soup = fetch_page_content(section_url)
    
    if section_soup:
        walkthrough_text = extract_walkthrough_steps(section_soup)
        tips_and_strategies_text = extract_tips_and_strategies(section_soup)
        section_image_link = extract_section_image(section_soup)

        if walkthrough_text == "No walkthrough found":
            print(f"Walkthrough not found for: {section_name}")
        if tips_and_strategies_text == "No mission tips and strategy found":
            print(f"Tips not found for: {section_name}")
        if section_image_link == "No image found":
            print(f"Image not found for: {section_name}")

        all_section_data.append({
            'Chapter Name': chapter_name,
            'Chapter Section': section_name,
            'Section Walkthrough': walkthrough_text,
            'Mission Tips and Strategy': tips_and_strategies_text,
            'Chapter Section Image': section_image_link
        })
    else:
        all_section_data.append({
            'Chapter Name': chapter_name,
            'Chapter Section': section_name,
            'Section Walkthrough': "Error: Could not fetch content.",
            'Mission Tips and Strategy': "Error: Could not fetch content.",
            'Chapter Section Image': "Error: Could not fetch content."
        })

Tips not found for: The Horde
Tips not found for: The Chalet
Tips not found for: Packing Up
Tips not found for: Channel 13
Tips not found for: The Birthday Gift
Tips not found for: St. Mary's Hospital
Tips not found for: Infiltration
Tips not found for: Tracking Lesson
Tips not found for: The Stadium
Tips not found for: The Aquarium
Tips not found for: Return to the Aquarium
Tips not found for: The Farm
Tips not found for: 2425 Constance
Tips not found for: Epilogue


In [26]:
df_section_walkthroughs = pd.DataFrame(all_section_data)
df_section_walkthroughs.head()

Unnamed: 0,Chapter Name,Chapter Section,Section Walkthrough,Mission Tips and Strategy,Chapter Section Image
0,Jackson,Prologue,1. Follow Tommy all the way to town.\n2. Play ...,"**Playing Guitar**\nTo play guitar, push the c...",https://img.game8.co/3254598/6752f40e66a1fc1eb...
1,Jackson,Waking Up,1. Follow Jesse.\n2. Follow Jesse and Maria.\n...,**The Snowball Fight**\nMake a snowball by pic...,https://img.game8.co/3254599/3c820373f2db39162...
2,Jackson,The Overlook,1. Follow Owen\n2. After you and Owen split up...,**Using Listening Mode**\nListen Mode is one o...,https://img.game8.co/3254637/692d3d01b3030c2b6...
3,Jackson,Patrol,1. Follow Dina.\n2. After getting off your hor...,**Utilizing Stealth Kills**\nStealth kills are...,https://img.game8.co/3254638/e4bfa69a24601c619...
4,Jackson,The Horde,1. Run away from the horde of infected.\n2. Fo...,No mission tips and strategy found,https://img.game8.co/3254639/39dd93f96c55ae94e...


In [27]:
pattern_to_remove = r'How to.*?\|.*'

df_section_walkthroughs['Mission Tips and Strategy'] = df_section_walkthroughs['Mission Tips and Strategy'].str.replace(pattern_to_remove, '', regex=True).str.strip()


df_section_walkthroughs.head()

Unnamed: 0,Chapter Name,Chapter Section,Section Walkthrough,Mission Tips and Strategy,Chapter Section Image
0,Jackson,Prologue,1. Follow Tommy all the way to town.\n2. Play ...,"**Playing Guitar**\nTo play guitar, push the c...",https://img.game8.co/3254598/6752f40e66a1fc1eb...
1,Jackson,Waking Up,1. Follow Jesse.\n2. Follow Jesse and Maria.\n...,**The Snowball Fight**\nMake a snowball by pic...,https://img.game8.co/3254599/3c820373f2db39162...
2,Jackson,The Overlook,1. Follow Owen\n2. After you and Owen split up...,**Using Listening Mode**\nListen Mode is one o...,https://img.game8.co/3254637/692d3d01b3030c2b6...
3,Jackson,Patrol,1. Follow Dina.\n2. After getting off your hor...,**Utilizing Stealth Kills**\nStealth kills are...,https://img.game8.co/3254638/e4bfa69a24601c619...
4,Jackson,The Horde,1. Run away from the horde of infected.\n2. Fo...,No mission tips and strategy found,https://img.game8.co/3254639/39dd93f96c55ae94e...


In [28]:
df_section_walkthroughs.to_csv('../Data/Chapter section walkthrough.csv', index=False)

# Collectibles

## Collectibles by Chapter

In [31]:
chapter_collectibles_links = ['https://game8.co/games/Last-of-Us-2/archives/290856',
                              'https://game8.co/games/Last-of-Us-2/archives/290858',
                              'https://game8.co/games/Last-of-Us-2/archives/290861',
                              'https://game8.co/games/Last-of-Us-2/archives/290862',
                              'https://game8.co/games/Last-of-Us-2/archives/290863',
                              'https://game8.co/games/Last-of-Us-2/archives/290869',
                              'https://game8.co/games/Last-of-Us-2/archives/290929',
                              'https://game8.co/games/Last-of-Us-2/archives/290931']

In [32]:
url = "https://game8.co/games/Last-of-Us-2/archives/290856"
response = requests.get(url)
response

<Response [200]>

In [33]:
collectible_soups = []

for link in chapter_collectibles_links:
    try:
        response = requests.get(link)
        response.raise_for_status()
        collectible_soup = BeautifulSoup(response.text, 'html.parser')
        collectible_soups.append(collectible_soup)
        print(f"Successfully fetched: {link}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {link}: {e}")
        collectible_soups.append(None) 

Successfully fetched: https://game8.co/games/Last-of-Us-2/archives/290856
Successfully fetched: https://game8.co/games/Last-of-Us-2/archives/290858
Successfully fetched: https://game8.co/games/Last-of-Us-2/archives/290861
Successfully fetched: https://game8.co/games/Last-of-Us-2/archives/290862
Successfully fetched: https://game8.co/games/Last-of-Us-2/archives/290863
Successfully fetched: https://game8.co/games/Last-of-Us-2/archives/290869
Successfully fetched: https://game8.co/games/Last-of-Us-2/archives/290929
Successfully fetched: https://game8.co/games/Last-of-Us-2/archives/290931


In [34]:
collectible_data = []

for index, soup in enumerate(collectible_soups):
    if soup: 
        chapter_name = "Unknown Chapter"
        title_tag = soup.find('title')
        if title_tag:
            title_text = title_tag.text.strip()
            chapter_match = re.search(r'(.*?) Collectibles Guide', title_text)
            if chapter_match:
                chapter_name = chapter_match.group(1).strip()
            else:
                 header_tag = soup.find('h1') or soup.find('h2')
                 if header_tag:
                     header_text = header_tag.get_text(strip=True)
                     chapter_match = re.search(r'(.*?) Collectibles', header_text)
                     if chapter_match:
                         chapter_name = chapter_match.group(1).strip()


        collectible_sections = {
            'Artifact': re.compile(r'Artifacts', re.IGNORECASE),
            'Journal Entry': re.compile(r'Journal Entries', re.IGNORECASE),
            'Trading Card': re.compile(r'Trading Cards', re.IGNORECASE),
            'Coin': re.compile(r'Coins', re.IGNORECASE),
            'Training Manual': re.compile(r'Training Manuals', re.IGNORECASE)
        }

        for collectible_type, header_pattern in collectible_sections.items():
            collectible_header = soup.find(['h2', 'h3'], string=header_pattern)

            if collectible_header:
                next_element = collectible_header.find_next_sibling()
                while next_element:
                    if next_element.name == 'table' and 'a-table' in next_element.get('class', []):
                        collectible_name = None
                        collectible_chapter = chapter_name 
                        collectible_location = None

                        rows = next_element.find_all('tr')
                        for row in rows:
                            header_row = row.find('th')
                            data_row = row.find('td')

                            if header_row and data_row:
                                header_text = header_row.get_text(strip=True)
                                data_text = data_row.get_text(strip=True)

                                if header_text == 'Chapter':
                                    collectible_chapter = data_text
                                elif header_text == 'Location':
                                    collectible_location = data_text
                                    
                            elif data_row and not header_row:
                                if collectible_name is None:
                                     b_tag = data_row.find('b', class_='a-bold')
                                     collectible_name = b_tag.get_text(strip=True) if b_tag else data_row.get_text(strip=True)



                        if collectible_name:
                             collectible_data.append({
                                'Collectible Name': collectible_name,
                                'Collectible Type': collectible_type,
                                'Which chapter it can be found': collectible_chapter,
                                'Collectible location': collectible_location,

                            })

                    elif next_element.name in ['h2', 'h3']:
                        break

                    next_element = next_element.find_next_sibling()

    else:
        print(f"Skipping processing for link {index} due to fetch error.")

In [35]:
df_final_collectibles = pd.DataFrame(collectible_data)
df_final_collectibles.head()

Unnamed: 0,Collectible Name,Collectible Type,Which chapter it can be found,Collectible location
0,Volunteer Request,Artifact,Jackson - Waking Up,This artifact is on a crate beside multiple pi...
1,The View,Journal Entry,Jackson - Patrol,This journal entry can be gained by interactin...
2,Toy Giraffe,Journal Entry,Jackson - Patrol,This journal entry can be gained by interactin...
3,Seismicayla,Trading Card,Waking Up,This trading card is pinned on a cork board wh...
4,The Keene Twins,Trading Card,Waking Up,This trading card is on top of a barrel beside...


In [36]:
df_final_collectibles.to_csv('../Data/Collectible information and location.csv', index=False)

# Enemies

In [37]:
enemies = 'https://game8.co/games/Last-of-Us-2/archives/290294' #enemies page
response = requests.get(enemies)
response

<Response [200]>

In [38]:
enemies_soup = BeautifulSoup(response.text, 'html.parser')

In [39]:
def extract_text_from_paragraph(paragraph_element):
    if not paragraph_element:
        return 'No Description Available'
    
    parts = []
    for content in paragraph_element.contents:
        if isinstance(content, str):
            parts.append(content.strip())
        elif isinstance(content, Tag) and content.name not in ['img', 'a']:
            parts.append(content.get_text(strip=True))

    return " ".join(parts).strip()

In [40]:
def get_full_url(relative_url, base_url="https://game8.co"):
    if not relative_url:
        return None
    
    if relative_url.startswith('//'):
        return 'https:' + relative_url
    elif relative_url.startswith('/'):
        return base_url + relative_url
    else:
        return relative_url

In [41]:
def scrape_enemy_info(soup, enemy_id):
    enemy_tag = soup.find('h3', id=enemy_id)

    if not enemy_tag:
        print(f"Enemy with ID '{enemy_id}' not found.")
        return None

    enemy_name = enemy_tag.get_text(strip=True)
    first_paragraph = enemy_tag.find_next_sibling('p', class_='a-paragraph')
    second_paragraph = first_paragraph.find_next_sibling('p', class_='a-paragraph') if first_paragraph else None

    enemy_image_url = 'No Image Available'
    if first_paragraph:
        img_tag = first_paragraph.find('img')
        if img_tag:
            image_src = img_tag.get('data-src') or img_tag.get('src')
            enemy_image_url = get_full_url(image_src)

    description = extract_text_from_paragraph(first_paragraph)

    how_to_kill_url = 'No URL Available'
    if second_paragraph:
        link_tag = second_paragraph.find('a', class_='a-btn')
        if link_tag:
            how_to_kill_url = get_full_url(link_tag.get('href'))
    
    return {
        'Enemy Name': enemy_name,
        'Enemy Image URL': enemy_image_url,
        'Description': description,
        'How to Kill URL': how_to_kill_url
    }

In [44]:
target_enemy_ids = ['hm_1', 'hm_2', 'hm_3', 'hm_4', 'hm_5', 'hm_6', 'hm_7', 'hm_8', 'hm_9', 'hm_10']
all_enemy_data = []

for enemy_id in target_enemy_ids:
    enemy_info = scrape_enemy_info(enemies_soup, enemy_id)
    if enemy_info:
        all_enemy_data.append(enemy_info)

In [46]:
enemy_df = pd.DataFrame(all_enemy_data)
enemy_df.head()

Unnamed: 0,Enemy Name,Enemy Image URL,Description,How to Kill URL
0,Runners,https://img.game8.co/3252326/b28115a05f1fe7505...,Runners are the most common enemy in The Last ...,https://game8.co/games/Last-of-Us-2/archives/2...
1,Stalkers,https://img.game8.co/3252515/e72f76b3a6874dec3...,Stalkers are the next stage of infected after ...,https://game8.co/games/Last-of-Us-2/archives/2...
2,Clickers,https://img.game8.co/3252055/472aa08fcbf0ce14b...,"Clickers are completely blind, thus there is n...",https://game8.co/games/Last-of-Us-2/archives/2...
3,Bloaters,https://img.game8.co/3252524/439c652f79f7543be...,"Bloaters are a late stage of infected, are ext...",https://game8.co/games/Last-of-Us-2/archives/2...
4,Shamblers,https://img.game8.co/3252325/bedcc8bb79b5be7c4...,The shambler is a new type of powerful infecte...,https://game8.co/games/Last-of-Us-2/archives/2...


In [47]:
enemy_df.to_csv('../Data/Enemies.csv', index=False)

## How to kill each enemy

In [48]:
def fetch_webpage_content(url):
    if not url or url == 'No URL Available':
        print("Skipping due to missing URL.")
        return None
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

In [49]:
def extract_guide_details(soup):
    guide_info = {}
    current_section_title = None
    current_section_content = []

    content_elements = soup.select('h2.a-header--2, h3.a-header--3, p.a-paragraph, img')

    for element in content_elements:
        if element.name in ['h2', 'h3']:
            if current_section_title and current_section_content:
                guide_info[current_section_title] = current_section_content
            current_section_title = element.get_text(strip=True)
            current_section_content = []
        
        elif element.name == 'p' and 'a-paragraph' in element.get('class', []):
            if current_section_title:
                paragraph_text = "".join(content.get_text(strip=True) for content in element.contents if isinstance(content, str) or content.name in ['span', 'b', 'a'])
                if paragraph_text:
                    current_section_content.append({'type': 'paragraph', 'content': paragraph_text})

        elif element.name == 'img':
            if current_section_title:
                img_url = element.get('data-src') or element.get('src')
                if img_url:
                    if img_url.startswith('//'):
                        img_url = 'https:' + img_url
                    elif img_url.startswith('/'):
                        img_url = 'https://game8.co' + img_url
                    current_section_content.append({'type': 'image', 'url': img_url})

    if current_section_title and current_section_content:
        guide_info[current_section_title] = current_section_content

    return guide_info

In [50]:
def clean_guide_data(guide_data):
    unwanted_sections = ['Comment', 'Author', 'Last of Us 2 Enemies']
    for section in unwanted_sections:
        guide_data.pop(section, None)
    return guide_data

In [54]:
how_to_kill_details = []

for _, enemy_info in enemy_df.iterrows():
    enemy_name = enemy_info['Enemy Name'] 
    guide_url = enemy_info['How to Kill URL'] 

    guide_soup = fetch_webpage_content(guide_url)

    if guide_soup:
        raw_guide_data = extract_guide_details(guide_soup)
        cleaned_guide_data = clean_guide_data(raw_guide_data)

        how_to_kill_details.append({
            'Enemy Name': enemy_name, 
            'How to Kill Guide': cleaned_guide_data
        })
    else:
        how_to_kill_details.append({
            'Enemy Name': enemy_name,
            'How to Kill Guide': 'No Guide Available'
        })

In [56]:
df_how_to_kill = pd.DataFrame(how_to_kill_details)
merged_enemy_df = pd.merge(enemy_df, df_how_to_kill, on='Enemy Name', how='left')
merged_enemy_df.head()

Unnamed: 0,Enemy Name,Enemy Image URL,Description,How to Kill URL,How to Kill Guide
0,Runners,https://img.game8.co/3252326/b28115a05f1fe7505...,Runners are the most common enemy in The Last ...,https://game8.co/games/Last-of-Us-2/archives/2...,"{'How to Kill Runners': [{'type': 'paragraph',..."
1,Stalkers,https://img.game8.co/3252515/e72f76b3a6874dec3...,Stalkers are the next stage of infected after ...,https://game8.co/games/Last-of-Us-2/archives/2...,{'How to Kill Stalkers': [{'type': 'paragraph'...
2,Clickers,https://img.game8.co/3252055/472aa08fcbf0ce14b...,"Clickers are completely blind, thus there is n...",https://game8.co/games/Last-of-Us-2/archives/2...,{'How to Kill Clickers': [{'type': 'paragraph'...
3,Bloaters,https://img.game8.co/3252524/439c652f79f7543be...,"Bloaters are a late stage of infected, are ext...",https://game8.co/games/Last-of-Us-2/archives/2...,{'How to Kill Bloaters': [{'type': 'paragraph'...
4,Shamblers,https://img.game8.co/3252325/bedcc8bb79b5be7c4...,The shambler is a new type of powerful infecte...,https://game8.co/games/Last-of-Us-2/archives/2...,{'How to Kill Shamblers': [{'type': 'paragraph...


In [57]:
merged_enemy_df.to_csv('../Data/Enemies.csv', index=False)

# Weapon

In [168]:
weapons = 'https://game8.co/games/Last-of-Us-2/archives/290291' #weapons page
response = requests.get(weapons)
response

<Response [200]>

In [169]:
weapons_soup = BeautifulSoup(response.text, 'html.parser')

In [170]:
def get_absolute_url(base_url, relative_url):
    if not relative_url:
        return ''
    if relative_url.startswith('//'):
        return f'https:{relative_url}'
    elif relative_url.startswith('/'):
        return f'{base_url}{relative_url}'
    return relative_url

In [171]:
def parse_weapon_details(cell, headers):
    row_data = {}
    for i, header in enumerate(headers):
        cell_data = cell.find_all('td')[i]
        cell_text = cell_data.get_text(strip=True)

        if header == 'Weapon':
            weapon_link_tag = cell_data.find('a', class_='a-link')
            if weapon_link_tag:
                row_data['Weapon Name'] = weapon_link_tag.get_text(strip=True)
                href = weapon_link_tag.get('href')
                row_data['Weapon Link'] = get_absolute_url('https://game8.co', href)

            icon_img_tag = cell_data.find('img')
            if icon_img_tag:
                img_url = icon_img_tag.get('data-src') or icon_img_tag.get('src')
                row_data['Weapon Icon URL'] = get_absolute_url('https://game8.co', img_url)
        else:
            row_data[header] = cell_text
    return row_data

In [172]:
def scrape_weapon_category(soup, header_id):
    header_tag = soup.find('h3', id=header_id, class_='a-header--3')
    if not header_tag:
        print(f"Header with id='{header_id}' not found. Skipping.")
        return None, None

    category_name = header_tag.get_text(strip=True)
    weapon_table = header_tag.find_next_sibling('table', class_='a-table')

    if not weapon_table:
        print(f"No table found for category '{category_name}'. Skipping.")
        return category_name, []

    table_data = []
    header_row = weapon_table.find('tr')
    if header_row:
        headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
        data_rows = weapon_table.find_all('tr')[1:]

        for row in data_rows:
            cells = row.find_all('td')
            if len(headers) == len(cells):
                weapon_details = parse_weapon_details(row, headers)
                if weapon_details:
                    table_data.append(weapon_details)
            else:
                print(f"Skipping row due to mismatched headers/cells in category '{category_name}'.")

    return category_name, table_data

In [173]:
weapon_categories_data = {}
target_header_ids = ['hm_1', 'hm_2', 'hm_3', 'hm_4', 'hm_5']

for header_id in target_header_ids:
    category, weapons_list = scrape_weapon_category(weapons_soup, header_id)
    if category and weapons_list:
        weapon_categories_data[category] = weapons_list

In [174]:
flattened_weapon_list = []
for category, weapons in weapon_categories_data.items():
    for weapon in weapons:
        flattened_weapon_list.append({
            'Weapon Category': category,
            'Weapon Name': weapon.get('Weapon Name', ''),
            'Weapon Link': weapon.get('Weapon Link', ''),
            'Weapon Icon URL': weapon.get('Weapon Icon URL', ''),
            'Description': weapon.get('Description', ''),
            'Location': weapon.get('Location', '')
        })

In [175]:
weapons_df = pd.DataFrame(flattened_weapon_list)
weapons_df.head()

Unnamed: 0,Weapon Category,Weapon Name,Weapon Link,Weapon Icon URL,Description,Location
0,Handguns,Semi-Auto Pistol,https://game8.co/games/Last-of-Us-2/archives/2...,https://img.game8.co/3252180/ec9f8dd1d23c44108...,Short-range handgun with high fire rate and mo...,Obtained by default in Jackson - Waking Up.
1,Handguns,Revolver,https://game8.co/games/Last-of-Us-2/archives/2...,https://img.game8.co/3252178/aa7459e5b5f413b88...,Mid-range revolver with moderate damage and sl...,Get from the box upstairs in Jackson - Packing...
2,Handguns,Military Pistol,https://game8.co/games/Last-of-Us-2/archives/2...,https://img.game8.co/3252528/85a7ba460f685c0ad...,Short-range handgun with high fire rate and mo...,Obtain automatically in Seattle Day 1 - The St...
3,Handguns,Hunting Pistol,https://game8.co/games/Last-of-Us-2/archives/2...,https://img.game8.co/3252556/81bdd2a352fe591a5...,Long-range handgun with high damage and slow r...,Seattle Day 1 - On Foot (Safe)
4,Long Guns,Bolt-Action Rifle,https://game8.co/games/Last-of-Us-2/archives/2...,https://img.game8.co/3252179/a9a804ab28bafba79...,Long-range rifle with high damage and slow fir...,Obtained by default in Jackson - Waking Up.


In [176]:
weapons_df.to_csv('../Data/Weapons.csv', index=False)

## Full weapon information

In [86]:
def extract_table_data(table_soup):
    if not table_soup:
        return []
    
    table_data = []
    header_row = table_soup.find('tr')
    if header_row:
        headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
        data_rows = table_soup.find_all('tr')[1:]
        
        for row in data_rows:
            cells = row.find_all('td')
            if len(headers) == len(cells):
                row_data = {
                    headers[i]: "".join(
                        content.get_text(strip=True) for content in cells[i].contents
                        if hasattr(content, 'get_text')
                    )
                    for i in range(len(headers))
                }
                table_data.append(row_data)
                
    return table_data

In [87]:
def extract_paragraphs(start_heading_soup):
    if not start_heading_soup:
        return 'Information not found.'
    
    paragraphs = []
    current_sibling = start_heading_soup.find_next_sibling()
    
    while current_sibling and current_sibling.name not in ['h2', 'h3']:
        if current_sibling.name == 'p' and 'a-paragraph' in current_sibling.get('class', []):
            paragraphs.append(current_sibling.get_text(strip=True))
        current_sibling = current_sibling.find_next_sibling()
        
    return "\n".join(paragraphs) if paragraphs else 'Information not found.'

In [88]:
def scrape_weapon_details(weapon_name, weapon_url):
    weapon_details = {
        'Weapon Name': weapon_name,
        'Weapon URL': weapon_url,
        'Basic Information': 'N/A',
        'Best Upgrades': 'No upgrades for this weapon',
        'How to use': 'N/A',
        'How to get': 'N/A'
    }

    try:
        response = requests.get(weapon_url)
        response.raise_for_status()
        weapon_soup = BeautifulSoup(response.text, 'html.parser')

        basic_info_heading = weapon_soup.find('h2', id='hl_1', class_='a-header--2')
        if basic_info_heading:
            basic_info_table = basic_info_heading.find_next_sibling('table', class_='a-table')
            if basic_info_table:
                weapon_details['Basic Information'] = extract_table_data(basic_info_table)
            else:
                weapon_details['Basic Information'] = {"paragraphs": extract_paragraphs(basic_info_heading)}

        best_upgrades_heading = weapon_soup.find('h2', id='hl_2', class_='a-header--2')
        if best_upgrades_heading:
            best_upgrades_table = best_upgrades_heading.find_next_sibling('table', class_='a-table')
            if best_upgrades_table:
                upgrades_data = extract_table_data(best_upgrades_table)
                if upgrades_data:
                    weapon_details['Best Upgrades'] = upgrades_data
            else:
                upgrades_paragraphs = extract_paragraphs(best_upgrades_heading)
                if upgrades_paragraphs != 'Information not found.':
                    weapon_details['Best Upgrades'] = {"paragraphs": upgrades_paragraphs}
        
        how_to_use_heading = weapon_soup.find('h2', text=re.compile(r'How to Use', re.IGNORECASE), class_='a-header--2')
        if not how_to_use_heading:
            how_to_use_heading = weapon_soup.find('h2', id='hl_2', class_='a-header--2')
        weapon_details['How to use'] = extract_paragraphs(how_to_use_heading)
            
        how_to_get_heading = weapon_soup.find('h2', id=re.compile(r'hl_3|hl_4'), class_='a-header--2')
        weapon_details['How to get'] = extract_paragraphs(how_to_get_heading)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {weapon_url}: {e}")
        error_message = f"Error fetching data: {e}"
        weapon_details.update({
            'Basic Information': error_message,
            'Best Upgrades': error_message,
            'How to use': error_message,
            'How to get': error_message
        })
    except Exception as e:
        print(f"An unexpected error occurred while processing {weapon_url}: {e}")
        error_message = f"An unexpected error occurred: {e}"
        weapon_details.update({
            'Basic Information': error_message,
            'Best Upgrades': error_message,
            'How to use': error_message,
            'How to get': error_message
        })
        
    return weapon_details

In [89]:
scraped_weapon_details = []

for index, row in weapons_df.iterrows():
    weapon_name = row['Weapon Name']
    weapon_url = row['Weapon Link']
    
    if weapon_name == 'Switchblade':
        weapon_url = 'https://game8.co/games/Last-of-Us-2/archives/291376'
    
    if not weapon_url or not weapon_url.startswith('http'):
        print(f"Skipping {weapon_name} due to missing or invalid URL: {weapon_url}")
        scraped_weapon_details.append({
            'Weapon Name': weapon_name,
            'Weapon URL': weapon_url,
            'Basic Information': 'N/A', 
            'Best Upgrades': 'No upgrades for this weapon',
            'How to use': 'N/A',
            'How to get': 'N/A'
        })
        continue
    
    details = scrape_weapon_details(weapon_name, weapon_url)
    scraped_weapon_details.append(details)

  how_to_use_heading = weapon_soup.find('h2', text=re.compile(r'How to Use', re.IGNORECASE), class_='a-header--2')


In [90]:
full_weapons_df = pd.DataFrame(scraped_weapon_details)
full_weapons_df.head()

Unnamed: 0,Weapon Name,Weapon URL,Basic Information,Best Upgrades,How to use,How to get
0,Semi-Auto Pistol,https://game8.co/games/Last-of-Us-2/archives/2...,[{'Weapon': 'Short-range handgun with high fir...,"[{'Name': 'Fire Rate', 'Parts': '40', 'Effect'...",The semi-auto pistol is the most basic gun in ...,The semi-auto pistol is the most basic gun in ...
1,Revolver,https://game8.co/games/Last-of-Us-2/archives/2...,[{'Weapon': 'Mid-range revolver with moderate ...,"[{'Name': 'Fire Rate', 'Parts': '30', 'Effect'...",While headshots are important for any gun you ...,While headshots are important for any gun you ...
2,Military Pistol,https://game8.co/games/Last-of-Us-2/archives/2...,[{'Weapon': 'Short-range handgun with high fir...,"[{'Name': 'Fire Rate', 'Parts': '40', 'Effect'...",The military pistol is pretty much the same as...,The military pistol is pretty much the same as...
3,Hunting Pistol,https://game8.co/games/Last-of-Us-2/archives/2...,[{'Weapon': 'Long-range handgun with high dama...,"[{'Name': 'Stability', 'Parts': '50', 'Effect'...","Similar to theRevolver, the Hunting Pistol has...","Similar to theRevolver, the Hunting Pistol has..."
4,Bolt-Action Rifle,https://game8.co/games/Last-of-Us-2/archives/2...,[{'Weapon': 'Long-range rifle with high damage...,"[{'Name': 'Stability', 'Parts': '50', 'Effect'...",The rifle is a high damaging but very slow wea...,The rifle is a high damaging but very slow wea...


In [91]:
full_weapons_df.to_csv('../Data/Weapons.csv', index=False)

# Safecode

In [92]:
safe_codes = 'https://game8.co/games/Last-of-Us-2/archives/290690' #safecode page
response = requests.get(safe_codes)
response

<Response [200]>

In [93]:
safecode_soup = BeautifulSoup(response.text, 'html.parser')

In [94]:
safe_codes_data = {
    'Chapter': [
        'Jackson - Patrol', 'Seattle Day 1 - Downtown', 'Seattle Day 1 - Downtown',
        'Seattle Day 1 - Downtown', 'Seattle Day 1 - Capitol Hill', 'Seattle Day 1 - Tunnels',
        'Seattle Day 2 - Hillcrest', 'Seattle Day 2 - The Seraphites', 'Seattle Day 2 - The Seraphites',
        'Seattle Day 3 - The Flooded City', 'Seattle Day 1 - On Foot', 'Seattle Day 1 - Hostile Territory',
        'Seattle Day 1 - The Coast', 'Seattle Day 2 - The Shortcut', 'Seattle Day 2 - The Descent'
    ],
    'Location': [
        'Super Market', 'Bank Vault', 'Courthouse', 'West Gate 2', 'Thrift Store',
        'Locker Room', 'Auto Repair Shop', 'Apartment', 'Weston\'s Pharmacy', 'First Gate',
        'Big Win Safe', 'Jasmine Bakery', 'Boat Control Room', 'Apartment Bedroom', 'Across From Gym'
    ],
    'Combination/Code': [
        '07-20-13', '60-23-06', '86-07-22', '04-51', '55-01-33', '15243', '30-82-65',
        '10-08-83', '38-55-23', '70-12-64', '17-38-07', '68-96-89', '90-77-01',
        '30-23-04', '12-18-79'
    ]
}


safe_codes_df = pd.DataFrame(safe_codes_data)
safe_codes_df.head()

Unnamed: 0,Chapter,Location,Combination/Code
0,Jackson - Patrol,Super Market,07-20-13
1,Seattle Day 1 - Downtown,Bank Vault,60-23-06
2,Seattle Day 1 - Downtown,Courthouse,86-07-22
3,Seattle Day 1 - Downtown,West Gate 2,04-51
4,Seattle Day 1 - Capitol Hill,Thrift Store,55-01-33


In [95]:
safe_codes_df.to_csv('../Data/Safecode.csv', index=False)

## Solving the safecode

In [108]:
safe_codes_locations = {
  'Super Market' : 'https://game8.co/games/Last-of-Us-2/archives/290864',
  'Bank Vault' : 'https://game8.co/games/Last-of-Us-2/archives/290681',
  'Courthouse' : 'https://game8.co/games/Last-of-Us-2/archives/290860',
  'West Gate 2': 'https://game8.co/games/Last-of-Us-2/archives/290844',
  'Thrift Store': 'https://game8.co/games/Last-of-Us-2/archives/290825',
  'Locker Room': 'https://game8.co/games/Last-of-Us-2/archives/290676',
  'Auto Repair Shop': 'https://game8.co/games/Last-of-Us-2/archives/290837',
  'Apartment': 'https://game8.co/games/Last-of-Us-2/archives/290854',
  'Weston\'s Pharmacy': 'https://game8.co/games/Last-of-Us-2/archives/290917',
  'First Gate': 'https://game8.co/games/Last-of-Us-2/archives/290923',
  'Big Win Safe': 'https://game8.co/games/Last-of-Us-2/archives/291118',
  'Jasmine Bakery': 'https://game8.co/games/Last-of-Us-2/archives/291124',
  'Boat Control Room': 'https://game8.co/games/Last-of-Us-2/archives/291129',
  'Apartment Bedroom': 'https://game8.co/games/Last-of-Us-2/archives/291135',
  'Across From Gym': 'https://game8.co/games/Last-of-Us-2/archives/291158'
}

In [109]:
def format_image_url(url):
    if url.startswith('//'):
        return f'https:{url}'
    elif url.startswith('/'):
        return f'https://game8.co{url}'
    return url

In [110]:
def extract_text_from_element(element):
    parts = []
    for content in element.contents:
        if isinstance(content, str):
            parts.append(content.strip())
        elif hasattr(content, 'name'):
            parts.append(content.get_text(strip=True))
    return " ".join(parts).strip()

In [111]:
def scrape_archive_content(soup):
    paragraphs = []
    images = []
    
    archive_wrapper = soup.find('div', class_='archive-style-wrapper')
    if not archive_wrapper:
        return paragraphs, images

    for element in archive_wrapper.contents:
        if element.name == 'h3' and element.get('id') == 'hm_3':
            break

        if element.name == 'p' and 'a-paragraph' in element.get('class', []):
            paragraphs.append(extract_text_from_element(element))
            
            img_tags = element.find_all('img')
            for img in img_tags:
                img_url = img.get('data-src') or img.get('src')
                if img_url:
                    images.append(format_image_url(img_url))

        elif element.name == 'img':
            img_url = element.get('data-src') or element.get('src')
            if img_url:
                images.append(format_image_url(img_url))

    return paragraphs, images

In [112]:
def scrape_step_by_step_instructions(soup):
    steps = []
    h3_tags = soup.find_all('h3', class_='a-header--3')
    
    for h3 in h3_tags:
        if h3.get('id') not in ['hm_1', 'hm_3'] and not steps:
            continue
            
        step_content = {
            'heading': h3.get_text(strip=True), 
            'paragraphs': [], 
            'images': []
        }
        
        next_sibling = h3.next_sibling
        while next_sibling:
            if next_sibling.name in ['h2', 'h3']:
                break
                
            if hasattr(next_sibling, 'name') and next_sibling.name == 'p' and 'a-paragraph' in next_sibling.get('class', []):
                step_content['paragraphs'].append(extract_text_from_element(next_sibling))
                
                img_tags = next_sibling.find_all('img')
                for img in img_tags:
                    img_url = img.get('data-src') or img.get('src')
                    if img_url:
                        step_content['images'].append(format_image_url(img_url))
            
            next_sibling = next_sibling.next_sibling
            
        steps.append(step_content)
        
    return steps

In [113]:
def scrape_headings(soup):
    headings = {}
    h2_hl1 = soup.find('h2', id='hl_1', class_='a-header--2')
    if h2_hl1:
        headings['hl_1'] = h2_hl1.get_text(strip=True)

    h2_hl2 = soup.find('h2', id='hl_2', class_='a-header--2')
    if h2_hl2:
        headings['hl_2'] = h2_hl2.get_text(strip=True)
        
    return headings

In [114]:
def process_location(location_name, url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        paragraphs, images = scrape_archive_content(soup)
        steps_to_safe = scrape_step_by_step_instructions(soup)
        headings = scrape_headings(soup)

        return {
            'Location': location_name,
            'URL': url,
            'Images': images,
            'Paragraphs': paragraphs,
            'Headings': headings,
            'Steps to Safe': steps_to_safe
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return {
            'Location': location_name,
            'URL': url,
            'Error': f"Error fetching data: {e}"
        }
    except Exception as e:
        print(f"An unexpected error occurred while processing {url}: {e}")
        return {
            'Location': location_name,
            'URL': url,
            'Error': f"An unexpected error occurred: {e}"
        }

In [115]:
safe_location_data = []


for location_name, url in safe_codes_locations.items():
    scraped_data = process_location(location_name, url)
    safe_location_data.append(scraped_data)

In [116]:
safe_locations_df = pd.DataFrame(safe_location_data)
safe_locations_df.head()

Unnamed: 0,Location,URL,Images,Paragraphs,Headings,Steps to Safe
0,Super Market,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252183/b8d35c36c9326013...,"[, This guide will show you how to open the su...","{'hl_1': 'Supermarket Safe Location', 'hl_2': ...","[{'heading': 'Finding the Code', 'paragraphs':..."
1,Bank Vault,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252020/ffaf25313126fdb4...,"[, Downtown Seattle in The Last of Us 2 is hom...","{'hl_1': 'Bank Vault Location', 'hl_2': 'Bank ...","[{'heading': 'Finding the Bank', 'paragraphs':..."
2,Courthouse,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252906/4c524880161ae198...,"[, This guide will show you how to unlock the ...","{'hl_1': 'Courthouse Safe Location', 'hl_2': '...","[{'heading': 'Finding the Code', 'paragraphs':..."
3,West Gate 2,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252837/e9638d1add0564b8...,[This is a guide to the Checkpoint Gate Codes ...,"{'hl_1': 'Downtown Seattle Gate Codes', 'hl_2'...","[{'heading': 'Where to Find the Gate Codes', '..."
4,Thrift Store,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252221/3f924449f3a3c12b...,"[, This guide will show you how to unlock the ...","{'hl_1': 'Thrift Store Safe Location', 'hl_2':...","[{'heading': 'Hint 1', 'paragraphs': ['From th..."


In [117]:
merged_safe_data_df = pd.merge(safe_codes_df, safe_locations_df, on='Location', how='left')
merged_safe_data_df.head()

Unnamed: 0,Chapter,Location,Combination/Code,URL,Images,Paragraphs,Headings,Steps to Safe
0,Jackson - Patrol,Super Market,07-20-13,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252183/b8d35c36c9326013...,"[, This guide will show you how to open the su...","{'hl_1': 'Supermarket Safe Location', 'hl_2': ...","[{'heading': 'Finding the Code', 'paragraphs':..."
1,Seattle Day 1 - Downtown,Bank Vault,60-23-06,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252020/ffaf25313126fdb4...,"[, Downtown Seattle in The Last of Us 2 is hom...","{'hl_1': 'Bank Vault Location', 'hl_2': 'Bank ...","[{'heading': 'Finding the Bank', 'paragraphs':..."
2,Seattle Day 1 - Downtown,Courthouse,86-07-22,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252906/4c524880161ae198...,"[, This guide will show you how to unlock the ...","{'hl_1': 'Courthouse Safe Location', 'hl_2': '...","[{'heading': 'Finding the Code', 'paragraphs':..."
3,Seattle Day 1 - Downtown,West Gate 2,04-51,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252837/e9638d1add0564b8...,[This is a guide to the Checkpoint Gate Codes ...,"{'hl_1': 'Downtown Seattle Gate Codes', 'hl_2'...","[{'heading': 'Where to Find the Gate Codes', '..."
4,Seattle Day 1 - Capitol Hill,Thrift Store,55-01-33,https://game8.co/games/Last-of-Us-2/archives/2...,[https://img.game8.co/3252221/3f924449f3a3c12b...,"[, This guide will show you how to unlock the ...","{'hl_1': 'Thrift Store Safe Location', 'hl_2':...","[{'heading': 'Hint 1', 'paragraphs': ['From th..."


In [118]:
merged_safe_data_df.to_csv('../Data/Safecode.csv', index=False)

# Trophies

In [125]:
trophy = 'https://game8.co/games/Last-of-Us-2/archives/290658' #trophy page
response = requests.get(trophy)
response  

<Response [200]>

In [126]:
trophy_soup = BeautifulSoup(response.text, 'html.parser')

In [127]:
def find_trophy_section(soup, h3_id):
    header_tag = soup.find('h3', id=h3_id, class_='a-header--3')
    if header_tag:
        trophy_table = header_tag.find_next_sibling('table', class_='a-table')
        return header_tag, trophy_table
    return None, None

In [128]:
def parse_trophy_table(trophy_table):
    trophy_entries = []
    if not trophy_table:
        return trophy_entries

    rows = trophy_table.find_all('tr')
    current_trophy = {}

    for row in rows:
        img_tag = row.find('img')
        if img_tag:
            image_url = img_tag.get('data-src') or img_tag.get('src')
            if image_url:
                if image_url.startswith('//'):
                    image_url = 'https:' + image_url
                elif image_url.startswith('/'):
                    image_url = 'https://game8.co' + image_url
                current_trophy['image'] = image_url

        title_tag = row.find('th')
        if title_tag:
            current_trophy['title'] = title_tag.get_text(strip=True)

        description_td = next((td for td in row.find_all('td') if not td.find('img')), None)
        if description_td:
            description_parts = []
            for content in description_td.contents:
                if content and isinstance(content, str):
                    description_parts.append(content.strip())
                elif content and hasattr(content, 'get_text'):
                    description_parts.append(content.get_text(strip=True))

            current_trophy['description'] = " ".join(description_parts).strip()

        if 'title' in current_trophy and 'description' in current_trophy:
            trophy_entries.append(current_trophy)
            current_trophy = {}

    return trophy_entries


In [129]:
def extract_trophy_data(soup, section_ids):
    all_trophy_details = {}
    for section_id in section_ids:
        header, trophy_table = find_trophy_section(soup, section_id)
        if header and trophy_table:
            category_name = header.get_text(strip=True)
            trophy_list = parse_trophy_table(trophy_table)
            all_trophy_details[category_name] = trophy_list
        else:
            print(f"Section with ID '{section_id}' not found or has no table.")
    return all_trophy_details

In [130]:
def prepare_data_for_csv(trophy_details):
    csv_rows = []
    for category, trophies in trophy_details.items():
        for trophy in trophies:
            csv_rows.append({
                'Trophy Category': category,
                'Image URL': trophy.get('image', ''),
                'Title': trophy.get('title', ''),
                'Description': trophy.get('description', '')
            })
    return csv_rows

In [134]:
target_section_ids = ['hm_1', 'hm_2', 'hm_3', 'hm_4', 'hm_5']

if trophy_soup:
    trophy_data = extract_trophy_data(trophy_soup, target_section_ids)
    print("Extracted Data:", trophy_data)
else:
    print("Could not retrieve webpage content. Exiting.")

Extracted Data: {'Platinum Trophy Guide': [{'image': 'https://img.game8.co/3243707/9785cef1a5b2d639ec003eeb023eb44c.png/show', 'title': 'Every Last One of Them', 'description': 'Collect all trophies'}], 'Gold Trophies Guide': [{'image': 'https://img.game8.co/3243708/0677172e17d90a5926d0a1c6350ffbe6.png/show', 'title': 'What I Had to Do', 'description': 'Complete the story'}, {'image': 'https://img.game8.co/3243708/0677172e17d90a5926d0a1c6350ffbe6.png/show', 'title': 'Survival Expert', 'description': 'Learn all player upgrades'}, {'image': 'https://img.game8.co/3243708/0677172e17d90a5926d0a1c6350ffbe6.png/show', 'title': 'Arms Master', 'description': 'Fully upgrade all weapons'}, {'image': 'https://img.game8.co/3243708/0677172e17d90a5926d0a1c6350ffbe6.png/show', 'title': 'Archivist', 'description': 'Find all artifacts and journal entries'}, {'image': 'https://img.game8.co/3243708/0677172e17d90a5926d0a1c6350ffbe6.png/show', 'title': 'Master Set', 'description': 'Find all trading cards'},

In [137]:
csv_data = prepare_data_for_csv(trophy_data)
trophy_df = pd.DataFrame(csv_data)
trophy_df.head()

Unnamed: 0,Trophy Category,Image URL,Title,Description
0,Platinum Trophy Guide,https://img.game8.co/3243707/9785cef1a5b2d639e...,Every Last One of Them,Collect all trophies
1,Gold Trophies Guide,https://img.game8.co/3243708/0677172e17d90a592...,What I Had to Do,Complete the story
2,Gold Trophies Guide,https://img.game8.co/3243708/0677172e17d90a592...,Survival Expert,Learn all player upgrades
3,Gold Trophies Guide,https://img.game8.co/3243708/0677172e17d90a592...,Arms Master,Fully upgrade all weapons
4,Gold Trophies Guide,https://img.game8.co/3243708/0677172e17d90a592...,Archivist,Find all artifacts and journal entries


In [138]:
trophy_df.to_csv('../Data/Trophies.csv', index=False)

# Characters

In [149]:
character = 'https://game8.co/games/Last-of-Us-2/archives/290477' #character page
response = requests.get(character)
response

<Response [200]>

In [150]:
character_soup = BeautifulSoup(response.text, 'html.parser')

In [151]:
target_names = [
    'Ellie Icon.png',
    'Dina Icon.png',
    'Joel Icon.png',
    'Jesse Icon.png',
    'Tommy Icon.png',
    'Maria Icon.png',
    'Seth Icon.png',
    'Abby Icon.png',
    'Owen Icon.png',
    'Mel Icon.png',
    'Nora Icon.png',
    'Manny Icon.png',
    'Jordan Icon.png',
    'Isaac Icon.png',
    'Alice Icon.png',
    'Jerry Icon.png',
    'Whitney Icon.png',
    'Nick Icon.png',
    'Lev Icon.png',
    'Yara Icon.png',
    'Emily Icon.png'

]


filtered_images = get_filtered_image_links(character_soup, target_names)

character_images = {}
for image in filtered_images:
    for character_name_with_icon in target_names:
        if character_name_with_icon.lower() in image['alt']:
            character_name = character_name_with_icon.replace(' Icon.png', '').strip()
            character_images[character_name] = image['url']
            break


print(character_images)

{'Ellie': 'https://img.game8.co/3253990/a38ae26121b6dd4dd9b02bc0927eee71.png/show', 'Dina': 'https://img.game8.co/3253995/6a10facf9fdc19b06b66ba6105ec3127.png/show', 'Joel': 'https://img.game8.co/3253992/2dca693d4b0e9c6b1116513760f536d5.png/show', 'Jesse': 'https://img.game8.co/3253997/53633ae7d796ae3ad6a17ac3c6d56b04.png/show', 'Tommy': 'https://img.game8.co/3253998/6cbe56fb66b303818741c5604f55c4a8.png/show', 'Maria': 'https://img.game8.co/3253999/fdb7b56e1a98b135f7c3ff258536c08b.png/show', 'Seth': 'https://img.game8.co/3254000/2c0e33c88eb0cbc41e3f1d3847e29932.png/show', 'Abby': 'https://img.game8.co/3254001/77d11f17280b8bcf5ee16bbbe64c3a92.png/show', 'Owen': 'https://img.game8.co/3254006/e515ec83b6df00155171912e605f14c5.png/show', 'Mel': 'https://img.game8.co/3254005/e736c8192c7100674f18acbde78b99ba.png/show', 'Nora': 'https://img.game8.co/3254007/8894baf742d1e32bce1f36b8233de319.png/show', 'Manny': 'https://img.game8.co/3254008/7e283f44c3b115378936fe417fd45ff4.png/show', 'Jordan': '

In [152]:
character_main_image = {'Ellie' : 'https://img.game8.co/3253674/d94845ae35742fb7aeab666c545ce17c.jpeg/show',
                          'Joel' : 'https://img.game8.co/3253664/103b20839a9d395c4391a3465867cdcb.jpeg/show',
                          'Dina' : 'https://img.game8.co/3253702/457134b322ef240fbf357b4f3e2022bb.jpeg/show',
                          'Abby' : 'https://img.game8.co/3253680/e37a8d8fc6ff287dbf28926e004aebd3.jpeg/show',
                          'Yara' : 'https://img.game8.co/3253753/a53e019404ad3d253ca442e87e2b5270.jpeg/show',
                          'Nora' : 'https://img.game8.co/3253701/527bfd2d36d8b19d5dd586499b79b98f.jpeg/show',
                          'Lev' : 'https://img.game8.co/3253754/95629e336e9e7bc1a7f9ffd871a12089.jpeg/show',
                          'Emily' : 'https://img.game8.co/3253750/b64de7fa1386b449733d20ec2343fc69.jpeg/show',
                          'Jesse' : 'https://img.game8.co/3253666/cd8a49d1e146c09662f466f1417b8063.jpeg/show',
                          'Tommy' : 'https://img.game8.co/3253667/5f6bdd9e98558a4b3a8d18fd39c8d45a.jpeg/show',
                          'Seth' : 'https://img.game8.co/3253700/442701af05e3748b9ccedc8a793a0000.jpeg/show',
                          'Mel' : 'https://img.game8.co/3253673/43e4ae0186bd6083dc0b0f1bda3d6fae.jpeg/show',
                          'Owen' : 'https://img.game8.co/3253668/37cb30d83159273a09bcd4c3eac43dc1.jpeg/show',
                          'Manny' : 'https://img.game8.co/3253714/59d1b2cada49306c655dc50b5690e78e.jpeg/show',
                          'Jordan' : 'https://img.game8.co/3253748/db9e05b070614c194c9d8e910f158883.jpeg/show',
                          'Nick' : 'https://img.game8.co/3253751/0cbbe465209bd878cc3c4fe7b134b961.jpeg/show',
                          'Alice' : 'https://img.game8.co/3253752/40349185b76d1d125192c59788f9e7a9.jpeg/show',
                          'Maria' : 'https://img.game8.co/3253871/34d5cb007beec80ed621a67594bcd565.jpeg/show',
                          'Jerry' : 'https://img.game8.co/3253888/433549ea350fab01b217e903199f165d.jpeg/show',
                          'Whitney' : 'https://img.game8.co/3253876/211eb1e5683a5d9ae17872049440fc81.jpeg/show',
                          'Isaac' : 'https://img.game8.co/3253872/66e409e5ea1a6c1649ac10146a013d70.jpeg/show'

}

In [153]:
character_profile_URLs = {'Ellie_page' : 'https://game8.co/games/Last-of-Us-2/archives/290451',
                          'Joel_page' : 'https://game8.co/games/Last-of-Us-2/archives/290454',
                          'Dina_page' : 'https://game8.co/games/Last-of-Us-2/archives/290477',
                          'Abby_page' : 'https://game8.co/games/Last-of-Us-2/archives/290497',
                          'Yara_page' : 'https://game8.co/games/Last-of-Us-2/archives/290493',
                          'Nora_page' : 'https://game8.co/games/Last-of-Us-2/archives/290491',
                          'Lev_page' : 'https://game8.co/games/Last-of-Us-2/archives/290495',
                          'Emily_page' : 'https://game8.co/games/Last-of-Us-2/archives/290496',
                          'Jesse_page' : 'https://game8.co/games/Last-of-Us-2/archives/290486',
                          'Tommy_page' : 'https://game8.co/games/Last-of-Us-2/archives/290458',
                          'Seth_page' : 'https://game8.co/games/Last-of-Us-2/archives/291199',
                          'Mel_page' : 'https://game8.co/games/Last-of-Us-2/archives/291200',
                          'Owen_page' : 'https://game8.co/games/Last-of-Us-2/archives/291205',
                          'Manny_page' : 'https://game8.co/games/Last-of-Us-2/archives/291206',
                          'Jordan_page' : 'https://game8.co/games/Last-of-Us-2/archives/291209',
                          'Nick_page' : 'https://game8.co/games/Last-of-Us-2/archives/291210',
                          'Alice_page' : 'https://game8.co/games/Last-of-Us-2/archives/291211',
                          'Maria_page' : 'https://game8.co/games/Last-of-Us-2/archives/291219',
                          'Jerry_page' : 'https://game8.co/games/Last-of-Us-2/archives/291220',
                          'Whitney_page' : 'https://game8.co/games/Last-of-Us-2/archives/291221',
                          'Isaac_page' : 'https://game8.co/games/Last-of-Us-2/archives/291222'

}

In [154]:
character_details_list = []

for character_name_with_suffix, profile_url in character_profile_URLs.items():
    try:
        response = requests.get(profile_url)
        response.raise_for_status()
        profile_soup = BeautifulSoup(response.text, 'html.parser')

        character_name = character_name_with_suffix.replace('_page', '').strip()
        character_info_dict = {}
        current_heading = None


        content_elements = profile_soup.select('h2.a-header--2, h3.a-header--3, p.a-paragraph, table.a-table')

        for element in content_elements:
            if element.name in ['h2', 'h3']:
                current_heading = element.get_text(strip=True)
                character_info_dict[current_heading] = [] 
            elif element.name == 'p' and 'a-paragraph' in element.get('class', []):
                if current_heading:
                    character_info_dict[current_heading].append(element.get_text(strip=True))
            elif element.name == 'table' and 'a-table' in element.get('class', []):
                 if current_heading:
                    table_data = []
                    header_row = element.find('tr')
                    if header_row:
                        headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
                        data_rows = element.find_all('tr')[1:]
                        for row in data_rows:
                            cells = [td.get_text(strip=True) for td in row.find_all('td')]
                            if len(headers) == len(cells):
                                table_data.append(dict(zip(headers, cells)))
                    if table_data:
                         character_info_dict[current_heading].append({'table_data': table_data})


        formatted_character_info = {}
        for heading, content_list in character_info_dict.items():
            paragraph_content = [item for item in content_list if not isinstance(item, dict) or 'table_data' not in item]
            table_content = [item for item in content_list if isinstance(item, dict) and 'table_data' in item]

            heading_data = {}
            if paragraph_content:
                heading_data['paragraphs'] = "\n".join(paragraph_content)
            if table_content:
                heading_data['tables'] = table_content

            if heading_data:
                formatted_character_info[heading] = heading_data


        icon_image_url = character_images.get(character_name, 'No Icon Image Available')
        main_image_url = character_main_image.get(character_name, 'No Main Image Available')


        character_details_list.append({
            'Character Name': character_name,
            'Character Icon URL': icon_image_url,
            'Character Main Image URL': main_image_url, 
            'Profile URL': profile_url,
            'Character Info': formatted_character_info 
        })

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {profile_url}: {e}")
        cleaned_name = character_name_with_suffix.replace('_page', '').strip()
        character_details_list.append({
            'Character Name': cleaned_name,
            'Character Icon URL': character_images.get(cleaned_name, 'No Icon Image Available'),
            'Character Main Image URL': character_main_image.get(cleaned_name, 'No Main Image Available'), 
            'Profile URL': profile_url,
            'Character Info': f"Error fetching data: {e}", 
        })
    except Exception as e:
        print(f"An unexpected error occurred while processing {profile_url}: {e}")
        cleaned_name = character_name_with_suffix.replace('_page', '').strip()
        character_details_list.append({
            'Character Name': cleaned_name,
            'Character Icon URL': character_images.get(cleaned_name, 'No Icon Image Available'),
            'Character Main Image URL': character_main_image.get(cleaned_name, 'No Main Image Available'), 
            'Profile URL': profile_url,
            'Character Info': f"An unexpected error occurred: {e}", 
        })

In [155]:
character_df = pd.DataFrame(character_details_list)
character_df.head()

Unnamed: 0,Character Name,Character Icon URL,Character Main Image URL,Profile URL,Character Info
0,Ellie,https://img.game8.co/3253990/a38ae26121b6dd4dd...,https://img.game8.co/3253674/d94845ae35742fb7a...,https://game8.co/games/Last-of-Us-2/archives/2...,{'Ellie: Character Information': {'paragraphs'...
1,Joel,https://img.game8.co/3253992/2dca693d4b0e9c6b1...,https://img.game8.co/3253664/103b20839a9d395c4...,https://game8.co/games/Last-of-Us-2/archives/2...,{'Joel: Character Information': {'paragraphs':...
2,Dina,https://img.game8.co/3253995/6a10facf9fdc19b06...,https://img.game8.co/3253702/457134b322ef240fb...,https://game8.co/games/Last-of-Us-2/archives/2...,{'Dina: Character Information': {'paragraphs':...
3,Abby,https://img.game8.co/3254001/77d11f17280b8bcf5...,https://img.game8.co/3253680/e37a8d8fc6ff287db...,https://game8.co/games/Last-of-Us-2/archives/2...,{'Abby: Character Information': {'paragraphs':...
4,Yara,https://img.game8.co/3254016/e921679aa5dfbf1e7...,https://img.game8.co/3253753/a53e019404ad3d253...,https://game8.co/games/Last-of-Us-2/archives/2...,{'Yara: Character Information': {'paragraphs':...


In [156]:
character_df.to_csv('../Data/Characters.csv', index=False)