## Importing libraries

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import csv

# Lift pandas' display limits so DataFrames are shown without row/column truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
site = 'https://game8.co/games/Last-of-Us-2/archives/290290' # walkthrough page
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the later
# cells parse an error page.
response = requests.get(site, timeout=30)
response.raise_for_status()
response

In [None]:
# Parse the walkthrough page HTML using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

## Chapter

This function collects the links of all images whose alt text matches one of the target names.

It loops over every image tag and takes the link from `data-src` or, failing that, `src`. If the link begins with `//` it prepends `https:`; if it begins with a single `/` it prepends the full website URL, so that each link becomes a complete URL.

It then checks whether any of the target names appears in the image's alt text; if so, the image link is added to the list of results.


In [None]:
def get_filtered_image_links(soup, target_names, base_url='https://game8.co'):
    """Collect the images whose alt text mentions any of ``target_names``.

    Args:
        soup: Parsed page; any object exposing ``find_all('img')`` whose items
            support ``.get(attribute)`` (for example a BeautifulSoup document).
        target_names: Strings to look for in each image's alt text. Matching
            is a case-insensitive substring test.
        base_url: Site root prepended to root-relative (``/path``) image URLs.
            Defaults to the game8 site scraped by this notebook.

    Returns:
        A list of ``{'url': ..., 'alt': ...}`` dicts in document order. ``url``
        has protocol-relative and root-relative forms expanded; ``alt`` is the
        lower-cased alt text.
    """
    # Lower-case the targets once instead of once per image.
    wanted = [name.lower() for name in target_names]
    site_root = base_url.rstrip('/')
    images = []

    for img in soup.find_all('img'):
        # Prefer data-src and fall back to src; images with neither are skipped.
        img_url = img.get('data-src') or img.get('src')
        if not img_url:
            continue

        if img_url.startswith('//'):
            img_url = 'https:' + img_url
        elif img_url.startswith('/'):
            img_url = site_root + img_url

        # `or ''` also covers an alt attribute that is present but None.
        alt_text = (img.get('alt') or '').lower()

        if any(name in alt_text for name in wanted):
            images.append({'url': img_url, 'alt': alt_text})

    return images

Make a list of names I need to specifically find and run the function.

Then I loop through each image, compare its alt text against the names in `find_names`, and store each match as a key-value pair (chapter name → image URL) in the `chapter_images` dictionary.

In [None]:
# Chapter titles to look for in the image alt texts.
find_names = [
    'Jackson', 'Seattle Day 1', 'Seattle Day 2', 'Seattle Day 3', 'The Park',
    'The Farm', 'Santa Barbara'
]

filtered_images = get_filtered_image_links(soup, find_names)

# Map chapter title -> image URL. Every chapter whose name occurs in an image's
# alt text is pointed at that image, so a later matching image replaces an
# earlier one for the same chapter.
chapter_images = {}
for image in filtered_images:
    hits = [chapter for chapter in find_names if chapter.lower() in image['alt']]
    chapter_images.update(dict.fromkeys(hits, image['url']))

print(chapter_images)

This code is extracting both chapter names and their sub-chapters with urls.

It finds the chapter names, then processes each chapter by finding its sub-chapters and linking them together.

At the end, each chapter has a list of sub-chapters, each containing a name and a URL, and these lists are stored in a dictionary keyed by chapter.

In [None]:
# Link texts of every anchor with class 'list_contents' on the page.
chapters = soup.find_all('a', class_='list_contents')
chapter_names = [chapter.get_text(strip=True) for chapter in chapters]

base_url = "https://game8.co"
sub_chapters_dict = {}

chapters_header = soup.find_all('h3', class_='a-header--3')

# For each <h3> heading, collect the links found in the first 'a-table' after it.
for chapter in chapters_header:
    chapter_title = chapter.get_text(strip=True)
    table = chapter.find_next('table', class_='a-table')

    # find_next() returns None when no table follows the heading; skip such
    # headings instead of failing with AttributeError on None.find_all.
    if table is None:
        continue

    sub_chapters = table.find_all('td', class_='center')

    sub_chapters_list = []

    for sub_chapter in sub_chapters:
        link = sub_chapter.find('a', class_='a-link')
        # Ignore cells without a link, and anchors that carry no href.
        if link and link.get('href'):
            sub_chapters_list.append({
                'text': link.get_text(strip=True),
                'url': base_url + link['href']
            })

    sub_chapters_dict[chapter_title] = sub_chapters_list

In [None]:
# One CSV row per chapter heading: its image plus comma-joined sub-chapter names and URLs.
csv_data = []

for chapter_key, sub_chapter_entries in sub_chapters_dict.items():
    # Text after the last ':' of the heading is the key used in chapter_images.
    chapter_name = chapter_key.split(':')[-1].strip()
    sub_names = [entry['text'] for entry in sub_chapter_entries]
    sub_urls = [entry['url'] for entry in sub_chapter_entries]

    csv_data.append({
        'Chapter Name': chapter_key,
        # placeholder if no image is found
        'Chapter Image': chapter_images.get(chapter_name, 'No Image Available'),
        'Sub-chapters': ', '.join(sub_names),
        'Sub-chapter URLs': ', '.join(sub_urls),
    })

csv_file = 'chapter_data.csv'
csv_columns = ['Chapter Name', 'Chapter Image', 'Sub-chapters', 'Sub-chapter URLs']

with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(csv_data)

print(f"CSV file '{csv_file}' created successfully!")

## Chapter walkthrough

In [None]:
# Reload the chapter table written to 'chapter_data.csv' and preview its first rows.
chapters_df = pd.read_csv('chapter_data.csv')
chapters_df.head()

In [None]:
# NOTE(review): this cell only splits each row's URL string and then discards
# the result; nothing is appended to all_walkthrough_data. The next cell starts
# with these same statements and performs the actual scraping, so this looks
# like a leftover draft - confirm and remove.
all_walkthrough_data = []

for index, row in chapters_df.iterrows():
    chapter_name = row['Chapter Name']
    sub_chapter_urls_string = row['Sub-chapter URLs']
    sub_chapter_urls_list = sub_chapter_urls_string.split(', ')

In [None]:
# Fetch every sub-chapter page and collect one record per page:
# chapter name, sub-chapter name, URL and the extracted walkthrough text.
all_walkthrough_data = []

for index, row in chapters_df.iterrows():
    chapter_name = row['Chapter Name']
    sub_chapter_urls_string = row['Sub-chapter URLs']
    sub_chapter_names_string = row['Sub-chapters']

    # pandas reads an empty CSV field back as NaN (a float), which has no
    # .split(); chapters without any sub-chapter URL are skipped.
    if not isinstance(sub_chapter_urls_string, str) or not sub_chapter_urls_string.strip():
        continue

    sub_chapter_urls_list = sub_chapter_urls_string.split(', ')
    # NOTE(review): a sub-chapter name that itself contains ', ' would shift
    # this list relative to the URL list; it is only used as a fallback name.
    if isinstance(sub_chapter_names_string, str):
        sub_chapter_names_list = sub_chapter_names_string.split(', ')
    else:
        sub_chapter_names_list = []

    main_chapter_name = chapter_name.split(':')[-1].strip()

    for i, sub_chapter_url in enumerate(sub_chapter_urls_list):
        # Name used when the page has no <h2> title or cannot be processed.
        if i < len(sub_chapter_names_list):
            fallback_sub_chapter_name = f"{main_chapter_name} - {sub_chapter_names_list[i].strip()}"
        else:
            fallback_sub_chapter_name = 'N/A'

        try:
            # The timeout prevents one stalled request from blocking the whole run.
            response = requests.get(sub_chapter_url, timeout=30)
            response.raise_for_status()

            chapter_soup = BeautifulSoup(response.text, 'html.parser')

            sub_chapter_tag = chapter_soup.find('h2', class_='a-header--2')
            if sub_chapter_tag:
                # Strip a leading "<chapter> - " / "<chapter>: " so the chapter
                # name is not repeated when it is prefixed again below.
                extracted_sub_chapter_name = sub_chapter_tag.get_text(strip=True)
                extracted_sub_chapter_name = extracted_sub_chapter_name.replace(f"{main_chapter_name} - ", "").strip()
                extracted_sub_chapter_name = extracted_sub_chapter_name.replace(f"{main_chapter_name}: ", "").strip()

                sub_chapter_name = f"{main_chapter_name} - {extracted_sub_chapter_name}"
            else:
                sub_chapter_name = fallback_sub_chapter_name

            spans = chapter_soup.find_all('span', style="font-size:120%;")
            extracted_text = [span.get_text(strip=True) for span in spans]

            # Sentence-case the first item (minus trailing dots) and lower-case the rest.
            formatted_text = []
            if extracted_text:
                first_item = extracted_text[0].rstrip('.')
                # Slicing instead of indexing keeps an empty first item from
                # raising IndexError and discarding the whole page.
                formatted_text.append(first_item[:1].upper() + first_item[1:].lower())

                for item in extracted_text[1:]:
                    formatted_text.append(item.lower())

            # Join the items two per line with ", and "; a trailing odd item stays alone.
            grouped_text = []
            for j in range(0, len(formatted_text), 2):
                line_items = formatted_text[j:j+2]
                if len(line_items) > 1:
                    grouped_text.append(", and ".join(line_items))
                elif line_items:
                    grouped_text.append(line_items[0])

            walkthrough_text = "\n".join(grouped_text)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {sub_chapter_url}: {e}")
            sub_chapter_name = fallback_sub_chapter_name
            walkthrough_text = f"Error fetching data: {e}"
        except Exception as e:
            # Best effort: record the failure and continue with the next page.
            print(f"An unexpected error occurred while processing {sub_chapter_url}: {e}")
            sub_chapter_name = fallback_sub_chapter_name
            walkthrough_text = f"An unexpected error occurred: {e}"

        all_walkthrough_data.append({
            'Chapter Name': chapter_name,
            'Sub-chapter Name': sub_chapter_name,
            'Sub-chapter URL': sub_chapter_url,
            'Walkthrough Text': walkthrough_text
        })

In [None]:
# Turn the list of per-page records into a DataFrame and preview it.
walkthrough_df = pd.DataFrame(all_walkthrough_data)
walkthrough_df.head()

In [None]:
# Persist the walkthrough table; index=False leaves the row index out of the file.
walkthrough_df.to_csv('walkthrough_data.csv', index=False)

## Tips and Tricks

In [None]:
tips = 'https://game8.co/games/Last-of-Us-2/archives/290295' # tips page
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(tips, timeout=30)
response.raise_for_status()
response

In [None]:
# Parse the tips page HTML using the 'html.parser' backend.
tips_soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Alt-text fragments (image file names) that identify the tip category images.
target_names = [
    'Combat Tips - Banner.jpg',
    'Horse Riding - Jumping.jpg',
    'Crafting Training Manual.jpg',
    'Opening Safe.jpg',
    'Guitar Chords.jpg',
]

filtered_images = get_filtered_image_links(tips_soup, target_names)

# Map the file name without '.jpg' to the image URL. Only the first target
# found in an image's alt text is used for that image.
tips_images = {}
for image in filtered_images:
    first_hit = next(
        (candidate for candidate in target_names if candidate.lower() in image['alt']),
        None,
    )
    if first_hit is None:
        continue
    tips_images[first_hit.replace('.jpg', '').strip()] = image['url']

print(tips_images)

In [None]:
base_url = "https://game8.co"
sub_tips_dict = {}

tips_header = tips_soup.find_all('h3', class_='a-header--3')

# Link texts of every anchor with class 'list_contents' on the tips page.
tips_elements = tips_soup.find_all('a', class_='list_contents')
tips_names = [tip.get_text(strip=True) for tip in tips_elements]

# For each <h3> heading, collect the links found in the first 'a-table' after it.
for tip in tips_header:
    tip_title = tip.get_text(strip=True)
    table = tip.find_next('table', class_='a-table')

    # find_next() returns None when no table follows the heading; skip such
    # headings instead of failing with AttributeError on None.find_all.
    if table is None:
        continue

    sub_tips = table.find_all('td', class_='center')

    sub_tips_list = []

    for sub_tip in sub_tips:
        link = sub_tip.find('a', class_='a-link')
        # Ignore cells without a link, and anchors that carry no href.
        if link and link.get('href'):
            sub_tips_list.append({
                'text': link.get_text(strip=True),
                'url': base_url + link['href']
            })

    sub_tips_dict[tip_title] = sub_tips_list

print(sub_tips_dict)

In [None]:

# Hand-written lookup from a tip section title to the key under which its image
# is stored in tips_images (the image file name without '.jpg').
tip_image_mapping = {
    'Last of Us 2 Gameplay Guides': 'Combat Tips - Banner',
    'Last of Us 2 Controls': 'Horse Riding - Jumping', 
    'Last of Us 2 Materials and Parts': 'Crafting Training Manual', 
    'Last of Us 2 Exploration and Secrets Guides': 'Opening Safe', 
    'Last of Us 2 Miscellaneous Guides': 'Guitar Chords' 
}


print("Tip image mapping created:")
print(tip_image_mapping)

In [None]:
# One CSV row per tip section: its image plus comma-joined sub-tip names and URLs.
csv_data = []

for tip_key, sub_tip_entries in sub_tips_dict.items():
    # Section title -> image key -> image URL; sections missing from the
    # mapping end up with the placeholder text.
    image_key_from_mapping = tip_image_mapping.get(tip_key)
    matching_image_url = tips_images.get(image_key_from_mapping, 'No Image Available')

    sub_tip_names = [entry['text'] for entry in sub_tip_entries]
    sub_tip_urls = [entry['url'] for entry in sub_tip_entries]

    csv_data.append({
        'Tip Name': tip_key,
        'Tip Image': matching_image_url,
        'Sub-tips': ', '.join(sub_tip_names),
        'Sub-tip URLs': ', '.join(sub_tip_urls),
    })

csv_file = 'tips_data.csv'
csv_columns = ['Tip Name', 'Tip Image', 'Sub-tips', 'Sub-tip URLs']

with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(csv_data)

print(f"CSV file '{csv_file}' created successfully!")

In [None]:
# Reload the tip overview written to 'tips_data.csv' and preview its first rows.
tips_df = pd.read_csv('tips_data.csv')
tips_df.head()

In [None]:
# Fetch every sub-tip page and collect one record per page: tip name,
# sub-tip name, URL and the page content grouped by heading.
all_tips_data = []

for index, row in tips_df.iterrows():
    tip_name = row['Tip Name']
    sub_tip_urls_string = row['Sub-tip URLs']
    sub_tip_name_string = row['Sub-tips']

    # pandas reads an empty CSV field back as NaN (a float), which has no
    # .split(); tip sections without any sub-tip URL are skipped.
    if not isinstance(sub_tip_urls_string, str) or not sub_tip_urls_string.strip():
        continue

    sub_tips_urls_list = sub_tip_urls_string.split(', ')
    if isinstance(sub_tip_name_string, str):
        sub_tip_name_list = sub_tip_name_string.split(', ')
    else:
        sub_tip_name_list = []

    main_tip_name = tip_name.split(':')[-1].strip()

    for i, sub_tip_url in enumerate(sub_tips_urls_list):
        # Name used when the page has no <h2> title or cannot be processed.
        if i < len(sub_tip_name_list):
            fallback_sub_tip_name = f"{main_tip_name} - {sub_tip_name_list[i].strip()}"
        else:
            fallback_sub_tip_name = 'N/A'

        try:
            # The timeout prevents one stalled request from blocking the whole run.
            response = requests.get(sub_tip_url, timeout=30)
            response.raise_for_status()

            sub_tip_soup = BeautifulSoup(response.text, 'html.parser')

            sub_chapter_tag = sub_tip_soup.find('h2', class_='a-header--2')
            if sub_chapter_tag:
                # Strip a leading "<tip> - " / "<tip>: " so the tip name is not
                # repeated when it is prefixed again below.
                extracted_sub_tip_name = sub_chapter_tag.get_text(strip=True)
                extracted_sub_tip_name = extracted_sub_tip_name.replace(f"{main_tip_name} - ", "").strip()
                extracted_sub_tip_name = extracted_sub_tip_name.replace(f"{main_tip_name}: ", "").strip()

                sub_tip_name = f"{main_tip_name} - {extracted_sub_tip_name}"
            else:
                sub_tip_name = fallback_sub_tip_name

            # heading text -> list of paragraph strings and {'table_data': [...]} dicts
            detailed_tip_dict = {}
            current_heading = None

            # select() yields the matching elements in document order.
            content_elements = sub_tip_soup.select('h2.a-header--2, h3.a-header--3, p.a-paragraph, table.a-table')

            for element in content_elements:
                if element.name in ['h2', 'h3']:
                    current_heading = element.get_text(strip=True)
                    # setdefault keeps content already collected when the same
                    # heading text appears more than once on a page.
                    detailed_tip_dict.setdefault(current_heading, [])
                elif element.name == 'p' and 'a-paragraph' in element.get('class', []):
                    # Paragraphs before the first heading are ignored.
                    if current_heading:
                        detailed_tip_dict[current_heading].append(element.get_text(strip=True))
                elif element.name == 'table' and 'a-table' in element.get('class', []):
                    if current_heading:
                        table_data = []
                        header_row = element.find('tr')
                        if header_row:
                            headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
                            data_rows = element.find_all('tr')[1:]
                            # Named table_row so the DataFrame `row` of the
                            # outer loop is not overwritten.
                            for table_row in data_rows:
                                cells = [td.get_text(strip=True) for td in table_row.find_all('td')]
                                # Rows whose cell count differs from the header are dropped.
                                if len(headers) == len(cells):
                                    table_data.append(dict(zip(headers, cells)))

                        detailed_tip_dict[current_heading].append({'table_data': table_data})

            # Per heading: paragraphs joined into one string, tables kept as a list.
            formatted_detailed_tip_dict = {}
            for heading, content_list in detailed_tip_dict.items():
                paragraph_content = [item for item in content_list if not isinstance(item, dict) or 'table_data' not in item]
                table_content = [item for item in content_list if isinstance(item, dict) and 'table_data' in item]

                heading_data = {}
                if paragraph_content:
                    heading_data['paragraphs'] = "\n".join(paragraph_content)
                if table_content:
                    heading_data['tables'] = table_content

                formatted_detailed_tip_dict[heading] = heading_data

            detailed_tip_text = formatted_detailed_tip_dict

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {sub_tip_url}: {e}")
            sub_tip_name = fallback_sub_tip_name
            detailed_tip_text = f"Error fetching data: {e}"
        except Exception as e:
            # Best effort: record the failure and continue with the next page.
            print(f"An unexpected error occurred while processing {sub_tip_url}: {e}")
            sub_tip_name = fallback_sub_tip_name
            detailed_tip_text = f"An unexpected error occurred: {e}"

        all_tips_data.append({
            'Tip Name': tip_name,
            'Sub-tip Name': sub_tip_name,
            'Sub-tip URL': sub_tip_url,
            'Detailed Tip Text': detailed_tip_text
        })

In [None]:
# Rebinds tips_df (until now the frame read from 'tips_data.csv') to the
# per-page records collected in all_tips_data, then previews it.
tips_df = pd.DataFrame(all_tips_data)
tips_df.head()

In [None]:
# Persist the detailed tips table without the row index.
# NOTE(review): 'Detailed Tip Text' holds dicts for successfully scraped pages;
# presumably they end up in the CSV as their Python text form - confirm that
# this is the intended on-disk format.
tips_df.to_csv('detailed_tips_data.csv', index=False)
print("Detailed tips data saved to 'detailed_tips_data.csv'")

## Characters

In [None]:
character = 'https://game8.co/games/Last-of-Us-2/archives/290477' #character page
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(character, timeout=30)
response.raise_for_status()
response

In [None]:
# Parse the character page HTML using the 'html.parser' backend.
character_soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Alt-text fragments of the character icons, in the form '<Name> Icon.png'.
target_names = [
    f'{name} Icon.png'
    for name in (
        'Ellie', 'Dina', 'Joel', 'Jesse', 'Tommy', 'Maria', 'Seth',
        'Abby', 'Owen', 'Mel', 'Nora', 'Manny', 'Jordan', 'Isaac',
        'Alice', 'Jerry', 'Whitney', 'Nick', 'Lev', 'Yara', 'Emily',
    )
]

filtered_images = get_filtered_image_links(character_soup, target_names)

# Map character name -> icon URL. Only the first target found in an image's
# alt text is used for that image.
character_images = {}
for image in filtered_images:
    icon_label = next(
        (label for label in target_names if label.lower() in image['alt']),
        None,
    )
    if icon_label is None:
        continue
    # Extract just the character name by removing ' Icon.png'
    character_name = icon_label.replace(' Icon.png', '').strip()
    character_images[character_name] = image['url']

print(character_images)

In [None]:
# Hand-collected main (portrait) image URL per character. The keys must equal
# the profile-URL keys with '_page' removed, because that is the name the
# scraping loop looks up here ('Lev' was previously misspelled 'Lev_', so Lev
# never received a main image).
character_main_image = {'Ellie' : 'https://img.game8.co/3253674/d94845ae35742fb7aeab666c545ce17c.jpeg/show',
                          'Joel' : 'https://img.game8.co/3253664/103b20839a9d395c4391a3465867cdcb.jpeg/show',
                          'Dina' : 'https://img.game8.co/3253702/457134b322ef240fbf357b4f3e2022bb.jpeg/show',
                          'Abby' : 'https://img.game8.co/3253680/e37a8d8fc6ff287dbf28926e004aebd3.jpeg/show',
                          'Yara' : 'https://img.game8.co/3253753/a53e019404ad3d253ca442e87e2b5270.jpeg/show',
                          'Nora' : 'https://img.game8.co/3253701/527bfd2d36d8b19d5dd586499b79b98f.jpeg/show',
                          'Lev' : 'https://img.game8.co/3253754/95629e336e9e7bc1a7f9ffd871a12089.jpeg/show',
                          'Emily' : 'https://img.game8.co/3253750/b64de7fa1386b449733d20ec2343fc69.jpeg/show',
                          'Jesse' : 'https://img.game8.co/3253666/cd8a49d1e146c09662f466f1417b8063.jpeg/show',
                          'Tommy' : 'https://img.game8.co/3253667/5f6bdd9e98558a4b3a8d18fd39c8d45a.jpeg/show',
                          'Seth' : 'https://img.game8.co/3253700/442701af05e3748b9ccedc8a793a0000.jpeg/show',
                          'Mel' : 'https://img.game8.co/3253673/43e4ae0186bd6083dc0b0f1bda3d6fae.jpeg/show',
                          'Owen' : 'https://img.game8.co/3253668/37cb30d83159273a09bcd4c3eac43dc1.jpeg/show',
                          'Manny' : 'https://img.game8.co/3253714/59d1b2cada49306c655dc50b5690e78e.jpeg/show',
                          'Jordan' : 'https://img.game8.co/3253748/db9e05b070614c194c9d8e910f158883.jpeg/show',
                          'Nick' : 'https://img.game8.co/3253751/0cbbe465209bd878cc3c4fe7b134b961.jpeg/show',
                          'Alice' : 'https://img.game8.co/3253752/40349185b76d1d125192c59788f9e7a9.jpeg/show',
                          'Maria' : 'https://img.game8.co/3253871/34d5cb007beec80ed621a67594bcd565.jpeg/show',
                          'Jerry' : 'https://img.game8.co/3253888/433549ea350fab01b217e903199f165d.jpeg/show',
                          'Whitney' : 'https://img.game8.co/3253876/211eb1e5683a5d9ae17872049440fc81.jpeg/show',
                          'Isaac' : 'https://img.game8.co/3253872/66e409e5ea1a6c1649ac10146a013d70.jpeg/show'

}

In [None]:
# Hand-collected profile page URL per character, keyed as '<Name>_page'; the
# scraping loop strips the '_page' suffix to recover the character name.
# NOTE(review): the 'Dina_page' URL (archives/290477) is identical to the
# character list page fetched earlier in this section, so Dina's record is
# probably scraped from the wrong page - confirm the correct profile URL.
character_profile_URLs = {'Ellie_page' : 'https://game8.co/games/Last-of-Us-2/archives/290451',
                          'Joel_page' : 'https://game8.co/games/Last-of-Us-2/archives/290454',
                          'Dina_page' : 'https://game8.co/games/Last-of-Us-2/archives/290477',
                          'Abby_page' : 'https://game8.co/games/Last-of-Us-2/archives/290497',
                          'Yara_page' : 'https://game8.co/games/Last-of-Us-2/archives/290493',
                          'Nora_page' : 'https://game8.co/games/Last-of-Us-2/archives/290491',
                          'Lev_page' : 'https://game8.co/games/Last-of-Us-2/archives/290495',
                          'Emily_page' : 'https://game8.co/games/Last-of-Us-2/archives/290496',
                          'Jesse_page' : 'https://game8.co/games/Last-of-Us-2/archives/290486',
                          'Tommy_page' : 'https://game8.co/games/Last-of-Us-2/archives/290458',
                          'Seth_page' : 'https://game8.co/games/Last-of-Us-2/archives/291199',
                          'Mel_page' : 'https://game8.co/games/Last-of-Us-2/archives/291200',
                          'Owen_page' : 'https://game8.co/games/Last-of-Us-2/archives/291205',
                          'Manny_page' : 'https://game8.co/games/Last-of-Us-2/archives/291206',
                          'Jordan_page' : 'https://game8.co/games/Last-of-Us-2/archives/291209',
                          'Nick_page' : 'https://game8.co/games/Last-of-Us-2/archives/291210',
                          'Alice_page' : 'https://game8.co/games/Last-of-Us-2/archives/291211',
                          'Maria_page' : 'https://game8.co/games/Last-of-Us-2/archives/291219',
                          'Jerry_page' : 'https://game8.co/games/Last-of-Us-2/archives/291220',
                          'Whitney_page' : 'https://game8.co/games/Last-of-Us-2/archives/291221',
                          'Isaac_page' : 'https://game8.co/games/Last-of-Us-2/archives/291222'

}

In [None]:
# Fetch every character profile page and collect one record per character:
# name, icon URL, main image URL, profile URL and the page content by heading.
character_details_list = []

for character_name_with_suffix, profile_url in character_profile_URLs.items():
    # Resolve the name and image URLs up front so that the success row and the
    # error rows are built from the same values.
    character_name = character_name_with_suffix.replace('_page', '').strip()
    icon_image_url = character_images.get(character_name, 'No Icon Image Available')
    main_image_url = character_main_image.get(character_name, 'No Main Image Available')

    try:
        # The timeout prevents one stalled request from blocking the whole run.
        response = requests.get(profile_url, timeout=30)
        response.raise_for_status() # Raise an exception for bad status codes

        profile_soup = BeautifulSoup(response.text, 'html.parser')

        # --- Extract character information ---
        # heading text -> list of paragraph strings and {'table_data': [...]} dicts
        character_info_dict = {}
        current_heading = None

        # select() yields the headings, paragraphs and tables in document order.
        content_elements = profile_soup.select('h2.a-header--2, h3.a-header--3, p.a-paragraph, table.a-table')

        for element in content_elements:
            if element.name in ['h2', 'h3']:
                current_heading = element.get_text(strip=True)
                # setdefault keeps content already collected when the same
                # heading text appears more than once on a page.
                character_info_dict.setdefault(current_heading, [])
            elif element.name == 'p' and 'a-paragraph' in element.get('class', []):
                # Paragraphs before the first heading are ignored.
                if current_heading:
                    character_info_dict[current_heading].append(element.get_text(strip=True))
            elif element.name == 'table' and 'a-table' in element.get('class', []):
                if current_heading:
                    table_data = []
                    header_row = element.find('tr')
                    if header_row:
                        headers = [th.get_text(strip=True) for th in header_row.find_all('th')]
                        data_rows = element.find_all('tr')[1:] # Skip header row
                        for table_row in data_rows:
                            cells = [td.get_text(strip=True) for td in table_row.find_all('td')]
                            # Rows whose cell count differs from the header are dropped.
                            if len(headers) == len(cells):
                                table_data.append(dict(zip(headers, cells)))
                    # Only add if table data was actually extracted
                    if table_data:
                        character_info_dict[current_heading].append({'table_data': table_data})

        # --- Format the extracted information ---
        # Per heading: paragraphs joined into one string, tables kept as a list.
        formatted_character_info = {}
        for heading, content_list in character_info_dict.items():
            paragraph_content = [item for item in content_list if not isinstance(item, dict) or 'table_data' not in item]
            table_content = [item for item in content_list if isinstance(item, dict) and 'table_data' in item]

            heading_data = {}
            if paragraph_content:
                heading_data['paragraphs'] = "\n".join(paragraph_content)
            if table_content:
                heading_data['tables'] = table_content

            # Only include the heading in the formatted output if it has content
            if heading_data:
                formatted_character_info[heading] = heading_data

        character_info = formatted_character_info

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {profile_url}: {e}")
        character_info = f"Error fetching data: {e}"
    except Exception as e:
        # Best effort: record the failure and continue with the next character.
        print(f"An unexpected error occurred while processing {profile_url}: {e}")
        character_info = f"An unexpected error occurred: {e}"

    # One record per character; on failure 'Character Info' holds the error text.
    character_details_list.append({
        'Character Name': character_name,
        'Character Icon URL': icon_image_url,
        'Character Main Image URL': main_image_url,
        'Profile URL': profile_url,
        'Character Info': character_info
    })

# character_details_list now holds the scraped data for each character;
# the next step converts it into a DataFrame and saves it to CSV.
print(f"Finished scraping details for {len(character_details_list)} characters.")

In [None]:
# Convert the list of dictionaries into a pandas DataFrame
character_df = pd.DataFrame(character_details_list)

# Display the first few rows to verify the data
display(character_df.head())

# Define the CSV file name
csv_file = 'character_data.csv'

# Save the DataFrame to a CSV file (index=False leaves the row index out).
# NOTE(review): 'Character Info' holds dicts for successfully scraped pages;
# presumably they end up in the CSV as their Python text form - confirm that
# this is the intended on-disk format.
character_df.to_csv(csv_file, index=False)

print(f"Character details saved to '{csv_file}'")

## Safe Codes

In [None]:
safe_codes = 'https://game8.co/games/Last-of-Us-2/archives/290690' #safecode page
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(safe_codes, timeout=30)
response.raise_for_status()
response

In [None]:
# Parse the safe-code page HTML using the 'html.parser' backend.
# NOTE(review): the cells visible below use hand-entered data rather than this
# soup - confirm whether it is still needed.
safecode_soup = BeautifulSoup(response.text, 'html.parser')

In [None]:

# Provided safe code data
# Hand-entered table stored column-wise: the three lists are parallel, so the
# i-th chapter, location and combination together describe one safe. All lists
# must keep the same length for the DataFrame constructor to accept them.
safe_codes_data = {
    'Chapter': [
        'Jackson - Patrol', 'Seattle Day 1 - Downtown', 'Seattle Day 1 - Downtown',
        'Seattle Day 1 - Downtown', 'Seattle Day 1 - Capitol Hill', 'Seattle Day 1 - Tunnels',
        'Seattle Day 2 - Hillcrest', 'Seattle Day 2 - The Seraphites', 'Seattle Day 2 - The Seraphites',
        'Seattle Day 3 - The Flooded City', 'Seattle Day 1 - On Foot', 'Seattle Day 1 - Hostile Territory',
        'Seattle Day 1 - The Coast', 'Seattle Day 2 - The Shortcut', 'Seattle Day 2 - The Descent'
    ],
    'Location': [
        'Super Market', 'Bank Vault', 'Courthouse', 'West Gate 2', 'Thrift Store',
        'Locker Room', 'Auto Repair Shop', 'Apartment', 'Weston\'s Pharmacy', 'First Gate',
        'Big Win Safe', 'Jasmine Bakery', 'Boat Control Room', 'Apartment Bedroom', 'Across From Gym'
    ],
    # Kept as strings so leading zeros and dashes survive.
    'Combination/Code': [
        '07-20-13', '60-23-06', '86-07-22', '04-51', '55-01-33', '15243', '30-82-65',
        '10-08-83', '38-55-23', '70-12-64', '17-38-07', '68-96-89', '90-77-01',
        '30-23-04', '12-18-79'
    ]
}

# Create a DataFrame
safe_codes_df = pd.DataFrame(safe_codes_data)

# Display the DataFrame
display(safe_codes_df)

# Save to CSV
csv_file = 'safe_codes_data.csv'
safe_codes_df.to_csv(csv_file, index=False)

print(f"CSV file '{csv_file}' created successfully!")

In [None]:
# Hand-collected guide page URL for each safe location; the keys are the same
# strings used in the 'Location' column of the safe-code table.
safe_codes_locations = {
  'Super Market' : 'https://game8.co/games/Last-of-Us-2/archives/290864',
  'Bank Vault' : 'https://game8.co/games/Last-of-Us-2/archives/290681',
  'Courthouse' : 'https://game8.co/games/Last-of-Us-2/archives/290860',
  'West Gate 2': 'https://game8.co/games/Last-of-Us-2/archives/290844',
  'Thrift Store': 'https://game8.co/games/Last-of-Us-2/archives/290825',
  'Locker Room': 'https://game8.co/games/Last-of-Us-2/archives/290676',
  'Auto Repair Shop': 'https://game8.co/games/Last-of-Us-2/archives/290837',
  'Apartment': 'https://game8.co/games/Last-of-Us-2/archives/290854',
  'Weston\'s Pharmacy': 'https://game8.co/games/Last-of-Us-2/archives/290917',
  'First Gate': 'https://game8.co/games/Last-of-Us-2/archives/290923',
  'Big Win Safe': 'https://game8.co/games/Last-of-Us-2/archives/291118',
  'Jasmine Bakery': 'https://game8.co/games/Last-of-Us-2/archives/291124',
  'Boat Control Room': 'https://game8.co/games/Last-of-Us-2/archives/291129',
  'Apartment Bedroom': 'https://game8.co/games/Last-of-Us-2/archives/291135',
  'Across From Gym': 'https://game8.co/games/Last-of-Us-2/archives/291158'
}

In [None]:
# Scrape every safe-location page listed in `safe_codes_locations` and collect
# its intro paragraphs, images, section headings and step-by-step instructions.
# One dict per location is appended to `safe_locations_details`; a location
# whose page fails to load or parse gets a dict with an 'Error' key instead.
safe_locations_details = []


def _image_url(img):
    """Return the absolute URL of an img tag, or None when it has no source.

    `data-src` is preferred over `src` (presumably because the site lazy-loads
    images -- confirm against the page markup). Protocol-relative (`//host/..`)
    and site-relative (`/path`) addresses are expanded to full https URLs;
    any other value is returned unchanged.
    """
    raw_url = img.get('data-src') or img.get('src')
    if not raw_url:
        return None
    if raw_url.startswith('//'):
        return 'https:' + raw_url
    if raw_url.startswith('/'):
        return 'https://game8.co' + raw_url
    return raw_url


for location, url in safe_codes_locations.items():
    try:
        # Without a timeout a single stalled connection would hang the whole cell.
        # requests raises a RequestException subclass on timeout, handled below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        location_soup = BeautifulSoup(response.text, 'html.parser')

        # Dictionary to store details for this location
        details = {
            'Location': location,
            'URL': url,
            'Images': [],
            'Paragraph Text': [],
            'Headings': {},
            'Steps to Safe': []  # To store steps under h3 tags
        }

        archive_wrapper = location_soup.find('div', class_='archive-style-wrapper')

        if archive_wrapper:
            # --- Intro content: direct children of the wrapper up to h3#hm_3 ---
            for element in archive_wrapper.contents:
                if element.name == 'h3' and element.get('id') == 'hm_3':
                    break

                if element.name == 'p' and 'a-paragraph' in element.get('class', []):
                    # Only bare text and span/b children contribute to the text;
                    # other nested tags (links, images, ...) are skipped.
                    paragraph_text = "".join(
                        content.get_text(strip=True)
                        for content in element.contents
                        if isinstance(content, str) or content.name in ['span', 'b']
                    )
                    if paragraph_text:
                        details['Paragraph Text'].append(paragraph_text)

                    for img in element.find_all('img'):
                        img_url = _image_url(img)
                        if img_url:
                            details['Images'].append(img_url)

                # Images that are direct children of the wrapper
                elif element.name == 'img':
                    img_url = _image_url(element)
                    if img_url:
                        details['Images'].append(img_url)

            # --- Section headings (searched in the whole page, not just the wrapper) ---
            h2_hl1 = location_soup.find('h2', id='hl_1', class_='a-header--2')
            if h2_hl1:
                details['Headings']['hl_1'] = h2_hl1.get_text(strip=True)

            h2_hl2 = location_soup.find('h2', id='hl_2', class_='a-header--2')
            if h2_hl2:
                details['Headings']['hl_2'] = h2_hl2.get_text(strip=True)

            # --- Steps: content that follows each h3 heading ---
            current_step_heading = None
            for h3 in location_soup.find_all('h3', class_='a-header--3'):
                h3_text = h3.get_text(strip=True)
                # Collection starts at 'hm_1' or 'hm_3'; once a heading with
                # non-empty text has been collected, every later h3 is kept too.
                if not (h3.get('id') in ['hm_1', 'hm_3'] or current_step_heading):
                    continue

                current_step_heading = h3_text
                step_content = {'heading': h3_text, 'paragraphs': [], 'images': []}

                # Walk the following siblings until the next h2/h3 or the end.
                next_sibling = h3.next_sibling
                while next_sibling:
                    if next_sibling.name in ['h2', 'h3']:
                        break
                    if next_sibling.name == 'p' and 'a-paragraph' in next_sibling.get('class', []):
                        step_content['paragraphs'].append(next_sibling.get_text(strip=True))
                        for img in next_sibling.find_all('img'):
                            img_url = _image_url(img)
                            if img_url:
                                step_content['images'].append(img_url)
                    next_sibling = next_sibling.next_sibling

                details['Steps to Safe'].append(step_content)

        safe_locations_details.append(details)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        safe_locations_details.append({
            'Location': location,
            'URL': url,
            'Error': f"Error fetching data: {e}"
        })
    except Exception as e:
        # Deliberate best-effort: record the failure and move on to the next page.
        print(f"An unexpected error occurred while processing {url}: {e}")
        safe_locations_details.append({
            'Location': location,
            'URL': url,
            'Error': f"An unexpected error occurred: {e}"
        })

# safe_locations_details now holds one dict per location (scraped details or an error record).
print(f"Finished scraping details for {len(safe_locations_details)} safe locations.")



In [None]:
# Tabulate the scraped per-location records and preview them.
safe_locations_df = pd.DataFrame(safe_locations_details)
display(safe_locations_df.head())

# Left join on 'Location': every row of safe_codes_df is kept, and the scraped
# details are attached wherever a location with the same name was scraped.
merged_safe_data_df = pd.merge(
    left=safe_codes_df,
    right=safe_locations_df,
    how='left',
    on='Location',
)
display(merged_safe_data_df.head())

# Persist the combined table.
csv_file = 'safe_codes_.csv'
merged_safe_data_df.to_csv(csv_file, index=False)

print(f"Combined safe codes data saved to '{csv_file}'")

## Trophy

In [None]:
trophy = 'https://game8.co/games/Last-of-Us-2/archives/290658'  # trophy page
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(trophy, timeout=30)
response  # displayed as the cell output so the HTTP status is visible

In [None]:
# Parse the trophy page fetched in the previous cell with the built-in HTML parser.
trophy_soup = BeautifulSoup(markup=response.text, features='html.parser')

In [None]:
# Build {category heading -> [trophy dicts]} from the trophy page.
# Each trophy dict may carry 'image', 'title' and 'description'.
trophy_details = {}

# IDs of the h3 headings that introduce each trophy table
target_h3_ids = ['hm_1', 'hm_2', 'hm_3', 'hm_4', 'hm_5']

for h3_id in target_h3_ids:
    h3_tag = trophy_soup.find('h3', id=h3_id, class_='a-header--3')
    if not h3_tag:
        print(f"h3 with id='{h3_id}' not found")
        continue

    heading_text = h3_tag.get_text(strip=True)

    # First later sibling table of the heading (not necessarily the adjacent element).
    trophy_table = h3_tag.find_next_sibling('table', class_='a-table')
    if not trophy_table:
        print(f"No table found after h3 with id='{h3_id}'")
        continue

    table_data = []
    # A trophy may be spread over several rows (image / title / description),
    # so fields are accumulated here until a description closes the entry.
    current_trophy = {}
    for row in trophy_table.find_all('tr'):
        img_tag = row.find('img')
        if img_tag:
            img_url = img_tag.get('data-src') or img_tag.get('src')
            if img_url:
                if img_url.startswith('//'):
                    img_url = 'https:' + img_url
                elif img_url.startswith('/'):
                    img_url = 'https://game8.co' + img_url
                current_trophy['image'] = img_url

        th_tag = row.find('th')
        if th_tag:
            current_trophy['title'] = th_tag.get_text(strip=True)

        # The description lives in the first td that does not hold an image.
        description_td = next(
            (td for td in row.find_all('td') if td.find('img') is None),
            None,
        )
        if description_td is None:
            continue

        # get_text walks nested tags as well, so text wrapped in b/a/span is
        # kept; whitespace-only fragments are dropped instead of leaving
        # doubled spaces in the joined description.
        current_trophy['description'] = description_td.get_text(' ', strip=True)

        # Title + description means the entry is complete.
        if 'title' in current_trophy:
            table_data.append(current_trophy)
            current_trophy = {}  # Reset for the next trophy entry

    trophy_details[heading_text] = table_data

# Display the extracted trophy details
display(trophy_details)

In [None]:


# Flatten {category -> [trophy dicts]} into one CSV row per trophy.
# The loop variable is deliberately not called `trophy`: that name holds the
# trophy page URL defined earlier and must not be overwritten with a dict.
csv_data = [
    {
        'Trophy Category': category,
        'Image URL': trophy_entry.get('image', ''),
        'Title': trophy_entry.get('title', ''),
        'Description': trophy_entry.get('description', ''),
    }
    for category, trophies in trophy_details.items()
    for trophy_entry in trophies
]

# Define CSV file name
csv_file = 'trophy_data.csv'

# newline='' lets the csv module control line endings itself.
with open(csv_file, mode='w', newline='', encoding='utf-8') as csv_handle:
    writer = csv.DictWriter(
        csv_handle,
        fieldnames=['Trophy Category', 'Image URL', 'Title', 'Description'],
    )
    writer.writeheader()
    writer.writerows(csv_data)

print(f"Trophy data saved to '{csv_file}'")

## Weapons

In [None]:
weapons = 'https://game8.co/games/Last-of-Us-2/archives/290291'  # weapons page
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(weapons, timeout=30)
response  # displayed as the cell output so the HTTP status is visible

In [None]:
# Parse the weapons page fetched in the previous cell with the built-in HTML parser.
weapons_soup = BeautifulSoup(markup=response.text, features='html.parser')

In [None]:
# Build {weapon category heading -> [row dicts]} from the weapons overview page.
weapons_data = {}

# IDs of the h3 headings that introduce each weapon-category table
target_h3_ids = ['hm_1', 'hm_2', 'hm_3', 'hm_4', 'hm_5']

for h3_id in target_h3_ids:
    h3_tag = weapons_soup.find('h3', id=h3_id, class_='a-header--3')
    if not h3_tag:
        print(f"h3 with id='{h3_id}' not found")
        continue

    heading_text = h3_tag.get_text(strip=True)

    # First later sibling table of the heading.
    weapon_table = h3_tag.find_next_sibling('table', class_='a-table')
    if not weapon_table:
        print(f"No weapon table found after h3 with id='{h3_id}'")
        continue

    table_data = []
    header_row = weapon_table.find('tr')  # first row is taken as the header
    if header_row:
        headers = [th.get_text(strip=True) for th in header_row.find_all('th')]

        # Every row after the first is a data row.
        for row in weapon_table.find_all('tr')[1:]:
            cells = row.find_all('td')

            # Rows whose cell count does not line up with the header are reported and skipped.
            if len(headers) != len(cells):
                cell_texts = [cell.get_text(strip=True) for cell in cells]
                print(f"Skipping weapon row due to header/cell mismatch: {headers} vs {cell_texts}")
                continue

            row_data = {}
            for header, cell in zip(headers, cells):
                if header != 'Weapon':
                    # Plain columns: keep the stripped cell text under the header name.
                    row_data[header] = cell.get_text(strip=True)
                    continue

                # The 'Weapon' column is split into name, detail-page link and icon.
                weapon_link_tag = cell.find('a', class_='a-link')
                if weapon_link_tag:
                    row_data['Weapon Name'] = weapon_link_tag.get_text(strip=True)
                    href = weapon_link_tag.get('href')
                    if href:
                        if href.startswith('//'):
                            prefix = 'https:'
                        elif href.startswith('/'):
                            prefix = 'https://game8.co'
                        else:
                            prefix = ''  # anything else is kept unchanged
                        row_data['Weapon Link'] = prefix + href

                icon_img_tag = cell.find('img')
                if icon_img_tag:
                    img_url = icon_img_tag.get('data-src') or icon_img_tag.get('src')
                    if img_url:
                        if img_url.startswith('//'):
                            prefix = 'https:'
                        elif img_url.startswith('/'):
                            prefix = 'https://game8.co'
                        else:
                            prefix = ''  # anything else is kept unchanged
                        row_data['Weapon Icon URL'] = prefix + img_url

            table_data.append(row_data)

    # Stored even when the table had no rows, so the category still appears.
    weapons_data[heading_text] = table_data

# Display the extracted weapon data
display(weapons_data)

In [None]:

# Flatten {category -> [weapon row dicts]} into one record per weapon.
# The loop variables are deliberately not called `weapons`/`weapon`: `weapons`
# holds the weapons page URL defined earlier and must not be overwritten.
weapon_columns = ['Weapon Name', 'Weapon Link', 'Weapon Icon URL', 'Description', 'Location']

csv_data = []
for category, category_rows in weapons_data.items():
    for weapon_row in category_rows:
        record = {'Weapon Category': category}
        for column in weapon_columns:
            record[column] = weapon_row.get(column, '')  # missing fields become ''
        csv_data.append(record)

# Passing the column list keeps the schema stable even when nothing was
# scraped, so later cells that read 'Weapon Name' / 'Weapon Link' still work.
weapons_df = pd.DataFrame(csv_data, columns=['Weapon Category'] + weapon_columns)

# Define CSV file name
csv_file = 'weapons_data.csv'

# Write data to CSV
weapons_df.to_csv(csv_file, index=False)

print(f"Weapon data saved to '{csv_file}'")

## Full weapon info

In [None]:
# Visit every weapon detail page listed in weapons_df and collect its
# 'Basic Information', 'Best Upgrades', 'How to use' and 'How to get' sections.
# One dict per weapon is appended to `full_weapon_data`; weapons without a
# usable link get 'N/A' placeholders and failed pages get an error message
# in every section field.
full_weapon_data = []


def _table_rows(table):
    """Parse a table into a list of {header text: cell text} dicts.

    The first row supplies the headers (its th cells). Later rows whose td
    count differs from the header count are skipped. Cell text is built from
    bare strings and span/b/a/img children only.
    """
    header_row = table.find('tr')
    if not header_row:
        return []
    headers = [th.get_text(strip=True) for th in header_row.find_all('th')]

    rows = []
    for table_row in table.find_all('tr')[1:]:
        cells = table_row.find_all('td')
        if len(headers) != len(cells):
            continue
        rows.append({
            header: "".join(
                node.get_text(strip=True)
                for node in cell.contents
                if isinstance(node, str) or node.name in ['span', 'b', 'a', 'img']
            )
            for header, cell in zip(headers, cells)
        })
    return rows


def _section_paragraphs(heading, stop_tags=('h2', 'h3')):
    """Return the text of each 'a-paragraph' p sibling after `heading`.

    The walk ends at the first sibling whose tag name is in `stop_tags`, or
    when the siblings run out.
    """
    paragraphs = []
    sibling = heading.next_sibling
    while sibling:
        # getattr guards against nodes that expose no `name` attribute.
        name = getattr(sibling, 'name', None)
        if name in stop_tags:
            break
        if name == 'p' and 'a-paragraph' in sibling.get('class', []):
            paragraphs.append(sibling.get_text(strip=True))
        sibling = sibling.next_sibling
    return paragraphs


def _section_content(anchor):
    """Return the content that follows `anchor`, preferring a table.

    Returns the parsed rows of the first later sibling 'a-table' table when
    one exists (possibly an empty list), otherwise {'paragraphs': text} built
    from the section's paragraphs, otherwise None when neither is present.
    """
    table = anchor.find_next_sibling('table', class_='a-table')
    if table:
        return _table_rows(table)
    paragraphs = _section_paragraphs(anchor)
    if paragraphs:
        return {"paragraphs": "\n".join(paragraphs)}
    return None


# Iterate through the weapons_df DataFrame
for index, row in weapons_df.iterrows():
    weapon_name = row['Weapon Name']
    weapon_url = row['Weapon Link']

    # Skip if the weapon link is empty or not a valid URL
    if not weapon_url or not weapon_url.startswith('http'):
        print(f"Skipping {weapon_name} due to missing or invalid URL: {weapon_url}")
        full_weapon_data.append({
            'Weapon Name': weapon_name,
            'Weapon URL': weapon_url,
            'Basic Information': 'N/A',  # Use N/A for missing sections
            'Best Upgrades': 'N/A',
            'How to use': 'N/A',
            'How to get': 'N/A'
        })
        continue

    try:
        # Without a timeout a single stalled connection would hang the whole cell.
        # requests raises a RequestException subclass on timeout, handled below.
        response = requests.get(weapon_url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        weapon_soup = BeautifulSoup(response.text, 'html.parser')

        # Defaults used when a section is absent from the page.
        weapon_details = {
            'Weapon Name': weapon_name,
            'Weapon URL': weapon_url,
            'Basic Information': {},
            'Best Upgrades': {},
            'How to use': "",
            'How to get': ""
        }

        # --- Basic Information: h2#hl_1 followed by a table or paragraphs ---
        basic_info_heading = weapon_soup.find('h2', id='hl_1', class_='a-header--2')
        if basic_info_heading:
            basic_info = _section_content(basic_info_heading)
            if basic_info is not None:
                weapon_details['Basic Information'] = basic_info

        # --- Best Upgrades: h2#hl_2, optionally with an h3#hm_1 sub-heading ---
        best_upgrades_heading = weapon_soup.find('h2', id='hl_2', class_='a-header--2')
        if best_upgrades_heading:
            best_upgrades_sub_heading = best_upgrades_heading.find_next_sibling(
                'h3', id='hm_1', class_='a-header--3'
            )
            # Content is read after the sub-heading when there is one,
            # otherwise directly after the h2.
            if best_upgrades_sub_heading:
                anchor = best_upgrades_sub_heading
            else:
                anchor = best_upgrades_heading
            best_upgrades = _section_content(anchor)
            if best_upgrades is not None:
                weapon_details['Best Upgrades'] = best_upgrades

        # --- How to use: h2 whose text contains 'How to Use' (case-insensitive) ---
        # `string=` is the current name of the argument formerly spelled `text=`.
        how_to_use_heading = weapon_soup.find(
            'h2',
            string=re.compile(r'How to Use', re.IGNORECASE),
            class_='a-header--2',
        )
        if how_to_use_heading:
            weapon_details['How to use'] = "\n".join(_section_paragraphs(how_to_use_heading))

        # --- How to get: h2#hl_4, collecting h3 and p text up to the next h2 ---
        how_to_get_heading = weapon_soup.find('h2', id='hl_4', class_='a-header--2')
        if how_to_get_heading:
            how_to_get_content = []
            sibling = how_to_get_heading.next_sibling
            while sibling:
                name = getattr(sibling, 'name', None)
                if name == 'h2':
                    break
                if name in ['h3', 'p']:
                    # Exact class-list match: elements carrying any extra
                    # class are skipped (unlike the `in` checks used elsewhere).
                    classes = sibling.get('class', [])
                    if classes == ['a-header--3'] or classes == ['a-paragraph']:
                        how_to_get_content.append(sibling.get_text(strip=True))
                sibling = sibling.next_sibling
            weapon_details['How to get'] = "\n".join(how_to_get_content)

        full_weapon_data.append(weapon_details)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {weapon_url}: {e}")
        full_weapon_data.append({
            'Weapon Name': weapon_name,
            'Weapon URL': weapon_url,
            'Basic Information': f"Error fetching data: {e}",
            'Best Upgrades': f"Error fetching data: {e}",
            'How to use': f"Error fetching data: {e}",
            'How to get': f"Error fetching data: {e}"
        })
    except Exception as e:
        # Deliberate best-effort: record the failure and move on to the next weapon.
        print(f"An unexpected error occurred while processing {weapon_url}: {e}")
        full_weapon_data.append({
            'Weapon Name': weapon_name,
            'Weapon URL': weapon_url,
            'Basic Information': f"An unexpected error occurred: {e}",
            'Best Upgrades': f"An unexpected error occurred: {e}",
            'How to use': f"An unexpected error occurred: {e}",
            'How to get': f"An unexpected error occurred: {e}"
        })


# Convert the list of dictionaries into a pandas DataFrame
full_weapons_df = pd.DataFrame(full_weapon_data)

# Display the first few rows to verify the data
display(full_weapons_df.head())

# Save the DataFrame to a CSV file
csv_file = 'full_weapon_info.csv'
full_weapons_df.to_csv(csv_file, index=False)

print(f"Full weapon information saved to '{csv_file}'")

## Enemies

In [None]:
enemies = 'https://game8.co/games/Last-of-Us-2/archives/290294'  # enemies page
# The timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(enemies, timeout=30)
response  # displayed as the cell output so the HTTP status is visible

In [None]:
# Parse the enemies page fetched in the previous cell with the built-in HTML parser.
enemies_soup = BeautifulSoup(markup=response.text, features='html.parser')

In [None]:
# Collect one record per enemy section of the enemies page: name, image URL,
# description and the link to its "how to kill" guide. Missing pieces keep
# their 'No ... Available' placeholder.
enemy_data = []

# IDs of the h3 headings that introduce each enemy section
target_h3_ids = ['hm_1', 'hm_2', 'hm_3', 'hm_4', 'hm_5', 'hm_6', 'hm_7', 'hm_8', 'hm_9', 'hm_10']

for h3_id in target_h3_ids:
    h3_tag = enemies_soup.find('h3', id=h3_id, class_='a-header--3')
    if not h3_tag:
        print(f"h3 with id='{h3_id}' not found")
        continue

    enemy_name = h3_tag.get_text(strip=True)

    # First and second 'a-paragraph' siblings after the heading: the first is
    # read for image + description, the second for the guide link.
    first_paragraph = h3_tag.find_next_sibling('p', class_='a-paragraph')
    second_paragraph = first_paragraph.find_next_sibling('p', class_='a-paragraph') if first_paragraph else None

    enemy_image_url = 'No Image Available'
    description = 'No Description Available'
    how_to_kill_url = 'No URL Available'

    if first_paragraph:
        img_tag = first_paragraph.find('img')
        if img_tag:
            img_url = img_tag.get('data-src') or img_tag.get('src')
            if img_url:
                if img_url.startswith('//'):
                    enemy_image_url = 'https:' + img_url
                elif img_url.startswith('/'):
                    enemy_image_url = 'https://game8.co' + img_url
                else:
                    enemy_image_url = img_url  # Assume absolute

        # Description = bare text plus the text of span/b children.
        # Tags have no .strip() method (attribute access on a tag is a child
        # lookup that returns None), so their text is read with get_text().
        description_parts = []
        for content in first_paragraph.contents:
            if isinstance(content, str):
                part = content.strip()
            elif content.name in ['span', 'b']:
                part = content.get_text(strip=True)
            else:
                continue
            if part:  # drop whitespace-only fragments to avoid doubled spaces
                description_parts.append(part)
        description = " ".join(description_parts)

    if second_paragraph:
        link_tag = second_paragraph.find('a', class_='a-btn')
        if link_tag:
            href = link_tag.get('href')
            if href:
                if href.startswith('//'):
                    how_to_kill_url = 'https:' + href
                elif href.startswith('/'):
                    how_to_kill_url = 'https://game8.co' + href
                else:
                    how_to_kill_url = href  # Assume absolute

    enemy_data.append({
        'Enemy name': enemy_name,
        'enemy image url': enemy_image_url,
        'description': description,
        'how to kill url': how_to_kill_url
    })

# Convert to DataFrame and save to CSV
enemy_df = pd.DataFrame(enemy_data)
display(enemy_df.head())

csv_file = 'enemy_data.csv'
enemy_df.to_csv(csv_file, index=False)

print(f"Enemy data saved to '{csv_file}'")

## How to kill enemy

In [None]:
# Fetch each enemy's "how to kill" guide page and group its paragraphs and
# images under the heading they follow. One record per enemy is appended to
# `how_to_kill_details`; enemies without a guide URL get a placeholder and
# failed pages get an error message instead of the guide dict.
how_to_kill_details = []

# Iterate through the enemy_df DataFrame
for index, row in enemy_df.iterrows():
    enemy_name = row['Enemy name']
    how_to_kill_url = row['how to kill url']

    # Skip if the URL is 'No URL Available' or empty
    if how_to_kill_url == 'No URL Available' or not how_to_kill_url:
        print(f"Skipping {enemy_name} due to missing How to Kill URL.")
        how_to_kill_details.append({
            'Enemy name': enemy_name,
            'How to Kill Guide': 'No Guide Available'  # Placeholder for missing guide
        })
        continue

    try:
        # Without a timeout a single stalled connection would hang the whole cell.
        # requests raises a RequestException subclass on timeout, handled below.
        response = requests.get(how_to_kill_url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        guide_soup = BeautifulSoup(response.text, 'html.parser')

        # {heading text -> [content items]}; each item is either
        # {'type': 'paragraph', 'content': ...} or {'type': 'image', 'url': ...}.
        guide_info_dict = {}
        current_heading = None
        current_content_list = []

        # Headings, paragraphs and images in document order.
        content_elements = guide_soup.select('h2.a-header--2, h3.a-header--3, p.a-paragraph, img')

        for element in content_elements:
            if element.name in ['h2', 'h3']:
                # A new heading closes the previous section; empty sections are not stored.
                if current_heading and current_content_list:
                    guide_info_dict[current_heading] = current_content_list

                current_heading = element.get_text(strip=True)
                current_content_list = []
                continue

            # Content that appears before the first (non-empty) heading is ignored.
            if not current_heading:
                continue

            if element.name == 'p':
                # The selector already restricts p elements to class 'a-paragraph'.
                # Only bare text and span/b/a children contribute to the text.
                paragraph_text = "".join(
                    content.get_text(strip=True)
                    for content in element.contents
                    if isinstance(content, str) or content.name in ['span', 'b', 'a']
                )
                if paragraph_text:
                    current_content_list.append({'type': 'paragraph', 'content': paragraph_text})

            elif element.name == 'img':
                img_url = element.get('data-src') or element.get('src')
                if img_url:
                    if img_url.startswith('//'):
                        img_url = 'https:' + img_url
                    elif img_url.startswith('/'):
                        img_url = 'https://game8.co' + img_url
                    current_content_list.append({'type': 'image', 'url': img_url})

        # Store the section that was still open when the page ended.
        if current_heading and current_content_list:
            guide_info_dict[current_heading] = current_content_list

        # Drop sections that are not part of the guide itself.
        for heading in ['Comment', 'Author', 'Last of Us 2 Enemies']:
            guide_info_dict.pop(heading, None)

        how_to_kill_details.append({
            'Enemy name': enemy_name,
            'How to Kill Guide': guide_info_dict  # Store the dictionary
        })

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {how_to_kill_url}: {e}")
        how_to_kill_details.append({
            'Enemy name': enemy_name,
            'How to Kill Guide': f"Error fetching data: {e}"  # Indicate error
        })
    except Exception as e:
        # Deliberate best-effort: record the failure and move on to the next enemy.
        print(f"An unexpected error occurred while processing {how_to_kill_url}: {e}")
        how_to_kill_details.append({
            'Enemy name': enemy_name,
            'How to Kill Guide': f"An unexpected error occurred: {e}"  # Indicate error
        })

# One row per enemy with its guide (dict), placeholder or error message.
how_to_kill_df = pd.DataFrame(how_to_kill_details)

# Attach the guides to the enemy overview table by enemy name.
merged_enemy_df = pd.merge(enemy_df, how_to_kill_df, on='Enemy name', how='left')

# Display the merged DataFrame
display(merged_enemy_df.head())

In [None]:
# Persist the enemy table together with the scraped guides.
csv_file = 'how_to_kill_enemy.csv'
merged_enemy_df.to_csv(path_or_buf=csv_file, index=False)

print(f"Enemy data with guides saved to '{csv_file}'")