In [None]:
import pandas as pd
import sys
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

#Set options to run Chrome in 'Headless' mode and other environment settings.
#This module-level object is passed to every webdriver.Chrome(...) call below.
chrome_options = Options()
#Run Chrome without opening a visible browser window
chrome_options.add_argument("--headless")
#Disable the Chrome sandbox
chrome_options.add_argument("--no-sandbox")
#Requested size of the (virtual) browser window
chrome_options.add_argument("--window-size=1920x1080")
#Per the Selenium docs, "eager" lets driver.get() return once the DOM is
#ready instead of waiting for every resource to finish loading.
chrome_options.page_load_strategy = "eager"

In [None]:
def start_driver_and_open_url(url):
    """
    Start a Chrome webdriver (configured by `chrome_options`) and open a URL.

    Parameters
    ----------
    url: `str`
        String of the URL to retrieve image information.

    Returns
    -------
    driver : WebDriver
        Chrome webdriver that has navigated to `url`. The caller owns the
        driver and is responsible for calling `driver.quit()`.

    Raises
    ------
    SystemExit
        With a non-zero status if the driver cannot be started.
    Exception
        Whatever `driver.get` raises; the browser is shut down first.
    """

    #ChromeDriverManager().install() supplies the path of the chromedriver
    #binary that the Service is built around.
    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options)
    except Exception as err:
        print(f"'Setting Driver Error: {err}'")
        #Exit with a failure status; a status of 0 would signal success.
        sys.exit(1)

    #Set URL
    try:
        driver.get(url)
    except Exception:
        #Do not leave an orphaned browser process behind when navigation fails
        driver.quit()
        raise

    return driver

This creates a dictionary of lists containing all the image names and image URLs found on the webpage.

If an image's `alt` text contains `.png`, that text is added to the dictionary as an image name.

If an image has a `data-src` attribute, its value is added to the dictionary as an image link.

In [None]:
def get_image_info(driver):
    """
    Collect image names and image links from the page the webdriver is on.

    The two lists are filled independently, so they may differ in length
    and are not aligned by position.

    Parameters
    ----------
    driver : WebDriver
        Google webdriver at a predetermined URL.

    Returns
    -------
    image_dict : `dict`
        Dictionary with two lists: `name` holds every `alt` text containing
        '.png', and `image_Link` holds every `data-src` attribute value
        that is present.
    """

    image_dict = {"name":[],"image_Link":[]}
    try:
        for image in driver.find_elements(By.TAG_NAME,"img"):
            #Fetch each attribute once; every call is a webdriver request
            alt_text = image.get_attribute("alt")
            #Guard against a missing alt attribute before the substring test
            if alt_text and '.png' in alt_text:
                image_dict['name'].append(alt_text)
            data_src = image.get_attribute("data-src")
            if data_src is not None:
                image_dict['image_Link'].append(data_src)
    except Exception as e:
        #Best effort: report the problem and return what was collected so far
        print(e)

    return image_dict

Now that we have the image names and image links, we must match the image names to the image links.

To do this, we need to search each link for the given image name.

However, there are a few things we need to address and fix:

    1. The image link contains the image name, except that spaces in the name are replaced with underscores.
    2. The image sizes in the links are not consistent (the preferred size is 250 pixels).

In [None]:
def link_resize(image_Link_list, size = '250'):
    """
    Rewrite the size value in each DayZ Wiki image URL.

    The size is the text between the first 'down/' in the link and the
    following '?' (or the end of the link when there is no '?').
    Links that do not contain 'down/' are returned unchanged.

    Parameters
    ----------
    image_Link_list : `list`
        A list containing url links to an image on the DayZ Wiki.

    size: `string`, optional
        Value of the size of the new image.
        Default resize value is 250 pixels. Non-string values are
        converted with `str`.

    Returns
    -------
    new_link_list : `list`
        A list containing url links to an image on the DayZ Wiki
        adjusted by the `size` parameter, in the same order as the input.
    """

    marker = 'down/'
    new_size = str(size)
    new_link_list = []

    for link in image_Link_list:
        #Split once around the marker so only that occurrence is touched
        prefix, found, tail = link.partition(marker)
        if not found:
            #Nothing to resize; keep the link exactly as it came in
            new_link_list.append(link)
            continue
        #Discard the old size, keep the '?' and the whole query string
        _old_size, question_mark, query = tail.partition('?')
        new_link_list.append(prefix + marker + new_size + question_mark + query)

    return new_link_list

Now we will match on the adjusted name and append the actual item name and image link to the dictionary.

In [None]:
def match_name_to_link(image_dict):
    """
    Pair every image name with each resized link that contains its
    underscore-adjusted form.

    Parameters
    ----------
    image_dict : `dict`
        Dictionary with the lists `name`, `adjusted_name` (same order as
        `name`) and `adjusted_links`.

    Returns
    -------
    final_pair: `dict`
        Dictionary with the parallel lists `item_Name` and `image_Link`.
        A name appears once per link whose text contains its adjusted name.
    """
    #Names drive the outer loop and links the inner one, so results are
    #ordered by name first and by link position second.
    matches = [
        (original_name, candidate_link)
        for original_name, underscored_name
        in zip(image_dict['name'], image_dict['adjusted_name'])
        for candidate_link in image_dict['adjusted_links']
        if underscored_name in candidate_link
    ]

    return {
        "item_Name": [matched_name for matched_name, _ in matches],
        "image_Link": [matched_link for _, matched_link in matches],
    }

We also need to add the category of the items to the dictionary.

We will derive the category from the URL.

In [None]:

def get_item_category(url):
    """
    Return the part of the URL that follows its last '/'.

    Parameters
    ----------
    url: `str`
        String of the URL to retrieve image information.

    Returns
    -------
    image_category: `str`
        Category of the item, i.e. the final path segment of `url`
        (the whole string when it contains no '/').
    """

    #Only the text after the final slash is of interest
    _, _, image_category = url.rpartition('/')

    return image_category

Create final dataframe for the items on the page

In [None]:
def create_item_frame(final_pair,url):
    """
    Build the per-page item DataFrame.

    Parameters
    ----------
    final_pair: `dict`
        Dictionary containing image name and the corresponding updated link.

    url: `str`
        URL of the scraped page; its last path segment becomes the
        `Category` value of every row.

    Returns
    -------
    item_df: `DataFrame`
        DataFrame built from `final_pair` with an added `Category` column
        and with '.png' removed from the `item_Name` values.
    """

    #One row per matched name/link pair
    item_df = pd.DataFrame(final_pair)
    #The same category label is assigned to every row
    item_df['Category'] = get_item_category(url)
    #Remove the '.png' text from each item name
    cleaned_names = []
    for raw_name in item_df['item_Name']:
        cleaned_names.append(raw_name.replace('.png',''))
    item_df['item_Name'] = cleaned_names

    return item_df
    

Use this to grab images from pages on the DayZ Wiki.

In [None]:
def execute_item_scrape(driver,url):
    """
    Master execution function.
    Run the full image pipeline for the page the driver is currently on.

    Parameters
    ----------
    driver: WebDriver
        The WebDriver instance to use.

    url: `str`
        String of the URL the image information belongs to; it is only
        used to derive the category.

    Returns
    -------
    item_df: `DataFrame`
        DataFrame containing image information from the given URL and dictionary.
    """

    #Step 1: collect raw names and links from the page
    scraped = get_image_info(driver)

    #Step 2: links use underscores where the names use spaces
    underscored_names = []
    for raw_name in scraped['name']:
        underscored_names.append(raw_name.replace(' ','_'))
    scraped['adjusted_name'] = underscored_names

    #Step 3: normalise every link to the default size
    scraped['adjusted_links'] = link_resize(scraped['image_Link'])

    #Step 4: pair names with their links and build the final frame
    return create_item_frame(match_name_to_link(scraped),url)

In [None]:
def _parse_slot_size(item_detail):
    """Return the last integer on the first line containing 'Slot', else None."""
    for line in item_detail:
        if 'Slot' in line:
            #Drop parentheses so a value such as '(2)' is seen as a number
            tokens = line.replace('(','').replace(')','').split()
            numbers = [int(token) for token in tokens if token.isdigit()]
            return numbers[-1] if numbers else None
    return None


def get_slot_size(url,first_item,top_range,exclude_list=None):
    """
    Retrieve the slot size for the items linked from the given page.

    A new webdriver is started for the call and always shut down before
    returning. Item pages that cannot be loaded or parsed are reported
    with `print` and skipped.

    Parameters
    ----------
    url: `str`
        String of the URL to fetch image information from.

    first_item: `str`
        The the name of the first item found on the page we
        want to scrape. If the name includes spaces, you must use
        underscores.

        Example: `Apples` or `45_ACP_Rounds`

    top_range: `int`
        The number of links/items to include in the scrape, counted from
        the first link whose href contains `first_item`.

    exclude_list: `list`, optional
        The list of items to exclude from the scrape. A link is skipped
        when it contains any of these substrings.

    Returns
    -------
    slot_size_dict: `dict`
        The dictionary containing the size of the items within the given URL.
        `item_Name` holds the item page links and `slot_Size` the parsed
        integers.

    Raises
    ------
    ValueError
        If no link on the page contains `first_item`.
    """

    if exclude_list is None:
        exclude_list = []

    #Set variables needed for the function
    slot_size_dict = {'item_Name': [], 'slot_Size': []}

    driver = start_driver_and_open_url(url)
    try:
        #Get all the links from the page
        a_refs = driver.find_elements(By.TAG_NAME,"a")

        #Locate the first link that points at `first_item`
        start_point_index = None
        for index, anchor in enumerate(a_refs):
            href = anchor.get_attribute('href')
            #Anchors without an href cannot match
            if href and first_item in href:
                start_point_index = index
                break

        if start_point_index is None:
            raise ValueError(
                f"No link containing '{first_item}' was found on {url}")

        end_point_index = start_point_index + top_range

        #Read the hrefs up front: the elements go stale once the driver
        #navigates away from the category page.
        new_refs_links = []
        for anchor in a_refs[start_point_index:end_point_index]:
            site = anchor.get_attribute('href')
            if site:
                new_refs_links.append(site)

        #LOGIC BLOCK:
        # For each link, if the link does not contain an excluded value:
        # 1. Open Link and focus on the information table
        # 2. Find the first line of the table that mentions 'Slot'
        # 3. Take the last integer on that line as the slot size
        # 4. Add information to the dictionary
        for links in new_refs_links:
            if any(elements in links for elements in exclude_list):
                continue

            try:
                driver.get(links)
            except Exception as e:
                #The previous page is still loaded; parsing it would record
                #the wrong item's size under this link.
                print(e)
                continue

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME,'infobox')))
            except Exception as e:
                #Best effort: report it and still try to parse the page
                print(e)

            print(links)
            slot_elements = driver.find_elements(By.TAG_NAME,"TBODY")
            if not slot_elements:
                print(f"Skipping {links}: no table body found")
                continue

            inner_text = slot_elements[0].get_attribute('innerText') or ''
            slot_size = _parse_slot_size(inner_text.split('\n'))
            if slot_size is None:
                print(f"Skipping {links}: no slot size found")
                continue

            slot_size_dict['item_Name'].append(links)
            slot_size_dict['slot_Size'].append(slot_size)
    finally:
        #Always release the browser, even when the scrape fails part-way
        driver.quit()

    return slot_size_dict

In [None]:
#Scrape the slot sizes of the ammunition items and save them to CSV.
ammo_url = 'https://dayz.fandom.com/wiki/Category:Ammunition'
#Links containing any of these substrings are skipped by get_slot_size
ammo_exclude = [
    '40mm_Explosive', '40mm_Smoke', 'Dart', 'Ammunition', 'ammunition',
    'Rocket', 'RPG', 'Loot', 'damage', 'VOG',
]
#Start at the '22_LR_Rounds' link and consider 28 links from there
ammo_slot_size = pd.DataFrame(
    get_slot_size(ammo_url, '22_LR_Rounds', 28, exclude_list=ammo_exclude)
)
ammo_slot_df = pd.DataFrame(ammo_slot_size)
ammo_slot_df.to_csv('ammo_slot_info.csv', index=False, header=True)

In [None]:
#Scrape the slot sizes of the food items and save them to CSV.
food_url = 'https://dayz.fandom.com/wiki/Category:Food'
#Links containing any of these substrings are skipped by get_slot_size
food_exclude = [
    'Egg',
    'Food_and_Drink',
    'Food_decay',
    'Fox_Steak',
]
#Start at the 'Apple' link and consider 75 links from there
food_slot_sizes = pd.DataFrame(
    get_slot_size(food_url, 'Apple', 75, exclude_list=food_exclude)
)
food_slot_df = pd.DataFrame(food_slot_sizes)
food_slot_df.to_csv('food_slot_info.csv', index=False, header=True)

In [None]:
#Scrape the slot sizes of the magazine items and save them to CSV.
magazine_url = 'https://dayz.fandom.com/wiki/Category:Magazines'
#Links containing any of these substrings are skipped by get_slot_size
magazine_exclude = [
    'Speedloader', '10_Round', '100Rnd_Beta_C-Mag', '10rd_SK_59/66_Clip',
    '10rd_Sporter_22_Mag', 'Snaploader', 'PM73', 'M249', 'Groza',
    'Coupled', '5rd_Mosin', 'Quiver', 'CO2', 'List', 'Magazines',
    'PB_Pistol', 'PKM', 'Shock', 'Uzi',
]
#Start at the '357_Speed' link and consider 62 links from there
magazine_slot_sizes = pd.DataFrame(
    get_slot_size(magazine_url, '357_Speed', 62, exclude_list=magazine_exclude)
)
magazine_slot_df = pd.DataFrame(magazine_slot_sizes)
magazine_slot_df.to_csv('magazine_slot_info.csv', index=False, header=True)

In [None]:
#Scrape the slot sizes of the attachment items and save them to CSV.
attachment_url = "https://dayz.fandom.com/wiki/Category:Attachments"
#Links containing any of these substrings are skipped by get_slot_size
attachment_exclude = [
    'Attachments',
    'PSO-1-1',
]
#Start at the '1PN51' link and consider 55 links from there
attachment_slot_sizes = pd.DataFrame(
    get_slot_size(attachment_url, '1PN51', 55, exclude_list=attachment_exclude)
)
attachment_slot_df = pd.DataFrame(attachment_slot_sizes)
attachment_slot_df.to_csv('attachment_slot_info.csv', index=False, header=True)


In [None]:
#Scrape the slot sizes of the equipment items and save them to CSV.
equipment_url = "https://dayz.fandom.com/wiki/Category:Equipment"
#Links containing any of these substrings are skipped by get_slot_size
equipment_exclude = [
    'Equipment',
    'Burlap_Sack',
    'Gas_Canister',
]
#Start at the '9V' link and consider 41 links from there
equipment_slot_sizes = pd.DataFrame(
    get_slot_size(equipment_url, '9V', 41, exclude_list=equipment_exclude)
)
equipment_slot_df = pd.DataFrame(equipment_slot_sizes)
equipment_slot_df.to_csv('equipment_slot_info.csv', index=False, header=True)
## Burlap Sack has a "hotbar bonus" entry that is being picked up by the 'Slot' keyword match.
## The quick fix is to include Burlap Sack in the list of excluded items; however, we should search for the SIZE
## keyword and parse characters from that point, rather than finding 'Slots' and working backwards.

## Gas canister has variations (small, medium, large) which have to be parsed separately.

In [None]:
#Start a single webdriver on the wiki home page. The image-scrape loop in a
#later cell navigates this same driver to each category URL.
driver = start_driver_and_open_url('https://dayz.fandom.com')

In [None]:
#Wiki pages to scrape for item images, keyed by a short label.
#The last path segment of each URL is later used as the item category.
url_dictionary = dict(
    ammunition_url='https://dayz.fandom.com/wiki/Ammunition',
    attachment_url='https://dayz.fandom.com/wiki/Attachments',
    backpack_url='https://dayz.fandom.com/wiki/Backpack',
    clothing_url='https://dayz.fandom.com/wiki/Clothing',
    equipment_url='https://dayz.fandom.com/wiki/Equipment',
    food_url='https://dayz.fandom.com/wiki/Food_and_Drink',
    magazine_url='https://dayz.fandom.com/wiki/Magazines',
    medical_url='https://dayz.fandom.com/wiki/Medical_Supplies',
    resources_url='https://dayz.fandom.com/wiki/Resources',
    weapons_url='https://dayz.fandom.com/wiki/Weapons',
)

In [None]:
#Scrape every page in url_dictionary with the shared driver.
#df_name collects a label per page, df_list the matching DataFrame.
df_name = []
df_list = []
for url_names, url_link in url_dictionary.items():
    #Point the shared driver at the page before scraping it
    driver.get(url_link)
    df_name.append(f'{url_names}_image_df')
    df_list.append(execute_item_scrape(driver, url_link))
    


In [None]:
#Start from an empty frame; the per-page frames are concatenated onto it in the next cell.
master_df = pd.DataFrame()

In [None]:
#Append every per-page frame to master_df with a single concat call.
#Concatenating inside a loop re-copies the accumulated rows on every pass.
master_df = pd.concat([master_df, *df_list])

In [None]:
#Keep only the first row for each (item_Name, image_Link) pair ...
master_df = master_df.drop_duplicates(subset=['item_Name', 'image_Link'], keep='first')
#... then renumber the rows from 0, discarding the old index.
master_df = master_df.reset_index(drop=True)

In [None]:
#Drop rows that are identical in every column, then renumber the rows from 0.
#NOTE(review): if the (item_Name, image_Link) de-duplication cell has already
#run, no fully identical rows should remain - confirm this cell is still needed.
master_df = master_df.drop_duplicates().reset_index(drop = True)

In [None]:
#Write the combined image table to CSV with a header row and without the index column.
master_df.to_csv('dayz_images_info.csv',header=True,index=False)

In [None]:
    #Bare expression: evaluates master_df so the notebook shows it as the cell output.
    master_df