In [12]:
import pandas as pd
import sys
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#Set options to run Chrome in 'Headless' mode with a fixed window size
chrome_options = Options()
#Run without opening a visible browser window
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
#Chrome's --window-size switch takes "width,height" (comma separated)
chrome_options.add_argument("--window-size=1920,1080")


In [13]:
def start_driver_and_open_url(url):
    """
    Starts a Chrome webdriver and opens the given URL.

    Parameters
    ----------
    url: `str`
        String of the URL to retrieve image information.

    Returns
    -------
    driver : WebDriver
        Chrome webdriver that has already loaded `url`.

    Raises
    ------
    RuntimeError
        If the chromedriver cannot be installed or Chrome cannot be started.
        The original error is attached as the cause.
    """

    #Opens driver. ChromeDriverManager().install() supplies the chromedriver
    #that Service is started with (installing it if not present).
    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )
    except Exception as err:
        #Raise rather than exit with status 0, so the failure is visible
        #and is not reported as a successful run.
        raise RuntimeError(f"Setting Driver Error: {err}") from err

    #Set URL. If navigation fails, close the browser before re-raising so
    #no orphaned Chrome process is left behind.
    try:
        driver.get(url)
    except Exception:
        driver.quit()
        raise

    return driver

This creates a dictionary of lists containing all the image names and image URLs found on the webpage.

If an image's `alt` text contains `.png`, the `alt` text is stored in the dictionary as the image name.

If an image has a `data-src` attribute, its value is stored in the dictionary as the image link.

In [14]:
def get_image_info(driver):
    """
    This function retrieves image names and links from every `img`
    element on the page currently loaded in the given webdriver.

    An image's `alt` text is recorded as its name when it contains
    '.png'. An image's `data-src` value is recorded as its link when the
    attribute is present. The two lists are filled independently, so
    they are not guaranteed to have the same length or ordering.

    Any exception raised while reading the page is printed and the
    information gathered so far is returned.

    Parameters
    ----------
    driver : WebDriver
        Google webdriver at a predetermined URL.

    Returns
    -------
    image_dict : `dict`
        Dictionary with two lists: 'name' (image names) and
        'image_link' (links to the images).
    """

    image_dict = {"name":[],"image_link":[]}
    try:
        for image in driver.find_elements(By.TAG_NAME,"img"):
            #Look up each attribute once instead of on every use
            alt_text = image.get_attribute("alt")
            data_src = image.get_attribute("data-src")
            #Get Image Name. A missing alt (None) is skipped rather than
            #raising a TypeError that would abort the rest of the loop.
            if alt_text and '.png' in alt_text:
                image_dict['name'].append(alt_text)
            #Get Image Link
            if data_src is not None:
                image_dict['image_link'].append(data_src)
    except Exception as e:
        print(e)

    return image_dict

Now that we have the image names and image links, we must match the image names to the image links.

To do this, we need to search each link for the given image name.

However, there are a few things we need to address and fix:

    1. The image link contains the image name, except that spaces in the name are replaced with underscores.
    2. The image size is not consistent (preference is 250 pixels).

In [15]:
def link_resize(image_link_list, size = '250'):
    """
    Function to resize images given their URLs from the DayZ Wiki.

    The size is the part of the link between the first 'down/' and the
    following '?'. That part is replaced with `size`, and everything
    from the '?' onwards is kept as-is. Links that do not contain
    'down/' are returned unchanged.

    Parameters
    ----------
    image_link_list : `list`
        A list containing url links to an image on the DayZ Wiki.

    size: `str` or `int`, optional
        Value of the size of the new image.
        Default resize value is 250 pixels.

    Returns
    -------
    new_link_list : `list`
        A list containing url links to an image on the DayZ Wiki
        adjusted by the `size` parameter.
    """

    marker = 'down/'
    #Accept numeric sizes as well as strings
    size = str(size)
    new_link_list = []

    for link in image_link_list:
        #Split the link around the marker that precedes the size value
        head, found, tail = link.partition(marker)
        if not found:
            #No size segment to rewrite; keep the link untouched
            new_link_list.append(link)
            continue
        #Discard the old size; keep the query string (if any) intact
        _old_size, query_sep, query = tail.partition('?')
        new_link_list.append(head + marker + size + query_sep + query)

    return new_link_list

Now we will match on the adjusted name and append the actual item name and image link to the dictionary.

In [16]:
def match_name_to_link(image_dict):
    """
    Pair every image name with each resized link that contains the
    underscore version of that name.

    Parameters
    ----------
    image_dict : `dict`
        Dictionary of lists. The keys read here are 'name' (original
        image names), 'adjusted_name' (names with underscores, aligned
        with 'name') and 'adjusted_links' (resized image links).

    Returns
    -------
    final_pair: `dict`
        Dictionary with 'item_Name' and 'image_link' lists, one entry
        per (name, link) match, ordered by name and then by link.
    """
    #A pair is recorded whenever the adjusted name is a substring of a link
    matches = [
        (original, url)
        for original, underscored in zip(image_dict['name'],image_dict['adjusted_name'])
        for url in image_dict['adjusted_links']
        if underscored in url
    ]

    return {
        "item_Name": [original for original, _ in matches],
        "image_link": [url for _, url in matches],
    }

We also need to add the category of the items to the dictionary.

We will derive the category from the URL.

In [17]:

def get_item_category(url):
    """
    Retrieve the item category from the given URL.

    The category is the last path segment of the URL. Any query string,
    fragment or trailing slash is ignored.

    Parameters
    ----------
    url: `str`
        String of the URL to retrieve image information.

    Returns
    -------
    image_category: `str`
        Category of the item.
    """

    #Drop any fragment and query string, then any trailing slash, so the
    #last segment is the page name rather than '' or 'Page?x=1'
    path = url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    #Split the URL and grab the category
    image_category = path.rsplit('/', 1)[-1]

    return image_category


Create final dataframe for the items on the page

In [18]:
def create_item_frame(final_pair,url):
    """
    Build the item DataFrame for one page.

    Parameters
    ----------
    final_pair: `dict`
        Dictionary containing image name and the corresponding updated link.
    url: `str`
        URL of the page; its category (see `get_item_category`) fills
        the 'Category' column of every row.

    Returns
    -------
    item_df: `DataFrame`
        The contents of `final_pair` plus a 'Category' column.
    """

    #Build the frame and add the category column in one expression
    return pd.DataFrame(final_pair).assign(Category=get_item_category(url))
    

Use this to grab images from pages on the DayZ Wiki.

In [19]:
def execute_item_scrape(driver,url):
    """
    Master execution function.
    Runs the full pipeline for the page loaded in `driver`: collect
    image names and links, build underscore versions of the names,
    resize the links, match names to links, and build the DataFrame.

    Parameters
    ----------
    driver: WebDriver
        Webdriver passed to `get_image_info` to read the images from.
    url: `str`
        URL passed to `create_item_frame`, which derives the category
        from it.

    Returns
    -------
    item_df: `DataFrame`
        Frame returned by `create_item_frame` for the matched
        name/link pairs.
    """

    #Collect raw names and links
    scraped = get_image_info(driver)

    #Names with spaces swapped for underscores
    underscored = []
    for raw_name in scraped['name']:
        underscored.append(raw_name.replace(' ','_'))
    scraped['adjusted_name'] = underscored

    #Links rewritten to the default size
    scraped['adjusted_links'] = link_resize(scraped['image_link'])

    #Pair names with links and build the final frame
    return create_item_frame(match_name_to_link(scraped),url)

In [20]:
#Start Chrome on the wiki home page; the scraping loop reuses this driver
#and navigates it to each category page with driver.get
driver = start_driver_and_open_url('https://dayz.fandom.com')




In [21]:
#Category pages to scrape, keyed by a short label; the last path segment
#of each URL is used as the item category
url_dictionary = {
    'ammunition_url': 'https://dayz.fandom.com/wiki/Ammunition',
    'attachment_url': 'https://dayz.fandom.com/wiki/Attachments',
    'backpack_url': 'https://dayz.fandom.com/wiki/Backpack',
    'clothing_url': 'https://dayz.fandom.com/wiki/Clothing',
    'equipment_url': 'https://dayz.fandom.com/wiki/Equipment',
    'food_url': 'https://dayz.fandom.com/wiki/Food_and_Drink',
    'magazine_url': 'https://dayz.fandom.com/wiki/Magazines',
    'medical_url': 'https://dayz.fandom.com/wiki/Medical_Supplies',
    'resources_url': 'https://dayz.fandom.com/wiki/Resources',
    'weapons_url': 'https://dayz.fandom.com/wiki/Weapons',
}

In [23]:
master_df = pd.DataFrame()
#Accumulators filled by the loop below. They must exist before the loop
#runs, otherwise the cell fails with NameError on a fresh kernel.
df_name = []
df_list = []

for url_names, url_link in url_dictionary.items():
    #Access the URL
    driver.get(url_link)
    #Record a label and the scraped frame for this category page
    df_name.append(f'{url_names}_image_df')
    df_list.append(execute_item_scrape(driver,url_link))
#NOTE(review): no driver.quit() is visible in this section; close the
#browser once scraping is finished.
    


In [58]:
#Start from an empty frame; the per-category frames are concatenated onto it
master_df = pd.DataFrame()

In [59]:
#Combine every category frame in a single concat (instead of growing the
#frame inside a loop), then drop exact duplicate rows and renumber the index
master_df = pd.concat([master_df, *df_list]).drop_duplicates().reset_index(drop=True)

In [60]:
#Keep only the first row for each (item name, image link) pair and
#renumber the index from 0
master_df = master_df.drop_duplicates(subset=['item_Name','image_link'], keep='first', ignore_index=True)

In [61]:
#Write the table to CSV with a header row and without the index column
master_df.to_csv('dayz_images_info.csv', index=False)

In [62]:
    #Display the final table as the cell output
    master_df

Unnamed: 0,item_Name,image_link,Category
0,Sporter 22 Wood.png,https://static.wikia.nocookie.net/dayz_gameped...,Ammunition
1,15rd Sporter 22 Mag.png,https://static.wikia.nocookie.net/dayz_gameped...,Ammunition
2,30 rnd 22 mag.png,https://static.wikia.nocookie.net/dayz_gameped...,Ammunition
3,Ammo 380.png,https://static.wikia.nocookie.net/dayz_gameped...,Ammunition
4,AmmoBox 380 35Rnd.png,https://static.wikia.nocookie.net/dayz_gameped...,Ammunition
...,...,...,...
512,Plastic Explosive.png,https://static.wikia.nocookie.net/dayz_gameped...,Weapons
513,Improvised Explosive.png,https://static.wikia.nocookie.net/dayz_gameped...,Weapons
514,LandMine.png,https://static.wikia.nocookie.net/dayz_gameped...,Weapons
515,RGD-5 Grenade.png,https://static.wikia.nocookie.net/dayz_gameped...,Weapons
