# Final Scraping Code
This notebook is the final scraping code, condensed from the previous three notebooks into the approach that worked.
It loads Steam in Selenium, passes the age verification, scrapes the top-sellers list, and then scrapes the page of each game in that list.

In [29]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np


# Location of the local chromedriver binary used to launch Chrome.
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# Steam search page filtered to the top sellers.
url = 'https://store.steampowered.com/search/?filter=topsellers'

driver = webdriver.Chrome(chromedriver)
driver.get(url)

#Age verification for site
# The first game in the list is opened so the age form can be filled in once;
# afterwards the browser returns to the list.
# Elements are located with find_element(By.XPATH, ...) instead of the
# deprecated find_element_by_xpath helper, and explicit waits are used so the
# lookups do not run before the page has rendered the elements.
wait = WebDriverWait(driver, 10)

# finds first game button
button = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="search_resultsRows"]/a[1]'))
)
button.click()

# finds the year slider
year_slider = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="ageYear"]'))
)
#chooses an appropriate age
right_age = driver.find_element(By.XPATH, '//*[@id="ageYear"]/option[91]')
#confirm button.
confirm = driver.find_element(By.XPATH, '//*[@id="app_agegate"]/div[1]/div[3]/a[1]')

year_slider.click()
right_age.click()
confirm.click()
driver.get(url) # goes back to list after verification

# Creating master list of games

In [5]:
# Scrolls the 'top sellers' page to the bottom repeatedly so that further
# results get appended (presumably the list is loaded lazily on scroll).
# At most MAX_SCROLLS scrolls are performed, as before, but the loop now stops
# early once the page height has not grown for MAX_STALLED_SCROLLS consecutive
# checks, instead of always sleeping through every iteration.
MAX_SCROLLS = 800
SCROLL_PAUSE_SECONDS = 1
MAX_STALLED_SCROLLS = 5

last_height = 0
stalled_scrolls = 0
for _ in range(MAX_SCROLLS):
    #Scroll
    driver.execute_script(
        "window.scrollTo(0, document.documentElement.scrollHeight);"
    )
    time.sleep(SCROLL_PAUSE_SECONDS)

    current_height = driver.execute_script(
        "return document.documentElement.scrollHeight;"
    )
    if current_height == last_height:
        stalled_scrolls += 1
        if stalled_scrolls >= MAX_STALLED_SCROLLS:
            break
    else:
        stalled_scrolls = 0
        last_height = current_height

In [7]:
# Pulls the container holding the search results out of the rendered page and
# collects every hyperlink inside it; each hyperlink is treated as one game.
soup = BeautifulSoup(driver.page_source, 'html5lib')
table = soup.find('div', id='search_resultsRows')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError("Could not find the 'search_resultsRows' container in the page source")
rows = list(table.find_all('a'))
# Last expression of the cell: number of links found.
len(rows)

16863

In [30]:
# Extracts the first bits of data from the master list: game name, link,
# release date, whether the game is on sale, and the original price.
# NOTE(review): `games` is keyed by title, so rows that share a title overwrite
# one another (the recorded outputs show 16863 rows but 16730 entries) - confirm
# this de-duplication is intended.

games = {}

for row in rows:
    title = row.find('span', class_='title').text
    # Named `link` so the module-level `url` (the search page address) is not
    # overwritten by the loop.
    link = row.get('href')
    release_date = row.find('div', class_='col search_released responsive_secondrow').text

    # An empty discount cell means the game is not on sale.
    discount_text = row.find('div', class_='col search_discount responsive_secondrow').text.strip()
    if discount_text == '':
        been_sale = 0
        orig_price = row.find('div', class_='col search_price responsive_secondrow').text.strip()
    else:
        been_sale = 1
        # On sale: the struck-through price is the original one. Stripped so
        # both branches store the price in the same form.
        orig_price = row.find('strike').text.strip()

    games[title] = (link,
                    release_date,
                    been_sale,
                    orig_price)
    

In [31]:
# Build a DataFrame from the scraped list: `games` maps title -> tuple, so the
# frame is transposed to get one row per title, then the four tuple positions
# are given column names.
gamesdf1 = pd.DataFrame(games).transpose()
gamesdf1.columns = [
    'link',
    'release_date',
    'been_sale',
    'orig_price',
]

gamesdf1.shape

(16730, 4)

# Scraping each game from master list

In [32]:

def _token_or_nan(text, position):
    '''Return the whitespace-separated token at `position` in `text`, or NaN when there are too few tokens.'''
    tokens = text.split()
    if len(tokens) > position:
        return tokens[position]
    return np.nan


def get_game_details(url):

    '''
    Open one game page in the shared Selenium `driver` and scrape it.

    Parameters
    ----------
    url : str
        Address of the game page to load (the notebook passes the values of
        gamesdf1.link).

    Returns
    -------
    dict
        One record with the keys 'title', 'num_reviews', 'review_rating',
        'num_languages', 'tags', 'num_tags', 'genres', 'num_genres' and
        'publisher'. A field whose element is not found on the page is
        np.nan, except 'title', which falls back to the placeholder 'HELP'.
        'num_reviews' and 'num_languages' are the third whitespace-separated
        token of their link text (np.nan if the text is shorter than that).
    '''

    # Load the page in the already-open browser session (module-level `driver`).
    driver.get(url)

    # Scroll to the bottom a few times with a short pause before reading the
    # page source - presumably to give lazily rendered sections time to appear.
    for _ in range(3):
        #Scroll
        driver.execute_script(
            "window.scrollTo(0, document.documentElement.scrollHeight);" #Alternatively, document.body.scrollHeight
        )
        time.sleep(.2)

    soup = BeautifulSoup(driver.page_source, "html5lib")

    headers = ['title', 'num_reviews', 'review_rating',
              'num_languages', 'tags', 'num_tags',
              'genres', 'num_genres', 'publisher']

    #Get title
    title_div = soup.find('div', class_='apphub_AppName')
    title = 'HELP' if title_div is None else title_div.text

    #Get number of reviews: every level of the nesting may be missing
    num_reviews = np.nan
    reviews_div = soup.find('div', id='Reviews_summary')
    if reviews_div is not None:
        view_all_div = reviews_div.find('div', id='ViewAllReviewssummary')
        if view_all_div is not None:
            reviews_link = view_all_div.find('a')
            if reviews_link is not None:
                num_reviews = _token_or_nan(reviews_link.text, 2)

    #Get review rating: text of the first span in the summary bar
    rating = np.nan
    summary_bar = soup.find(class_='user_reviews_summary_bar')
    if summary_bar is not None:
        summary_spans = summary_bar.find_all('span')
        if summary_spans:
            rating = summary_spans[0].text

    #Get number of languages offered in
    languages_link = soup.find('a', class_='all_languages')
    if languages_link is None:
        num_languages = np.nan
    else:
        num_languages = _token_or_nan(languages_link.text, 2)

    #Get tags associated with game, and how many there are
    tags_div = soup.find('div', class_='glance_tags')
    if tags_div is None:
        tags_list = np.nan
        num_tags = np.nan
    else:
        tags_list = [tag_link.text.strip() for tag_link in tags_div.find_all('a')]
        num_tags = len(tags_list)

    #genres list and number of genres
    # NOTE(review): this keeps every link of the first details block except the
    # last two. The sample rows displayed later in this notebook list
    # 'Square Enix' and 'Hazelight' among the genres, so the slice appears to
    # let non-genre links through on some pages - confirm and tighten the selector.
    details_div = soup.find('div', class_='details_block')
    if details_div is None:
        genres_list = np.nan
        num_genres = np.nan
    else:
        genres_list = [genre_link.text for genre_link in details_div.find_all('a')[0:-2]]
        num_genres = len(genres_list)

    #publisher: text of the last link in the left "glance" container
    publisher = np.nan
    glance_div = soup.find('div', class_='glance_ctn_responsive_left')
    if glance_div is not None:
        glance_links = glance_div.find_all('a')
        if glance_links:
            publisher = glance_links[-1].text

    #create the record for this game
    games_dict = dict(zip(headers, [title,
                                num_reviews,
                                rating,
                                num_languages,
                                tags_list,
                                num_tags,
                                genres_list,
                                num_genres,
                                publisher]))

    return games_dict

In [33]:
# Visit every game link collected in gamesdf1 and keep one record per link.
# The records are appended one at a time so that whatever was gathered before
# an interruption is still available in games_info.
games_info = []

# The loop variable is not called `url`, so the module-level `url` holding the
# search page address is left untouched.
for game_url in gamesdf1.link:
    games_info.append(get_game_details(game_url))





In [34]:
# Sanity check: number of scraped records (one is appended per link in gamesdf1).
len(games_info)

16730

In [35]:
# Convert the list of per-game dicts into a DataFrame (one row per record, one
# column per dict key). Note that the name games_info is rebound from the list
# to the frame here.
games_info = pd.DataFrame(games_info)

In [36]:
# Column dtypes and non-null counts, to see how often each field was found.
games_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16730 entries, 0 to 16729
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          16730 non-null  object 
 1   num_reviews    5806 non-null   object 
 2   review_rating  14165 non-null  object 
 3   num_languages  6932 non-null   object 
 4   tags           15126 non-null  object 
 5   num_tags       15126 non-null  float64
 6   genres         16727 non-null  object 
 7   num_genres     16727 non-null  float64
 8   publisher      15103 non-null  object 
dtypes: float64(2), object(7)
memory usage: 1.1+ MB


In [45]:
# Preview the first three scraped records.
games_info.head(3)

Unnamed: 0,title,num_reviews,review_rating,num_languages,tags,num_tags,genres,num_genres,publisher
0,NieR Replicant™ ver.1.22474487139...,,,9,"[Great Soundtrack, Action, RPG, Adventure, Sto...",20.0,"[Action, Adventure, RPG, Square Enix]",4.0,Square Enix
1,OUTRIDERS,36837.0,Mixed,13,"[RPG, Action, Adventure, Co-op, Third-Person S...",20.0,"[Action, Adventure, RPG]",3.0,Square Enix
2,It Takes Two,15688.0,Overwhelmingly Positive,12,"[Co-op, Adventure, Puzzle, 3D Platformer, Puzz...",20.0,"[Action, Adventure, Hazelight]",3.0,Electronic Arts


# Merging into one total Data Frame

In [54]:
# Move the index of gamesdf1 into a regular column named 'index'.
# Guarded so that re-running the cell is a no-op: once the frame already has
# a default RangeIndex, resetting again would only insert another index column.
if not isinstance(gamesdf1.index, pd.RangeIndex):
    gamesdf1.reset_index(inplace=True)

In [57]:
# Rename the 'index' column to 'title' in place; the later merge joins on 'title'.
gamesdf1.rename({'index': 'title'}, axis='columns', inplace=True)

In [59]:
# Combine the list-level data with the per-page data. This is an inner join on
# 'title', so only titles present in both frames are kept.
totaldf = gamesdf1.merge(games_info, how='inner', on='title')

In [60]:
# Column dtypes and non-null counts of the merged frame.
totaldf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15109 entries, 0 to 15108
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          15109 non-null  object 
 1   link           15109 non-null  object 
 2   release_date   15109 non-null  object 
 3   been_sale      15109 non-null  object 
 4   orig_price     15109 non-null  object 
 5   num_reviews    5793 non-null   object 
 6   review_rating  14152 non-null  object 
 7   num_languages  6928 non-null   object 
 8   tags           15109 non-null  object 
 9   num_tags       15109 non-null  float64
 10  genres         15109 non-null  object 
 11  num_genres     15109 non-null  float64
 12  publisher      15086 non-null  object 
dtypes: float64(2), object(11)
memory usage: 1.6+ MB


In [64]:
# Save the merged table as CSV without the index column.
# The separator before the file name is a forward slash: the original raw
# string used a backslash there, which in a POSIX path is an ordinary
# character and so became part of the file name instead of separating the
# METIS directory from it.
output_path = '/Users/michaelharnett/Desktop/METIS/totaldf.csv'
# Create the target directory if needed so the write does not fail on a
# missing folder.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
totaldf.to_csv(output_path, index=False)