In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
import pandas as pd
from bs4 import BeautifulSoup

import os
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

driver = webdriver.Chrome(chromedriver)
driver.get('https://steamdb.info/graph/')

# Alternatives to Chrome: Firefox, phantomjs

In [2]:
select_dropdown = driver.find_element_by_id('table-apps_length')
for option in select_dropdown.find_elements_by_tag_name('option'):
    if option.text == 'All':
        option.click() # select() in earlier versions of webdriver
        break

In [3]:
# Selenium hands the page to Beautiful soup

soup_level1=BeautifulSoup(driver.page_source, 'lxml')

In [4]:
#import requests
#import pandas as pd
#from bs4 import BeautifulSoup

class HTMLTableParser:
   
    def parse_url(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        return [(table['id'],self.parse_html_table(table))\
                for table in soup.find_all('table')]  
    
    def parse_soup(self, soup):
        return [(table['id'],self.parse_html_table(table))\
                for table in soup.find_all('table')]  

    def parse_html_table(self, table):
        n_columns = 0
        n_rows=0
        column_names = []

        # Find number of rows and columns
        # we also find the column titles if we can
        for row in table.find_all('tr'):
            
            # Determine the number of rows in the table
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows+=1
                if n_columns == 0:
                    # Set the number of columns for our table
                    n_columns = len(td_tags)
                    
            # Handle column names if we find them
            th_tags = row.find_all('th') 
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())

        # Safeguard on Column Titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0,n_columns)
        df = pd.DataFrame(columns = columns,
                          index= range(0,n_rows))
        row_marker = 0
        for row in table.find_all('tr'):
            column_marker = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[row_marker,column_marker] = column.get_text()
                column_marker += 1
            if len(columns) > 0:
                row_marker += 1
                
        # Convert to float if possible
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass
        
        return df

In [6]:
steam_users_table.shape

(8503, 6)

In [5]:
hp = HTMLTableParser()
steam_users_table = hp.parse_soup(soup_level1)[0][1] # Grabbing the table from the tuple
steam_users_table.head()

Unnamed: 0,Unnamed: 1,AppID,Name,Current,24h Peak,All-Time Peak
0,+,753.0,Steam,9469112,14752493,18537490
1,+,578080.0,PLAYERUNKNOWN'S BATTLEGROUNDS,282809,1992584,3257248
2,+,570.0,Dota 2,226441,621653,1295114
3,+,730.0,Counter-Strike: Global Offensive,172999,421633,854801
4,+,359550.0,Tom Clancy's Rainbow Six Siege,40794,99486,178953


In [7]:
#list(steam_users_table)
steam_users_table = steam_users_table.drop([' '], axis=1)

In [8]:
steam_users_table.shape

(8503, 5)

In [9]:
steam_users_table.head()

Unnamed: 0,AppID,Name,Current,24h Peak,All-Time Peak
0,753.0,Steam,9469112,14752493,18537490
1,578080.0,PLAYERUNKNOWN'S BATTLEGROUNDS,282809,1992584,3257248
2,570.0,Dota 2,226441,621653,1295114
3,730.0,Counter-Strike: Global Offensive,172999,421633,854801
4,359550.0,Tom Clancy's Rainbow Six Siege,40794,99486,178953


In [10]:
#steam_users_table.to_csv('steam_users_table.csv', index=False)

In [36]:
# pulling in the Scrapy data from Steam

steam_table = pd.read_json('/Users/etheredgej/ds/metis/metisgh/steam-scraper-JNE/output/products_all.json')

In [38]:
steam_table.head()

Unnamed: 0,app_name,developer,discount_price,early_access,genres,id,mature_reasons,metascore,overall_reviews,price,publisher,recent_reviews,release_date,reviews_url,sentiment,specs,tags,title,url
0,! That Bastard Is Trying To Steal Our Gold !,WTFOMGames,,False,"[Action, Adventure, Casual, Indie]",449940,,,- 46% of the 15 user reviews for this game are...,3.99,WTFOMGames,,2016-03-01,http://steamcommunity.com/app/449940/reviews/?...,Mixed,"[Single-player, Steam Trading Cards, Partial C...","[Action, Indie, Casual, Adventure, Memes, Psyc...",! That Bastard Is Trying To Steal Our Gold !,http://store.steampowered.com/app/449940/_That...
1,- Arcane RERaise -,Arcane Raise,,False,"[Adventure, Casual, Indie, RPG, Strategy]",603770,,,- 31% of the 38 user reviews for this game are...,2.99,ArcaneRaise,,2017-04-04,http://steamcommunity.com/app/603770/reviews/?...,Mostly Negative,"[Single-player, Steam Achievements, Steam Trad...","[Adventure, RPG, Indie, Casual, Strategy, RPGM...",- Arcane RERaise -,http://store.steampowered.com/app/603770/_Arca...
2,- Arcane Raise -,Arcane Raise,,False,"[Adventure, Casual, Indie, RPG, Strategy]",603750,,,- 38% of the 59 user reviews for this game are...,2.99,ArcaneRaise,,2017-03-10,http://steamcommunity.com/app/603750/reviews/?...,Mostly Negative,"[Single-player, Steam Achievements, Steam Trad...","[Adventure, RPG, Strategy, Casual, RPGMaker, J...",- Arcane Raise -,http://store.steampowered.com/app/603750/_Arca...
3,- Arcane preRaise -,Arcane Raise,,False,"[Adventure, Casual, Indie, RPG, Strategy]",603780,,,- 28% of the 25 user reviews for this game are...,2.99,ArcaneRaise,,2017-05-02,http://steamcommunity.com/app/603780/reviews/?...,Mostly Negative,"[Single-player, Steam Achievements, Steam Trad...","[RPG, Casual, Adventure, Indie, Strategy, Fant...",- Arcane preRaise -,http://store.steampowered.com/app/603780/_Arca...
4,(VR)西汉帝陵 The Han Dynasty Imperial Mausoleums,,,False,,568330,,,- 72% of the 11 user reviews for this game are...,1.99,,,,http://steamcommunity.com/app/568330/reviews/?...,Mostly Positive,"[Single-player, HTC Vive, Tracked Motion Contr...","[Indie, VR]",,http://store.steampowered.com/app/568330/VR_Th...


In [39]:
steam_combined = pd.merge(steam_table, steam_users_table, left_on='app_name', right_on='Name', how='inner', sort=False)

In [40]:
steam_combined.head()

Unnamed: 0,app_name,developer,discount_price,early_access,genres,id,mature_reasons,metascore,overall_reviews,price,...,sentiment,specs,tags,title,url,AppID,Name,Current,24h Peak,All-Time Peak
0,! That Bastard Is Trying To Steal Our Gold !,WTFOMGames,,False,"[Action, Adventure, Casual, Indie]",449940,,,- 46% of the 15 user reviews for this game are...,3.99,...,Mixed,"[Single-player, Steam Trading Cards, Partial C...","[Action, Indie, Casual, Adventure, Memes, Psyc...",! That Bastard Is Trying To Steal Our Gold !,http://store.steampowered.com/app/449940/_That...,449940.0,! That Bastard Is Trying To Steal Our Gold !,1,27,1951
1,- Arcane RERaise -,Arcane Raise,,False,"[Adventure, Casual, Indie, RPG, Strategy]",603770,,,- 31% of the 38 user reviews for this game are...,2.99,...,Mostly Negative,"[Single-player, Steam Achievements, Steam Trad...","[Adventure, RPG, Indie, Casual, Strategy, RPGM...",- Arcane RERaise -,http://store.steampowered.com/app/603770/_Arca...,603770.0,- Arcane RERaise -,21,262,3017
2,- Arcane preRaise -,Arcane Raise,,False,"[Adventure, Casual, Indie, RPG, Strategy]",603780,,,- 28% of the 25 user reviews for this game are...,2.99,...,Mostly Negative,"[Single-player, Steam Achievements, Steam Trad...","[RPG, Casual, Adventure, Indie, Strategy, Fant...",- Arcane preRaise -,http://store.steampowered.com/app/603780/_Arca...,603780.0,- Arcane preRaise -,34,76,3016
3,#WarGames,Eko,,False,"[Adventure, Indie]",779420,,,- 66% of the 54 user reviews for this game are...,2.99,...,Mixed,"[Single-player, Steam Achievements, Full contr...","[Adventure, Indie, FMV]",#WarGames,http://store.steampowered.com/app/779420/WarGa...,779420.0,#WarGames,2,5,36
4,$1 Ride,BeHappy Studios,,False,"[Action, Casual, Indie]",508290,,,- 58% of the 80 user reviews for this game are...,0.98,...,Mixed,"[Single-player, Steam Trading Cards]","[Casual, Indie, Action]",$1 Ride,http://store.steampowered.com/app/508290/1_Ride/,508290.0,$1 Ride,1,4,2002


In [41]:
steam_table.shape

(20311, 19)

In [42]:
steam_users_table.shape

(8503, 5)

In [43]:
steam_combined.shape

(5446, 24)

I noticed after making a list of games in the steam_users_table dataframe that didn't match with games in the steam_table dataframe that some were due to the ® symbol. Let's see how much that cleans things up:

In [44]:
steam_table['app_name'].replace(regex=True,inplace=True,to_replace=r'®',value=r'')

In [45]:
steam_combined = pd.merge(steam_table, steam_users_table, left_on='app_name', right_on='Name', how='inner', sort=False)

In [46]:
steam_combined.shape

(5508, 24)

So, the ® symbol was responsible for some of the matching problems, but a pretty underwhelming percentage.

Let's try fuzzy matching since we lost 3000 of our entries on that inner join:

In [48]:
# Creating my merged data frame
#data = df_pitches.merge(df_salaries, 
#                        left_on='player_name', 
#                        right_on='player_name', 
#                        how='outer', 
#                        suffixes=["","_salary"] #since they are the same name, we need a suffix
#                       )

steam_combined_outer_join = pd.merge(steam_table, steam_users_table, left_on='app_name', right_on='Name', how='outer', sort=False)

#Selecing people with missing salaries
missing_app_name = steam_combined_outer_join[steam_combined_outer_join.app_name.isnull()]

#Displaying results
missing_app_name.reset_index(inplace=True,drop='index')
missing_app_name.head(15)

Unnamed: 0,app_name,developer,discount_price,early_access,genres,id,mature_reasons,metascore,overall_reviews,price,...,sentiment,specs,tags,title,url,AppID,Name,Current,24h Peak,All-Time Peak
0,,,,,,,,,,,...,,,,,,753.0,Steam,9469112,14752493,18537490
1,,,,,,,,,,,...,,,,,,578080.0,PLAYERUNKNOWN'S BATTLEGROUNDS,282809,1992584,3257248
2,,,,,,,,,,,...,,,,,,359550.0,Tom Clancy's Rainbow Six Siege,40794,99486,178953
3,,,,,,,,,,,...,,,,,,230410.0,Warframe,36867,57184,121862
4,,,,,,,,,,,...,,,,,,252490.0,Rust,25119,33933,68434
5,,,,,,,,,,,...,,,,,,271590.0,Grand Theft Auto V,20699,54384,364548
6,,,,,,,,,,,...,,,,,,218620.0,PAYDAY 2,16978,25456,247709
7,,,,,,,,,,,...,,,,,,552520.0,Far Cry 5,16718,19383,92445
8,,,,,,,,,,,...,,,,,,238960.0,Path of Exile,16473,25685,98445
9,,,,,,,,,,,...,,,,,,289070.0,Sid Meier's Civilization VI,15597,19301,162657


In [None]:
steam_table['app_name'].replace(regex=True,inplace=True,to_replace=r'®',value=r'')

In [32]:
# pip install fuzzywuzzy
from fuzzywuzzy import fuzz

def match_name(name, list_names, min_score=0):
    # -1 score incase we don't get any matches
    max_score = -1
    # Returning empty name for no match as well
    max_name = ""
    # Iternating over all names in the other
    for name2 in list_names:
        #Finding fuzzy match score
        score = fuzz.ratio(name, name2)
        # Checking if we are above our threshold and have a better score
        if (score > min_score) & (score > max_score):
            max_name = name2
            max_score = score
    return (max_name, max_score)

In [35]:
# List for dicts for easy dataframe creation
dict_list = []

# iterating over our games without user info found above

for game_name in missing_app_name.Name:

    # Use our method to find best match, we can set a threshold here
    match = match_name(game_name, steam_table.app_name, 75)
    
    # New dict for storing data
    dict_ = {}
    dict_.update({"game_name" : game_name})
    dict_.update({"match_name" : match[0]})
    dict_.update({"score" : match[1]})
    dict_list.append(dict_)
    
merge_table = pd.DataFrame(dict_list)
# Display results
merge_table

KeyboardInterrupt: 

In [28]:
#steam_combined['overall_reviews'] = str(steam_combined['overall_reviews'])[2:]

In [29]:
#steam_combined['overall_reviews'].head()

0                                                  ...
1                                                  ...
2                                                  ...
3                                                  ...
4                                                  ...
Name: overall_reviews, dtype: object

In [30]:
steam_combined.head()

Unnamed: 0,app_name,developer,discount_price,early_access,genres,id,mature_reasons,metascore,overall_reviews,price,...,sentiment,specs,tags,title,url,AppID,Name,Current,24h Peak,All-Time Peak
0,! That Bastard Is Trying To Steal Our Gold !,WTFOMGames,,False,"[Action, Adventure, Casual, Indie]",449940,,,...,3.99,...,Mixed,"[Single-player, Steam Trading Cards, Partial C...","[Action, Indie, Casual, Adventure, Memes, Psyc...",! That Bastard Is Trying To Steal Our Gold !,http://store.steampowered.com/app/449940/_That...,449940.0,! That Bastard Is Trying To Steal Our Gold !,1,27,1951
1,- Arcane RERaise -,Arcane Raise,,False,"[Adventure, Casual, Indie, RPG, Strategy]",603770,,,...,2.99,...,Mostly Negative,"[Single-player, Steam Achievements, Steam Trad...","[Adventure, RPG, Indie, Casual, Strategy, RPGM...",- Arcane RERaise -,http://store.steampowered.com/app/603770/_Arca...,603770.0,- Arcane RERaise -,21,262,3017
2,- Arcane preRaise -,Arcane Raise,,False,"[Adventure, Casual, Indie, RPG, Strategy]",603780,,,...,2.99,...,Mostly Negative,"[Single-player, Steam Achievements, Steam Trad...","[RPG, Casual, Adventure, Indie, Strategy, Fant...",- Arcane preRaise -,http://store.steampowered.com/app/603780/_Arca...,603780.0,- Arcane preRaise -,34,76,3016
3,#WarGames,Eko,,False,"[Adventure, Indie]",779420,,,...,2.99,...,Mixed,"[Single-player, Steam Achievements, Full contr...","[Adventure, Indie, FMV]",#WarGames,http://store.steampowered.com/app/779420/WarGa...,779420.0,#WarGames,2,5,36
4,$1 Ride,BeHappy Studios,,False,"[Action, Casual, Indie]",508290,,,...,0.98,...,Mixed,"[Single-player, Steam Trading Cards]","[Casual, Indie, Action]",$1 Ride,http://store.steampowered.com/app/508290/1_Ride/,508290.0,$1 Ride,1,4,2002


Some resources used:

http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/

https://medium.com/@rtjeannier/combining-data-sets-with-fuzzy-matching-17efcb510ab2

In [99]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

In [53]:
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

driver = webdriver.Chrome(chromedriver)
#driver.get('https://store.steampowered.com/search/?sort_by=Name_ASC&category1=998')

# Alternatives to Chrome: Firefox, phantomjs

In [None]:
select_dropdown = driver.find_element_by_id('table-apps_length')
for option in select_dropdown.find_elements_by_tag_name('option'):
    if option.text == 'All':
        option.click() # select() in earlier versions of webdriver
        break

In [336]:
driver = webdriver.Chrome(chromedriver)
driver.get('https://store.steampowered.com/search/?sort_by=Name_ASC&category1=998')

In [362]:
import time
import csv
import pickle

driver = webdriver.Chrome(chromedriver)

page_number = 1
listoflinks = []

for page_number in range(1,938):
    driver.get(('https://store.steampowered.com/search/?sort_by=Name_ASC&category1=998&page='+str(page_number)))
    #print(driver.current_url)
    time.sleep(1)

    for link in driver.find_elements_by_xpath('//a[contains(@class,"search_result_row")]'):
        listoflinks.append(link.get_attribute('href'))
    #for link in listoflinks:
    #    print(link)

    with open('listoflinks.pkl', 'wb') as f:
        pickle.dump(listoflinks, f)

with open("listoflinks.txt", "w") as output:
    output.write(str(listoflinks))

In [363]:
#len(driver.find_elements_by_xpath('//a[contains(@class,"search_result_row")]'))
len(listoflinks)

23418

In [356]:
#with open('listoflinks.csv', 'w', newline='') as output:
#    writer = csv.writer(output, lineterminator='\n')
#    for val in listoflinks:
#        writer.writerow([val])

#with open('listoflinks.csv') as output:
#    writer = csv.writer(output)
#    writer.write(listoflinks)
    
#import simplejson
#f = open('listoflinks.txt', 'w')
#simplejson.dump(listoflinks, f)
#f.close()

with open('listoflinks.csv', 'r') as f:
    reader = csv.reader(f)
    listoflinks = list(reader)
len(listoflinks)
for link in listoflinks:
    print(link)

['https://store.steampowered.com/app/449940/_That_Bastard_Is_Trying_To_Steal_Our_Gold/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/493790/Archery/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/841860/Have_A_Sticker/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/303720/KILLALLZOMBIES/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/614910/monstercakes/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/782100/RunningSnake/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/834180/RunningSnake_2/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/392190/SelfieTennis/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/392150/SkiJump/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/779420/WarGames/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/508290/1_Ride/?snr=1_7_7_230_150_1']
['https://store.steampowered.com/app/557330/SystemHack/?snr=1_7_7_230_150_1']
['https://store.steampowere

In [69]:
driver.get('http://store.steampowered.com/app/303720/agecheck')

In [55]:
# testing that getting past /agecheck works:

viewpage_button = driver.find_element_by_xpath('//a[contains(@class,"btn_grey_white_innerfade")]')
viewpage_button.click()

In [71]:
driver.get('http://store.steampowered.com/agecheck/app/200510/')

In [None]:
# testing that getting past /agecheck/app works:

select_dropdown = driver.find_element_by_xpath('//select[@name="ageDay"]')
for option in select_dropdown.find_elements_by_tag_name('option'):
    if option.text == '1':
        option.click() # select() in earlier versions of webdriver
        break
select_dropdown = driver.find_element_by_xpath('//select[@name="ageMonth"]')
for option in select_dropdown.find_elements_by_tag_name('option'):
    if option.text == 'January':
        option.click() # select() in earlier versions of webdriver
        break
select_dropdown = driver.find_element_by_xpath('//select[@name="ageYear"]')
for option in select_dropdown.find_elements_by_tag_name('option'):
    if option.text == '1950':
        option.click() # select() in earlier versions of webdriver
        break
enter_button = driver.find_element_by_xpath('//a[contains(@class,"btnv6")]')
enter_button.click()

In [72]:
if 'agecheck/app' in driver.current_url:
    select_dropdown = driver.find_element_by_xpath('//select[@name="ageDay"]')
    for option in select_dropdown.find_elements_by_tag_name('option'):
        if option.text == '1':
            option.click() # select() in earlier versions of webdriver
            break
    select_dropdown = driver.find_element_by_xpath('//select[@name="ageMonth"]')
    for option in select_dropdown.find_elements_by_tag_name('option'):
        if option.text == 'January':
            option.click() # select() in earlier versions of webdriver
            break
    select_dropdown = driver.find_element_by_xpath('//select[@name="ageYear"]')
    for option in select_dropdown.find_elements_by_tag_name('option'):
        if option.text == '1950':
            option.click() # select() in earlier versions of webdriver
            break
    enter_button = driver.find_element_by_xpath('//a[contains(@class,"btnv6")]')
    enter_button.click()
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "apphub_AppName"))
        )
    finally:
        print(driver.current_url)
        soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

elif 'agecheck' in driver.current_url:
    viewpage_button = driver.find_element_by_xpath('//a[contains(@class,"btn_grey_white_innerfade")]')
    viewpage_button.click()
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "apphub_AppName"))
        )
    finally:
        print(driver.current_url)
        soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

else:
    print(driver.current_url)
    soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

http://store.steampowered.com/app/200510/XCOM_Enemy_Unknown/


In [392]:
del str

In [466]:
# putting everything together: going through each page to make a dictionary
    # and then appending that dictionary to a json file

import json
first_run = True
import newlinejson as nlj
import jsonlines
#from bson import json_util

game_dict_list = []

# new json file started with open list bracket
with open('steam_games_dicts_selenium.json', 'w') as f:
    f.write('[\n')
f.close()
    
for link in listoflinks:
    
    # set default values so they aren't carried over in loop iterations:
    url = ''
    reviews_url = ''
    metascore = 0
    app_name = ''
    app_id = ''
    early_access = False
    esrb = ''
    steam_Achievement_n = 0
    price = 0.00
    discount_price = 0.00
    recent_rev_pos_perc = 0
    recent_reviews_n = 0
    overall_rev_pos_perc = 0
    overall_reviews_n = 0
    mature_reasons = []
    specs = []
    tags = []
    
    game_dict={}
    details = ''
    details_split = []

    driver.get(link)
    if 'agecheck/app' in driver.current_url:
        select_dropdown = driver.find_element_by_xpath('//select[@name="ageDay"]')
        for option in select_dropdown.find_elements_by_tag_name('option'):
            if option.text == '1':
                option.click() # select() in earlier versions of webdriver
                break
        select_dropdown = driver.find_element_by_xpath('//select[@name="ageMonth"]')
        for option in select_dropdown.find_elements_by_tag_name('option'):
            if option.text == 'January':
                option.click() # select() in earlier versions of webdriver
                break
        select_dropdown = driver.find_element_by_xpath('//select[@name="ageYear"]')
        for option in select_dropdown.find_elements_by_tag_name('option'):
            if option.text == '1950':
                option.click() # select() in earlier versions of webdriver
                break
        enter_button = driver.find_element_by_xpath('//a[contains(@class,"btnv6")]')
        enter_button.click()
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "apphub_AppName")))
        finally:
            print(current_index, driver.current_url)
            soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

    elif 'agecheck' in driver.current_url:
        viewpage_button = driver.find_element_by_xpath('//a[contains(@class,"btn_grey_white_innerfade")]')
        viewpage_button.click()
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "apphub_AppName")))
        finally:
            print(current_index, driver.current_url)
            soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

    else:
        print(current_index, driver.current_url)
        soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

    try:
        details = soup.select('.details_block')
        #details = details[0]

        if '<br/>' in str(details):
            details_split = str(details).split('<br/>')
        elif '<br>' in str(details):
            details_split = str(details).split('<br>')
        
        for line in details_split:

            line = re.sub('<[^<]+?>', '', line)  # Remove tags.
            line = re.sub("[\r\t\n]", '', line).strip()

            for prop in ['Title:','Genre:','Developer:','Publisher:','Release Date:']:
                if prop in line:
                    if prop in ['Genre:','Developer:','Publisher:']:
                        names = (line.split(prop)[1]).strip().split(',')
                        names = [name.strip() for name in names]
                        game_dict[prop[:-1]] = names
                    else:
                        #item = line.replace(prop, '').strip()
                        name = (line.split(prop)[1]).strip()
                        game_dict[prop[:-1]] = name
                    
    except:
        pass
    
    #game_dict
   
    # Define string to int to use a few times in the future (mostly to deal with atypical commas)

    def str_to_int(string):
        return int(re.sub('[^0-9]','', string))

    # Get the reviews details - recent and overall - number of reviews and percent positive:
    if (len(soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")}))>1):
        recent_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip()[2:4]
        recent_rev_pos_perc = str_to_int(recent_rev_pos_perc)
        overall_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[1].get_text().strip()[2:4]
        overall_rev_pos_perc = str_to_int(overall_rev_pos_perc)
        
        try:
            recent_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[0].get_text().strip().strip('(),')
            recent_reviews_n = str_to_int(recent_reviews_n)
        except:
            recent_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip().split(' ')[4]
            recent_reviews_n = str_to_int(recent_reviews_n)

        try:
            overall_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[1].get_text().strip().strip('(),')
            overall_reviews_n = str_to_int(overall_reviews_n)
        except:
            overall_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[1].get_text().strip().split(' ')[4]
            overall_reviews_n = str_to_int(overall_reviews_n)

    elif (len(soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")}))==1):
        if (soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip() != "- Need more user reviews to generate a score"):
            overall_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip()[2:4]
            overall_rev_pos_perc = str_to_int(overall_rev_pos_perc)

            try:
                overall_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[0].get_text().strip().strip('(),')
                overall_reviews_n = str_to_int(overall_reviews_n)
            except:
                overall_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip().split(' ')[4]
                overall_reviews_n = str_to_int(overall_reviews_n)
        
    # get metacritic score
    try:
        metascore = driver.find_element_by_xpath('//div[@id="game_area_metascore"]/div[contains(@class, "score")]').text
        metascore = str_to_int(metascore)
    except:
        pass
    
    # get url
    url = str(driver.current_url)

    # get reasons for rating as a list
    try:
        mature_reasons = soup.find("p", {"id": "descriptorText"}).get_text().strip()
        mature_reasons = re.sub('\n',',', mature_reasons)
        mature_reasons = re.sub(',,',', ', mature_reasons)
        mature_reasons = mature_reasons.split(',')
        mature_reasons = [reason.strip() for reason in mature_reasons]
    except:
        pass

    # get user tags for each game
    tags = []
    tags_list = soup.find_all("a", {"class": "app_tag"})
    for tag in tags_list:
        tags.append(tag.get_text().strip())

    # get app name (should be same as title most (all?) of the time) for each game
    try:
        app_name = driver.find_element_by_css_selector('div.apphub_AppName').text
    except:
        pass

    # get specs for each game as a list
    specs = []
    specs_list = driver.find_elements_by_css_selector('div.game_area_details_specs')
    for spec in specs_list:
        specs.append(spec.text.strip())

    #div.game_purchase_price.price

    #prices = []
    #price_list = driver.find_elements_by_css_selector('div.game_purchase_price.price')
    #for price in price_list:
    #    prices.append(price.text.strip())
    #price = driver.find_elements_by_css_selector('div.game_purchase_price')

    #prices = []
    price = soup.find("div", {"class": "game_purchase_price"})
    if price:
        if ('free to play' in price.text.strip().lower()):
            price = 0.00
        else:
            price = price.text.strip().strip('$')
            price = float(price)
            discount_price = price
    else:
        try:
            price = soup.find("div", {"class": "discount_original_price"}).text.strip().strip('$')
            price = float(price)
            discount_price = soup.find("div", {"class": "discount_final_price"}).text.strip().strip('$')
            discount_price = float(discount_price)
        except:
            pass
                

    early_access = soup.select('div.early_access_header')
    if early_access:
        early_access = True
    else:
        early_access = False

    # add rating (everybody, mature, teen, etc):

    rating = soup.findAll(attrs={'class' : re.compile("block responsive_apppage_details_right")})
    if 'esrb' in str(rating):
        esrb = str(rating).split("esrb")[1][1] 

    # add number of steam achievements:

    if 'Steam Achievements' in str(rating):
        try:
            steam_Achievement_n = int(str(rating).split("Steam Achievements")[0][-3:-1])
        except:
            pass

    # figure out how to get this working again:
    # reviews_url = f"http://steamcommunity.com/app/{id}/reviews/?browsefilter=mostrecent&p=1"
    # response.add_value('reviews_url', reviews_url)

    try:
        app_id = url.split('store.steampowered.com/app/')[1].split('/')[0]
        reviews_url = "http://steamcommunity.com/app/"+app_id+"/reviews/?browsefilter=mostrecent&p=1"
    except:
        pass
    
    game_dict['url']=url
    game_dict['reviews_url']=reviews_url
    game_dict['metascore']=metascore
    game_dict['app_name']=app_name
    game_dict['app_id']=app_id
    game_dict['early_access']=early_access
    game_dict['esrb']=esrb
    game_dict['steam_Achievement_n']=steam_Achievement_n
    game_dict['price']=price
    game_dict['discount_price']=discount_price
    game_dict['recent_rev_pos_perc']=recent_rev_pos_perc
    game_dict['recent_reviews_n']=recent_reviews_n
    game_dict['overall_rev_pos_perc']=overall_rev_pos_perc
    game_dict['overall_reviews_n']=overall_reviews_n
    game_dict['mature_reasons']=mature_reasons
    game_dict['specs']=specs
    game_dict['tags']=tags

    game_dict_list.append(game_dict)
    
    # pickle the dictionary list just in case
    
    with open('game_dict_list.pkl', 'wb') as f:
        pickle.dump(game_dict_list, f)

    # append json file
    
    with open('steam_games_dicts_selenium.json', 'a') as f:
        f.write(json.dumps(game_dict)+',\n')

# remove final comma
with open('steam_games_dicts_selenium.json', 'rb+') as filehandle:
    filehandle.seek(-2, os.SEEK_END)
    filehandle.truncate()
filehandle.close()

# add end list bracket
with open('steam_games_dicts_selenium.json', 'a') as f:
    f.write('\n]')
f.close()
    

https://store.steampowered.com/app/449940/_That_Bastard_Is_Trying_To_Steal_Our_Gold/
https://store.steampowered.com/app/493790/Archery/
https://store.steampowered.com/app/841860/Have_A_Sticker/
https://store.steampowered.com/app/303720/KILLALLZOMBIES/
https://store.steampowered.com/app/614910/monstercakes/
https://store.steampowered.com/app/782100/RunningSnake/
https://store.steampowered.com/app/834180/RunningSnake_2/
https://store.steampowered.com/app/392190/SelfieTennis/
https://store.steampowered.com/app/392150/SkiJump/
https://store.steampowered.com/app/779420/WarGames/
https://store.steampowered.com/app/508290/1_Ride/
https://store.steampowered.com/app/557330/SystemHack/
https://store.steampowered.com/app/754610/Mars_Taken/
https://store.steampowered.com/app/514900/observer/
https://store.steampowered.com/app/439260/BUTTS_The_VR_Experience/
https://store.steampowered.com/app/388390/Glow_Ball__The_billiard_puzzle_game/
https://store.steampowered.com/app/720840/TWO_DRAW/
https://sto

ValueError: could not convert string to float: 'Free Demo'

In [489]:
len(game_dict_list)
#game_dict_list[-1]

{'Developer': ['Quite Different Mechanics'],
 'Genre': ['Action', 'Strategy', 'Early Access'],
 'Publisher': ['Quite Different Mechanics'],
 'Release Date': 'Jul 28, 2016',
 'Title': 'Uncrewed',
 'app_id': '474890',
 'app_name': 'Uncrewed',
 'discount_price': 11.99,
 'early_access': True,
 'esrb': '',
 'mature_reasons': [],
 'metascore': 0,
 'overall_rev_pos_perc': 78,
 'overall_reviews_n': 32,
 'price': 11.99,
 'recent_rev_pos_perc': 0,
 'recent_reviews_n': 0,
 'reviews_url': 'http://steamcommunity.com/app/474890/reviews/?browsefilter=mostrecent&p=1',
 'specs': ['Single-player',
  'Multi-player',
  'Online Multi-Player',
  'Steam Achievements',
  'Steam Workshop'],
 'steam_Achievement_n': 21,
 'tags': ['Early Access',
  'Action',
  'Strategy',
  'RTS',
  'Base Building',
  'Sandbox',
  'Building',
  'Multiplayer',
  'Singleplayer',
  'Physics',
  'Simulation',
  'Real-Time'],
 'url': 'https://store.steampowered.com/app/474890/Uncrewed/'}

In [492]:
# restarting from exception:

driver = webdriver.Chrome(chromedriver)
driver.set_page_load_timeout(15)

# skipping due to timeout exceptions: 
# https://store.steampowered.com/app/520810/Metal_Noise/
# https://store.steampowered.com/app/282560/RollerCoaster_Tycoon_World/
# https://store.steampowered.com/app/420360/Spectrum/
# https://store.steampowered.com/app/464880/Stars_in_Shadow/
# https://store.steampowered.com/app/676340/Undarkened/
# https://store.steampowered.com/app/676340/Undarkened/ # in the list twice?
# https://store.steampowered.com/app/263680/Unearthed_Trail_of_Ibn_Battuta__Episode_1__Gold_Edition/
# https://store.steampowered.com/app/436820/Waltz_of_the_Wizard/

import json

for link in listoflinks[len(game_dict_list)+8:]: # skipping 8 as mentioned above
    
    # set default values so they aren't carried over in loop iterations:
    url = ''
    reviews_url = ''
    metascore = 0
    app_name = ''
    app_id = ''
    early_access = False
    esrb = ''
    steam_Achievement_n = 0
    price = 0.00
    discount_price = 0.00
    recent_rev_pos_perc = 0
    recent_reviews_n = 0
    overall_rev_pos_perc = 0
    overall_reviews_n = 0
    mature_reasons = []
    specs = []
    tags = []
    
    game_dict={}
    details = ''
    details_split = []
    
    current_index = len(game_dict_list)-1

    # driver.get(link)

    try:
        driver.get(link)
    except:
        print("couldn't load page:", current_index, driver.current_url)
        continue
    
    if 'agecheck/app' in driver.current_url:
        select_dropdown = driver.find_element_by_xpath('//select[@name="ageDay"]')
        for option in select_dropdown.find_elements_by_tag_name('option'):
            if option.text == '1':
                option.click() # select() in earlier versions of webdriver
                break
        select_dropdown = driver.find_element_by_xpath('//select[@name="ageMonth"]')
        for option in select_dropdown.find_elements_by_tag_name('option'):
            if option.text == 'January':
                option.click() # select() in earlier versions of webdriver
                break
        select_dropdown = driver.find_element_by_xpath('//select[@name="ageYear"]')
        for option in select_dropdown.find_elements_by_tag_name('option'):
            if option.text == '1950':
                option.click() # select() in earlier versions of webdriver
                break
        enter_button = driver.find_element_by_xpath('//a[contains(@class,"btnv6")]')
        enter_button.click()
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "apphub_AppName")))
        finally:
            print(current_index, driver.current_url)
            soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

    elif 'agecheck' in driver.current_url:
        viewpage_button = driver.find_element_by_xpath('//a[contains(@class,"btn_grey_white_innerfade")]')
        viewpage_button.click()
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "apphub_AppName")))
        finally:
            print(current_index, driver.current_url)
            soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

    else:
        print(current_index, driver.current_url)
        soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

    try:
        details = soup.select('.details_block')
        #details = details[0]

        if '<br/>' in str(details):
            details_split = str(details).split('<br/>')
        elif '<br>' in str(details):
            details_split = str(details).split('<br>')
        
        for line in details_split:

            line = re.sub('<[^<]+?>', '', line)  # Remove tags.
            line = re.sub("[\r\t\n]", '', line).strip()

            for prop in ['Title:','Genre:','Developer:','Publisher:','Release Date:']:
                if prop in line:
                    if prop in ['Genre:','Developer:','Publisher:']:
                        names = (line.split(prop)[1]).strip().split(',')
                        names = [name.strip() for name in names]
                        game_dict[prop[:-1]] = names
                    else:
                        #item = line.replace(prop, '').strip()
                        name = (line.split(prop)[1]).strip()
                        game_dict[prop[:-1]] = name
                    
    except:
        pass
    
    #game_dict
   
    # Define string to int to use a few times in the future (mostly to deal with atypical commas)

    def str_to_int(string):
        return int(re.sub('[^0-9]','', string))

    # Get the reviews details - recent and overall - number of reviews and percent positive:
    if (len(soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")}))>1):
        recent_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip()[2:4]
        recent_rev_pos_perc = str_to_int(recent_rev_pos_perc)
        overall_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[1].get_text().strip()[2:4]
        overall_rev_pos_perc = str_to_int(overall_rev_pos_perc)
        
        try:
            recent_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[0].get_text().strip().strip('(),')
            recent_reviews_n = str_to_int(recent_reviews_n)
        except:
            recent_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip().split(' ')[4]
            recent_reviews_n = str_to_int(recent_reviews_n)

        try:
            overall_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[1].get_text().strip().strip('(),')
            overall_reviews_n = str_to_int(overall_reviews_n)
        except:
            overall_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[1].get_text().strip().split(' ')[4]
            overall_reviews_n = str_to_int(overall_reviews_n)

    elif (len(soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")}))==1):
        if (soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip() != "- Need more user reviews to generate a score"):
            overall_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip()[2:4]
            overall_rev_pos_perc = str_to_int(overall_rev_pos_perc)

            try:
                overall_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[0].get_text().strip().strip('(),')
                overall_reviews_n = str_to_int(overall_reviews_n)
            except:
                overall_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip().split(' ')[4]
                overall_reviews_n = str_to_int(overall_reviews_n)
        
    # get metacritic score
    try:
        metascore = driver.find_element_by_xpath('//div[@id="game_area_metascore"]/div[contains(@class, "score")]').text
        metascore = str_to_int(metascore)
    except:
        pass
    
    # get url
    url = str(driver.current_url)

    # get reasons for rating as a list
    try:
        mature_reasons = soup.find("p", {"id": "descriptorText"}).get_text().strip()
        mature_reasons = re.sub('\n',',', mature_reasons)
        mature_reasons = re.sub(',,',', ', mature_reasons)
        mature_reasons = mature_reasons.split(',')
        mature_reasons = [reason.strip() for reason in mature_reasons]
    except:
        pass

    # get user tags for each game
    tags = []
    tags_list = soup.find_all("a", {"class": "app_tag"})
    for tag in tags_list:
        tags.append(tag.get_text().strip())

    # get app name (should be same as title most (all?) of the time) for each game
    try:
        app_name = driver.find_element_by_css_selector('div.apphub_AppName').text
    except:
        pass

    # get specs for each game as a list
    specs = []
    specs_list = driver.find_elements_by_css_selector('div.game_area_details_specs')
    for spec in specs_list:
        specs.append(spec.text.strip())

    # get price(s) for each game as a list
    price = soup.find("div", {"class": "game_purchase_price"})
    if price:
        if ('free' in price.text.strip().lower() or not (any(char.isdigit() for char in price.text.strip()))):
            price = 0.00
        else:
            price = price.text.strip().strip('$')
            price = float(price)
            discount_price = price
    else:
        try:
            price = soup.find("div", {"class": "discount_original_price"}).text.strip().strip('$')
            price = float(price)
            discount_price = soup.find("div", {"class": "discount_final_price"}).text.strip().strip('$')
            discount_price = float(discount_price)
        except:
            pass
                
    # check if it's early access:
    
    early_access = soup.select('div.early_access_header')
    if early_access:
        early_access = True
    else:
        early_access = False

    # add rating (everybody, mature, teen, etc):

    rating = soup.findAll(attrs={'class' : re.compile("block responsive_apppage_details_right")})
    if 'esrb' in str(rating):
        esrb = str(rating).split("esrb")[1][1] 

    # add number of steam achievements:

    if 'Steam Achievements' in str(rating):
        try:
            steam_Achievement_n = int(str(rating).split("Steam Achievements")[0][-3:-1])
        except:
            pass

    # figure out how to get this working again:
    # reviews_url = f"http://steamcommunity.com/app/{id}/reviews/?browsefilter=mostrecent&p=1"
    # response.add_value('reviews_url', reviews_url)

    try:
        app_id = url.split('store.steampowered.com/app/')[1].split('/')[0]
        reviews_url = "http://steamcommunity.com/app/"+app_id+"/reviews/?browsefilter=mostrecent&p=1"
    except:
        pass
    
    game_dict['url']=url
    game_dict['reviews_url']=reviews_url
    game_dict['metascore']=metascore
    game_dict['app_name']=app_name
    game_dict['app_id']=app_id
    game_dict['early_access']=early_access
    game_dict['esrb']=esrb
    game_dict['steam_Achievement_n']=steam_Achievement_n
    game_dict['price']=price
    game_dict['discount_price']=discount_price
    game_dict['recent_rev_pos_perc']=recent_rev_pos_perc
    game_dict['recent_reviews_n']=recent_reviews_n
    game_dict['overall_rev_pos_perc']=overall_rev_pos_perc
    game_dict['overall_reviews_n']=overall_reviews_n
    game_dict['mature_reasons']=mature_reasons
    game_dict['specs']=specs
    game_dict['tags']=tags

    game_dict_list.append(game_dict)
    
    # pickle the dictionary list just in case
    
    with open('game_dict_list.pkl', 'wb') as f:
        pickle.dump(game_dict_list, f)

    # append json file
    
    with open('steam_games_dicts_selenium.json', 'a') as f:
        f.write(json.dumps(game_dict)+',\n')

# remove final comma and new line
with open('steam_games_dicts_selenium.json', 'rb+') as filehandle:
    filehandle.seek(-2, os.SEEK_END)
    filehandle.truncate()
filehandle.close()

# add final new line and end list bracket
with open('steam_games_dicts_selenium.json', 'a') as f:
    f.write('\n]')
f.close()
    

22179 https://store.steampowered.com/app/422110/Wand_Wars/
22180 https://store.steampowered.com/app/765730/Wand_Wars_VR/
22181 https://store.steampowered.com/app/386490/Wanda__A_Beautiful_Apocalypse/
22182 https://store.steampowered.com/app/463020/Wander_No_More/
22183 https://store.steampowered.com/app/477910/Wanderer_of_Teandria/
22184 https://store.steampowered.com/app/568250/Wanderer_The_Rebirth/
22185 https://store.steampowered.com/app/37270/Wandering_Willows/
22186 https://store.steampowered.com/app/434860/Wanderjahr/
22187 https://store.steampowered.com/app/545820/Wanderland/
22188 https://store.steampowered.com/app/729670/Wanderlust/
22189 https://store.steampowered.com/app/240620/Wanderlust_Adventures/
22190 https://store.steampowered.com/app/211580/Wanderlust_Rebirth/
22191 https://store.steampowered.com/app/530320/Wandersong/
22192 https://store.steampowered.com/app/741400/Wands/
22193 https://store.steampowered.com/app/782070/Wangan_Warrior_X/
22194 https://store.steampower

In [493]:
len(game_dict_list)

23410

In [470]:
# putting everything together: going through each page to make a dictionary
    # and then appending that dictionary to a json file

def scrape_games(self, listoflinks):
    import json
    first_run = True
    import newlinejson as nlj
    import jsonlines
    #from bson import json_util

    game_dict_list = []

    # new json file started with open list bracket
    with open('steam_games_dicts_selenium.json', 'w') as f:
        f.write('[\n')
    f.close()

    for link in listoflinks:

        # set default values so they aren't carried over in loop iterations:
        url = ''
        reviews_url = ''
        metascore = 0
        app_name = ''
        app_id = ''
        early_access = False
        esrb = ''
        steam_Achievement_n = 0
        price = 0.00
        discount_price = 0.00
        recent_rev_pos_perc = 0
        recent_reviews_n = 0
        overall_rev_pos_perc = 0
        overall_reviews_n = 0
        mature_reasons = []
        specs = []
        tags = []

        game_dict={}
        details = ''
        details_split = []

        driver.get(link)
        if 'agecheck/app' in driver.current_url:
            select_dropdown = driver.find_element_by_xpath('//select[@name="ageDay"]')
            for option in select_dropdown.find_elements_by_tag_name('option'):
                if option.text == '1':
                    option.click() # select() in earlier versions of webdriver
                    break
            select_dropdown = driver.find_element_by_xpath('//select[@name="ageMonth"]')
            for option in select_dropdown.find_elements_by_tag_name('option'):
                if option.text == 'January':
                    option.click() # select() in earlier versions of webdriver
                    break
            select_dropdown = driver.find_element_by_xpath('//select[@name="ageYear"]')
            for option in select_dropdown.find_elements_by_tag_name('option'):
                if option.text == '1950':
                    option.click() # select() in earlier versions of webdriver
                    break
            enter_button = driver.find_element_by_xpath('//a[contains(@class,"btnv6")]')
            enter_button.click()
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "apphub_AppName")))
            finally:
                print(driver.current_url)
                soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

        elif 'agecheck' in driver.current_url:
            viewpage_button = driver.find_element_by_xpath('//a[contains(@class,"btn_grey_white_innerfade")]')
            viewpage_button.click()
            try:
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "apphub_AppName")))
            finally:
                print(driver.current_url)
                soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

        else:
            print(driver.current_url)
            soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

        try:
            details = soup.select('.details_block')
            #details = details[0]

            if '<br/>' in str(details):
                details_split = str(details).split('<br/>')
            elif '<br>' in str(details):
                details_split = str(details).split('<br>')

            for line in details_split:

                line = re.sub('<[^<]+?>', '', line)  # Remove tags.
                line = re.sub("[\r\t\n]", '', line).strip()

                for prop in ['Title:','Genre:','Developer:','Publisher:','Release Date:']:
                    if prop in line:
                        if prop in ['Genre:','Developer:','Publisher:']:
                            names = (line.split(prop)[1]).strip().split(',')
                            names = [name.strip() for name in names]
                            game_dict[prop[:-1]] = names
                        else:
                            #item = line.replace(prop, '').strip()
                            name = (line.split(prop)[1]).strip()
                            game_dict[prop[:-1]] = name

        except:
            pass

        #game_dict

        # Define string to int to use a few times in the future (mostly to deal with atypical commas)

        def str_to_int(string):
            return int(re.sub('[^0-9]','', string))

        # Get the reviews details - recent and overall - number of reviews and percent positive:
        if (len(soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")}))>1):
            recent_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip()[2:4]
            recent_rev_pos_perc = str_to_int(recent_rev_pos_perc)
            overall_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[1].get_text().strip()[2:4]
            overall_rev_pos_perc = str_to_int(overall_rev_pos_perc)

            try:
                recent_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[0].get_text().strip().strip('(),')
                recent_reviews_n = str_to_int(recent_reviews_n)
            except:
                recent_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip().split(' ')[4]
                recent_reviews_n = str_to_int(recent_reviews_n)

            try:
                overall_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[1].get_text().strip().strip('(),')
                overall_reviews_n = str_to_int(overall_reviews_n)
            except:
                overall_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[1].get_text().strip().split(' ')[4]
                overall_reviews_n = str_to_int(overall_reviews_n)

        elif (len(soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")}))==1):
            if (soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip() != "- Need more user reviews to generate a score"):
                overall_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip()[2:4]
                overall_rev_pos_perc = str_to_int(overall_rev_pos_perc)

                try:
                    overall_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[0].get_text().strip().strip('(),')
                    overall_reviews_n = str_to_int(overall_reviews_n)
                except:
                    overall_reviews_n = recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip().split(' ')[4]
                    overall_reviews_n = str_to_int(overall_reviews_n)

        # get metacritic score
        try:
            metascore = driver.find_element_by_xpath('//div[@id="game_area_metascore"]/div[contains(@class, "score")]').text
            metascore = str_to_int(metascore)
        except:
            pass

        # get url
        url = str(driver.current_url)

        # get reasons for rating as a list
        try:
            mature_reasons = soup.find("p", {"id": "descriptorText"}).get_text().strip()
            mature_reasons = re.sub('\n',',', mature_reasons)
            mature_reasons = re.sub(',,',', ', mature_reasons)
            mature_reasons = mature_reasons.split(',')
            mature_reasons = [reason.strip() for reason in mature_reasons]
        except:
            pass

        # get user tags for each game
        tags = []
        tags_list = soup.find_all("a", {"class": "app_tag"})
        for tag in tags_list:
            tags.append(tag.get_text().strip())

        # get app name (should be same as title most (all?) of the time) for each game
        try:
            app_name = driver.find_element_by_css_selector('div.apphub_AppName').text
        except:
            pass

        # get specs for each game as a list
        specs = []
        specs_list = driver.find_elements_by_css_selector('div.game_area_details_specs')
        for spec in specs_list:
            specs.append(spec.text.strip())

        #div.game_purchase_price.price

        #prices = []
        #price_list = driver.find_elements_by_css_selector('div.game_purchase_price.price')
        #for price in price_list:
        #    prices.append(price.text.strip())
        #price = driver.find_elements_by_css_selector('div.game_purchase_price')

        #prices = []
        price = soup.find("div", {"class": "game_purchase_price"})
        if price:
            if (price.text.strip().lower() == 'free to play' or price.text.strip().lower() == 'free'):
                price = 0.00
            else:
                price = price.text.strip().strip('$')
                price = float(price)
                discount_price = price
        else:
            try:
                price = soup.find("div", {"class": "discount_original_price"}).text.strip().strip('$')
                price = float(price)
                discount_price = soup.find("div", {"class": "discount_final_price"}).text.strip().strip('$')
                discount_price = float(discount_price)
            except:
                pass


        early_access = soup.select('div.early_access_header')
        if early_access:
            early_access = True
        else:
            early_access = False

        # add rating (everybody, mature, teen, etc):

        rating = soup.findAll(attrs={'class' : re.compile("block responsive_apppage_details_right")})
        if 'esrb' in str(rating):
            esrb = str(rating).split("esrb")[1][1] 

        # add number of steam achievements:

        if 'Steam Achievements' in str(rating):
            steam_Achievement_n = int(str(rating).split("Steam Achievements")[0][-3:-1])

        # figure out how to get this working again:
        # reviews_url = f"http://steamcommunity.com/app/{id}/reviews/?browsefilter=mostrecent&p=1"
        # response.add_value('reviews_url', reviews_url)

        try:
            app_id = url.split('store.steampowered.com/app/')[1].split('/')[0]
            reviews_url = "http://steamcommunity.com/app/"+app_id+"/reviews/?browsefilter=mostrecent&p=1"
        except:
            pass

        game_dict['url']=url
        game_dict['reviews_url']=reviews_url
        game_dict['metascore']=metascore
        game_dict['app_name']=app_name
        game_dict['app_id']=app_id
        game_dict['early_access']=early_access
        game_dict['esrb']=esrb
        game_dict['steam_Achievement_n']=steam_Achievement_n
        game_dict['price']=price
        game_dict['discount_price']=discount_price
        game_dict['recent_rev_pos_perc']=recent_rev_pos_perc
        game_dict['recent_reviews_n']=recent_reviews_n
        game_dict['overall_rev_pos_perc']=overall_rev_pos_perc
        game_dict['overall_reviews_n']=overall_reviews_n
        game_dict['mature_reasons']=mature_reasons
        game_dict['specs']=specs
        game_dict['tags']=tags

        game_dict_list.append(game_dict)

        # pickle the dictionary list just in case

        with open('game_dict_list.pkl', 'wb') as f:
            pickle.dump(game_dict_list, f)

        # append json file

        with open('steam_games_dicts_selenium.json', 'a') as f:
            f.write(json.dumps(game_dict)+',\n')

    # remove final comma
    with open('steam_games_dicts_selenium.json', 'rb+') as filehandle:
        filehandle.seek(-2, os.SEEK_END)
        filehandle.truncate()
    filehandle.close()

    # add end list bracket
    with open('steam_games_dicts_selenium.json', 'a') as f:
        f.write('\n]')
    f.close()
    

In [449]:
type(json.dumps(game_dict))

str

In [457]:
recent_reviews_n = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip().split(' ')[4]
print(recent_reviews_n[4])

11


In [464]:
driver.get('https://store.steampowered.com/sub/32847/')
soup=BeautifulSoup(driver.page_source, 'lxml')
details = ''
details = soup.select('.details_block')
details = details
str(details).split('<br/>')

game_dict = {}

if '<br/>' in str(details):
    details_split = str(details).split('<br/>')
elif '<br>' in str(details):
    details_split = str(details).split('<br>')

for line in details_split:

    line = re.sub('<[^<]+?>', '', line)  # Remove tags.
    line = re.sub("[\r\t\n]", '', line).strip()

    for prop in ['Title:','Genre:','Developer:','Publisher:','Release Date:']:
        if prop in line:
            if prop in ['Genre:','Developer:','Publisher:']:
                names = (line.split(prop)[1]).strip().split(',')
                names = [name.strip() for name in names]
                game_dict[prop[:-1]] = names
            else:
                #item = line.replace(prop, '').strip()
                name = (line.split(prop)[1]).strip()
                game_dict[prop[:-1]] = name

try:
    app_id = url.split('store.steampowered.com/app/')[1].split('/')[0]
    reviews_url = "http://steamcommunity.com/app/"+app_id+"/reviews/?browsefilter=mostrecent&p=1"
except:
    pass

game_dict

{'Developer': ['Trilobyte Games'],
 'Genre': ['Adventure'],
 'Publisher': ['Nightdive Studios'],
 'Title': '7th Guest and 11th Hour Bundle'}

In [None]:
    # Make the json file if it's the first run:
#    if first_run==True:
        #with open('steam_games_dicts_selenium.json', 'w') as fp:
        #    json.dump(game_dict, fp)
        #    first_run=False
#    else:
        #with open('steam_games_dicts_selenium.json','a') as f:
        #    str = json.dumps(game_dict).replace('{', ',', 1)
        #    f.seek(-2,2)
        #    f.write(str)
        
#    with jsonlines.Writer('steam_games_dicts_selenium.json') as writer:
#        writer.write_all('game_dict_list')

#    with open('steam_games_dicts_selenium.jl', 'ab') as f:
#        for game_dict in game_dict_list:
#           f.write(json.dumps(game_dict)+'\n')
    

In [None]:
#    with nlj.open('steam_games_dicts_selenium.json') as src, \
#        with nlj.open('out.json', 'w') as dst:
#    for line in src:
#        dst.write(line)

#    with open('out.json') as f:
#        print(f.read()))

In [441]:
driver.get('https://store.steampowered.com/app/779420/WarGames/')
soup=BeautifulSoup(driver.page_source, 'lxml')
details = ''
details = soup.select('.details_block')
details = details
str(details).split('<br/>')

game_dict = {}

if '<br/>' in str(details):
    details_split = str(details).split('<br/>')
elif '<br>' in str(details):
    details_split = str(details).split('<br>')

for line in details_split:

    line = re.sub('<[^<]+?>', '', line)  # Remove tags.
    line = re.sub("[\r\t\n]", '', line).strip()

    for prop in ['Title:','Genre:','Developer:','Publisher:','Release Date:']:
        if prop in line:
            if prop in ['Genre:','Developer:','Publisher:']:
                names = (line.split(prop)[1]).strip().split(',')
                names = [name.strip() for name in names]
                game_dict[prop[:-1]] = names
            else:
                #item = line.replace(prop, '').strip()
                name = (line.split(prop)[1]).strip()
                game_dict[prop[:-1]] = name

game_dict

{'Developer': ['Eko'],
 'Genre': ['Adventure', 'Indie'],
 'Publisher': ['Eko'],
 'Release Date': 'Mar 14, 2018',
 'Title': '#WarGames'}

In [None]:
with open('steam_games_dicts_selenium.jl') as f:
    for line in f:
        game_dict_list = json_util.loads(line)

In [376]:
print(url)
print(url.split('store.steampowered.com/app/')[1].split('/')[0])

https://store.steampowered.com/app/449940/_That_Bastard_Is_Trying_To_Steal_Our_Gold/
449940


<div class="details_block">
<b>Title:</b> XCOM: Enemy Unknown<br/>
<b>Genre:</b> <a href="http://store.steampowered.com/genre/Strategy/?snr=1_5_9__408">Strategy</a><br/>
<b>Developer:</b>
<a href="http://store.steampowered.com/search/?developer=Firaxis%20Games&amp;snr=1_5_9__408">Firaxis Games</a>
					,			<a href="http://store.steampowered.com/search/?developer=Feral%20Interactive%20%28Mac%29&amp;snr=1_5_9__408">Feral Interactive (Mac)</a>
					,			<a href="http://store.steampowered.com/search/?developer=Feral%20Interactive%20%28Linux%29&amp;snr=1_5_9__408">Feral Interactive (Linux)</a>
<br/>
<b>Publisher:</b>
<a href="http://store.steampowered.com/search/?publisher=2K Games">2K Games</a>, <a href="http://store.steampowered.com/search/?publisher=Feral Interactive (Mac)">Feral Interactive (Mac)</a>, <a href="http://store.steampowered.com/search/?publisher=Feral Interactive (Linux)">Feral Interactive (Linux)</a> <br/>
<b>Release Date:</b> Oct 8, 2012<br/>
</div>


In [313]:
soup=BeautifulSoup(driver.page_source, 'lxml') # Selenium hands the page to Beautiful soup

details = soup.select('.details_block')
details = details[0]

game_dict={}

for line in str(details).split('<br/>'):

    line = re.sub('<[^<]+?>', '', line)  # Remove tags.
    line = re.sub("[\r\t\n]", '', line).strip()

    for prop in ['Title:','Genre:','Developer:','Publisher:','Release Date:']:
        if prop in line:
            item = line.replace(prop, '').strip()
            name = (line.split(prop)[1]).strip()
            game_dict[prop[:-1]] = name
#game_dict

{'Developer': 'Firaxis Games,Feral Interactive (Mac),Feral Interactive (Linux)',
 'Genre': 'Strategy',
 'Publisher': '2K Games, Feral Interactive (Mac), Feral Interactive (Linux)',
 'Release Date': 'Oct 8, 2012',
 'Title': 'XCOM: Enemy Unknown'}

In [310]:
# Define string to int to use a few times in the future (mostly to deal with atypical commas)

def str_to_int(string):
    return int(re.sub('[^0-9]','', string))

# Get the reviews details - recent and overall - number of reviews and percent positive:
if soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[1]:
    recent_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip()[2:4]
    recent_rev_pos_perc = str_to_int(recent_rev_pos_perc)
    recent_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[0].get_text().strip().strip('(),')
    recent_reviews_n = str_to_int(recent_reviews_n)
    overall_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[1].get_text().strip()[2:4]
    overall_rev_pos_perc = str_to_int(overall_rev_pos_perc)
    overall_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[1].get_text().strip().strip('(),')
    overall_reviews_n = str_to_int(overall_reviews_n)
else:
    overall_rev_pos_perc = soup.findAll(attrs={'class' : re.compile("nonresponsive_hidden responsive_reviewdesc")})[0].get_text().strip()[2:4]
    overall_rev_pos_perc = str_to_int(overall_rev_pos_perc)
    overall_reviews_n = soup.findAll(attrs={'class' : "responsive_hidden"})[0].get_text().strip().strip('(),')
    overall_reviews_n = str_to_int(overall_reviews_n)

# get metacritic score
metascore = driver.find_element_by_xpath('//div[@id="game_area_metascore"]/div[contains(@class, "score")]').text
metascore = str_to_int(metascore)

# get url
url = str(driver.current_url)

# get reasons for rating as a list
mature_reasons = soup.find("p", {"id": "descriptorText"}).get_text().strip()
mature_reasons = re.sub('\n',',', mature_reasons)
mature_reasons = re.sub(',,',', ', mature_reasons)
mature_reasons = mature_reasons.split(', ')

# get user tags for each game
tags = []
tags_list = soup.find_all("a", {"class": "app_tag"})
for tag in tags_list:
    tags.append(tag.get_text().strip())

# get app name (should be same as title most (all?) of the time) for each game
app_name = driver.find_element_by_css_selector('div.apphub_AppName').text

# get specs for each game as a list
specs = []
specs_list = driver.find_elements_by_css_selector('div.game_area_details_specs')
for spec in specs_list:
    specs.append(spec.text.strip())
    
#div.game_purchase_price.price

#prices = []
#price_list = driver.find_elements_by_css_selector('div.game_purchase_price.price')
#for price in price_list:
#    prices.append(price.text.strip())
#price = driver.find_elements_by_css_selector('div.game_purchase_price')

#prices = []
price = soup.find("div", {"class": "game_purchase_price"}).text.strip().strip('$')
price = float(price)
discount_price = price
if not price:
    price = soup.find("div", {"class": "discount_original_price"}).text.strip().strip('$')
    price = float(price)
    discount_price = soup.find("div", {"class": "discount_final_price"}).text.strip().strip('$')
    discount_price = float(discount_price)

early_access = soup.select('div.early_access_header')
if early_access:
    early_access = True
else:
    early_access = False

# add rating (everybody, mature, teen, etc):

rating = soup.findAll(attrs={'class' : re.compile("block responsive_apppage_details_right")})
if 'esrb' in str(rating):
    esrb = str(rating).split("esrb")[1][1] 

# add number of steam achievements:
   
if 'Steam Achievements' in str(rating):
    steam_Achievement_n = int(str(rating).split("Steam Achievements")[0][-3:-1])

# figure out how to get this working again:
# reviews_url = f"http://steamcommunity.com/app/{id}/reviews/?browsefilter=mostrecent&p=1"
# response.add_value('reviews_url', reviews_url)

app_id = url.split('http://store.steampowered.com/app/')[1].split('/')[0]
reviews_url = "http://steamcommunity.com/app/"+app_id+"/reviews/?browsefilter=mostrecent&p=1"

game_dict['url']=url
game_dict['reviews_url']=reviews_url
game_dict['metascore']=metascore
game_dict['app_name']=app_name
game_dict['app_id']=app_id
game_dict['early_access']=early_access
game_dict['esrb']=esrb
game_dict['steam_Achievement_n']=steam_Achievement_n
game_dict['price']=price
game_dict['discount_price']=discount_price
game_dict['recent_rev_pos_perc']=recent_rev_pos_perc
game_dict['recent_reviews_n']=recent_reviews_n
game_dict['overall_rev_pos_perc']=overall_rev_pos_perc
game_dict['overall_reviews_n']=overall_reviews_n
game_dict['mature_reasons']=mature_reasons
game_dict['specs']=specs
game_dict['tags']=tags

yield(game_dict_list.append(game_dict))



 url: http://store.steampowered.com/app/200510/XCOM_Enemy_Unknown/ 
 reviews_url: http://steamcommunity.com/app/200510/reviews/?browsefilter=mostrecent&p=1 
 metascore: 89 
 app_name: XCOM: Enemy Unknown 
 app_id: 200510 
 early_access: False 
 esrb: m 
 steam_Achievement_n: 85 
 price: 29.99 
 discount_price: 29.99 
 recent_rev_pos_perc: 85 
 recent_reviews_n: 135 
 overall_rev_pos_perc: 94 
 overall_reviews_n: 24493 
 mature_reasons: ['Blood and Gore', 'Strong Language', 'Violence'] 
 specs: ['Single-player', 'Multi-player', 'Cross-Platform Multiplayer', 'Steam Achievements', 'Full controller support', 'Steam Trading Cards', 'Steam Cloud'] 
 tags: ['Turn-Based Strategy', 'Tactical', 'Strategy', 'Sci-fi', 'Turn-Based', 'Aliens', 'Singleplayer', 'Base Building', 'Turn-Based Tactics', 'Replay Value', 'Multiplayer', 'Action', 'Perma Death', 'RPG', 'Difficult', 'Character Customization', 'Atmospheric', 'Futuristic', 'Isometric', 'Great Soundtrack']


In [177]:
with open('game_data.json', 'w') as fp:
    json.dump(game_dict, fp)

In [178]:
#with open('game_data.json', 'r') as fp:
#    game_data = json.load(fp)

In [179]:
game_data

{'Developer:': 'Firaxis Games,Feral Interactive (Mac),Feral Interactive (Linux)',
 'Genre:': 'Strategy',
 'Publisher:': '2K Games, Feral Interactive (Mac), Feral Interactive (Linux)',
 'Release Date:': 'Oct 8, 2012',
 'Title:': 'XCOM: Enemy Unknown'}

In [None]:
print('\n','url:',url, 
    '\n','reviews_url:',reviews_url,
    '\n','metascore:',metascore, 
    '\n','app_name:',app_name,
    '\n','app_id:',app_id,
    '\n','early_access:',early_access, 
    '\n','esrb:',esrb,       
    '\n','steam_Achievement_n:',steam_Achievement_n, 
    '\n','price:',price,       
    '\n','discount_price:',discount_price, 
    '\n','recent_rev_pos_perc:',recent_rev_pos_perc,
    '\n','recent_reviews_n:',recent_reviews_n, 
    '\n','overall_rev_pos_perc:',overall_rev_pos_perc, 
    '\n','overall_reviews_n:',overall_reviews_n,
    '\n','mature_reasons:',mature_reasons,
    '\n','specs:', specs,     
    '\n','tags:',tags)

In [314]:
#for name in []:
#game_dict[str(name)]=name

game_dict['url']=url
game_dict['reviews_url']=reviews_url
game_dict['metascore']=metascore
game_dict['app_name']=app_name
game_dict['app_id']=app_id
game_dict['early_access']=early_access
game_dict['esrb']=esrb
game_dict['steam_Achievement_n']=steam_Achievement_n
game_dict['price']=price
game_dict['discount_price']=discount_price
game_dict['recent_rev_pos_perc']=recent_rev_pos_perc
game_dict['recent_reviews_n']=recent_reviews_n
game_dict['overall_rev_pos_perc']=overall_rev_pos_perc
game_dict['overall_reviews_n']=overall_reviews_n
game_dict['mature_reasons']=mature_reasons
game_dict['specs']=specs
game_dict['tags']=tags

print(game_dict)

{'Title': 'XCOM: Enemy Unknown', 'Genre': 'Strategy', 'Developer': 'Firaxis Games,Feral Interactive (Mac),Feral Interactive (Linux)', 'Publisher': '2K Games, Feral Interactive (Mac), Feral Interactive (Linux)', 'Release Date': 'Oct 8, 2012', 'url': 'http://store.steampowered.com/app/200510/XCOM_Enemy_Unknown/', 'reviews_url': 'http://steamcommunity.com/app/200510/reviews/?browsefilter=mostrecent&p=1', 'metascore': 89, 'app_name': 'XCOM: Enemy Unknown', 'app_id': '200510', 'early_access': False, 'esrb': 'm', 'steam_Achievement_n': 85, 'price': 29.99, 'discount_price': 29.99, 'recent_rev_pos_perc': 85, 'recent_reviews_n': 135, 'overall_rev_pos_perc': 94, 'overall_reviews_n': 24493, 'mature_reasons': ['Blood and Gore', 'Strong Language', 'Violence'], 'specs': ['Single-player', 'Multi-player', 'Cross-Platform Multiplayer', 'Steam Achievements', 'Full controller support', 'Steam Trading Cards', 'Steam Cloud'], 'tags': ['Turn-Based Strategy', 'Tactical', 'Strategy', 'Sci-fi', 'Turn-Based', '

In [298]:
return {
    'title':title,
    'url': url,
    'price' : price,
    'early_access' : early_access,
    'recent_rev_pos_perc:' : recent_rev_pos_perc,
    'recent_reviews_n:' : recent_reviews_n, 
    'overall_rev_pos_perc:' : overall_rev_pos_perc, 
    'overall_reviews_n:' : overall_reviews_n,
    'specs' : specs,
    'metascore' : metascore,
    'mature_reasons' : mature_reasons
    }

85 <class 'int'>


[]


InvalidSelectorException: Message: invalid selector: Unable to locate an element with the xpath expression //div[@class="game_area_details_specs"]] because of the following error:
SyntaxError: Failed to execute 'evaluate' on 'Document': The string '//div[@class="game_area_details_specs"]]' is not a valid XPath expression.
  (Session info: chrome=65.0.3325.181)
  (Driver info: chromedriver=2.37.544337 (8c0344a12e552148c185f7d5117db1f28d6c9e85),platform=Mac OS X 10.12.6 x86_64)


['Turn-Based Strategy', 'Tactical', 'Strategy', 'Sci-fi', 'Turn-Based', 'Aliens', 'Singleplayer', 'Base Building', 'Turn-Based Tactics', 'Replay Value', 'Multiplayer', 'Action', 'Perma Death', 'RPG', 'Difficult', 'Character Customization', 'Atmospheric', 'Futuristic', 'Isometric', 'Great Soundtrack']


In [None]:
import os, json

with open('game_data.json', 'a') as fp:
    json.dump(j_data, fp, indent = 2)
fp.close()

f = open('game_data.json','r')
old_game_data = f.read()
f.close()

new_game_data = old_game_data.replace("][", ",")

f = open('game_data_new.json','w')
f.write(new_game_data)
f.close()

os.remove('game_data.json')
os.rename('game_data_new.json', 'game_data.json')

In [None]:
import newlinejson as nlj

with nlj.open('sample-data/dictionaries.json') as src, \
        with nlj.open('out.json', 'w') as dst:
    for line in src:
        dst.write(line)

with open('out.json') as f:
    print(f.read()))

Oct 8, 2012


 


(135)






<selenium.webdriver.remote.webelement.WebElement (session="d84946154312d12b3bc5762ed4272875", element="0.9452479479082618-1")>

In [None]:
import json

a_dict = {'new_key': 'new_value'}

with open('test.json') as f:
    data = json.load(f)

data.update(a_dict)

with open('test.json', 'w') as f:
    json.dump(data, f)