<a href="https://colab.research.google.com/github/Matheus-Chaves/web-scraping/blob/main/web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web Scraping - Collecting 60k rows of video game data

> Kaggle Dataset -> https://www.kaggle.com/datasets/matheusfonsecachaves/popular-video-games

> Inspiration -> https://www.kaggle.com/datasets/arnabchaki/popular-video-games-1980-2023

In [6]:
#@title Imports

import requests
from bs4 import BeautifulSoup
import soupsieve as sv
import random
from tqdm.notebook import tqdm
import time
import pickle
import pandas as pd

In [7]:
#@title Constants

# Root of the scraped site; relative links found on pages are joined onto it.
BASE_URL = 'https://www.backloggd.com'
# Listing URL for the "popular" games library; the page number is appended to it.
PAGE_URL = f'{BASE_URL}/games/lib/popular?page='
# Pause, in seconds, applied after every request made through `request()`.
REQUEST_DELAY = 0.75
# Pool of browser User-Agent strings; `request()` picks one at random per call.
USER_AGENT_LIST = [
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
  'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
]

## Getting total number of pages

In [9]:
# Fetch the listing URL once to confirm the site answers. The timeout stops the
# cell from hanging forever: `requests` waits indefinitely when none is given.
response = requests.get(PAGE_URL, timeout=30)
response.status_code

200

In [13]:
# Parse the HTML of the response fetched above with Python's built-in parser.
soup = BeautifulSoup(response.content, "html.parser")
# Show the parsed document's top-level nodes (this produces a very large output).
soup.contents

['html',
 '\n',
 <html>
 <head>
 <!-- Google Tag Manager -->
 <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
 		new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
 		j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
 		'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
 		})(window,document,'script','dataLayer','GTM-TTLBSBS');</script>
 <!-- End Google Tag Manager -->
 <!-- HTML Meta Tags -->
 <title>Backloggd - A Video Game Collection Tracker</title>
 <meta content="Keep a virtual backlog of your video game collection, then rate and review the ones you've played to share with your friends!" name="description"/>
 <!-- Google / Search Engine Tags -->
 <meta content="Backloggd - A Video Game Collection Tracker" itemprop="name"/>
 <meta content="Keep a virtual backlog of your video game collection, then rate and review the ones you've played to share with your friends!" itemprop="description"/>


In [None]:
# Link inside the second-to-last <span> of the pagination <nav>; its text is
# parsed below as the total number of listing pages.
css_pagination_selector = 'nav.pagination > span:nth-last-child(2) > a'
# select_one returns None when nothing matches, in which case `.text` raises AttributeError.
num_pages = int(sv.select_one(css_pagination_selector, soup).text)
print(num_pages)

3850


## Web Scraping - Getting game links

In [18]:
def request(url, timeout=30):
  """Fetch `url` with a randomly chosen User-Agent, then pause before returning.

  Args:
    url: Address to GET.
    timeout: Seconds to wait for the server before `requests` raises a
      timeout exception. Without a timeout a stalled connection would block
      the scraping loops indefinitely.

  Returns:
    The `requests.Response` produced by the GET call.
  """
  # A different User-Agent may be sent on every call.
  headers = {'User-Agent': random.choice(USER_AGENT_LIST)}
  response = requests.get(url, headers=headers, timeout=timeout)
  # Throttle: wait REQUEST_DELAY seconds after each completed request.
  time.sleep(REQUEST_DELAY)
  return response

In [None]:
# CSS selector for the elements whose `href` holds each game's relative link.
css_link_selector = 'div.col-2.my-2.px-1.px-md-2 > a [href]'
game_links = []

# Walk every listing page (1-based) and collect absolute game URLs.
for page in tqdm(range(1, num_pages + 1)):
  page_response = request(PAGE_URL + str(page))

  if page_response.status_code == 200:
    soup = BeautifulSoup(page_response.content, "html.parser")
    links = sv.select(css_link_selector, soup)
    game_links += [f"{BASE_URL}{link['href']}" for link in links]
  else:
    # Stop at the first failing page; links gathered so far stay in `game_links`.
    print(f"Page: {page}\nStatus code: {page_response.status_code}")
    break

  0%|          | 0/3813 [00:00<?, ?it/s]

### Saving game links - Google Drive

Saving the collected links to Google Drive means the long scrape above does not have to be repeated if the Colab runtime is lost.

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the notebook can
# read and write the pickle files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
# Pickle file on the mounted Drive that stores the list of scraped game links.
GAME_LINKS_PATH = '/content/drive/MyDrive/Colab Notebooks/ZRP_WebScraping/game_links.pkl'

In [None]:
# Uncomment to write the freshly scraped `game_links` list to GAME_LINKS_PATH
# (this overwrites any previously saved file).
# with open(GAME_LINKS_PATH, 'wb') as file:
#   pickle.dump(game_links, file)

In [None]:
# Reload the saved link list so the listing pages do not have to be scraped again.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(GAME_LINKS_PATH, 'rb') as file:
  game_links = pickle.load(file)

In [None]:
# Preview the first ten links.
game_links[:10]

['https://www.backloggd.com/games/elden-ring/',
 'https://www.backloggd.com/games/the-legend-of-zelda-breath-of-the-wild/',
 'https://www.backloggd.com/games/hades--1/',
 'https://www.backloggd.com/games/hollow-knight/',
 'https://www.backloggd.com/games/undertale/',
 'https://www.backloggd.com/games/minecraft/',
 'https://www.backloggd.com/games/omori/',
 'https://www.backloggd.com/games/the-legend-of-zelda-tears-of-the-kingdom/',
 'https://www.backloggd.com/games/resident-evil-4--1/',
 'https://www.backloggd.com/games/nier-automata/']

In [None]:
# Number of links in the list.
len(game_links)

137253

## Web Scraping - Getting game info

In [None]:
#@title Testing selectors

# title, release_date, developers, summary, platforms, genres, rating, plays, playing, backlogs, wishlist, lists, reviews
print('Title:', sv.select_one('#title h1', soup).text)
print('Release Date:', sv.select_one('.sub-title a', soup).text)
developers_element = sv.select_one('.col-auto.pl-lg-1.sub-title', soup)
developers = [i.text.strip() for i in developers_element.select('a')] if developers_element else []
print(developers)
try:
  print('Developers:', [i.text.strip() for i in sv.select_one('.col-auto.pl-lg-1.sub-title', soup).select('a')])
except:
  print('Developers:', [])
print('Summary:', sv.select_one('#collapseSummary', soup).text.strip())
print('Platforms:', [i.text.strip() for i in sv.select('.game-page-platform', soup)])
print('Genres:', [i.text.strip() for i in sv.select('.genre-tag', soup)])
print('Rating:', sv.select_one('#score > h1', soup).text)
plays, playing, backlogs, wishlist = [i.text.strip() for i in sv.select('.col-auto.ml-auto.pl-0 p', soup)]
print('Plays:', plays)
print('Playing:', playing)
print('Backlogs:', backlogs)
print('Wishlist:', wishlist)
lists, reviews = [i.text.strip().split()[0] for i in sv.select('.game-page-sidecard', soup)]
print('Lists:', lists)
print('Reviews:', reviews)

In [None]:
#@title Creating Pandas data frame
cols = ['Title', 'Release_Date', 'Developers', 'Summary', 'Platforms', 'Genres', 'Rating', 'Plays', 'Playing', 'Backlogs', 'Wishlist', 'Lists', 'Reviews']
df_games = pd.DataFrame(columns=cols)

### Getting data for 60k games

In [None]:
def _text_or_none(element, strip=False):
  """Return `element.text` (optionally stripped), or None when `element` is None."""
  if element is None:
    return None
  text = element.text
  return text.strip() if strip else text


def _stripped_texts(elements, expected_count):
  """Return the stripped text of each element, or all-None on a count mismatch.

  The values are assigned to fields by position, so when the page does not
  contain exactly `expected_count` elements the whole group is reported as
  missing rather than risking values landing in the wrong field.
  """
  texts = [element.text.strip() for element in elements]
  if len(texts) != expected_count:
    return [None] * expected_count
  return texts


def _first_word(text):
  """Return the first whitespace-separated token of `text`, or None if there is none."""
  if text is None:
    return None
  words = text.split()
  return words[0] if words else None


def scrap_game_data(soup):
  """Extract one game's fields from its parsed page.

  Args:
    soup: Parsed HTML of a single game page.

  Returns:
    A list of 13 values in this order: title, release_date, developers,
    summary, platforms, genres, rating, plays, playing, backlogs, wishlist,
    lists, reviews. `developers`, `platforms` and `genres` are lists of
    strings (possibly empty); every other optional field is a string, or
    None when the page does not contain it.

  Raises:
    ValueError: If the page has no title element, which means it is not a
      usable game page.
  """
  title_element = sv.select_one('#title h1', soup)
  if title_element is None:
    raise ValueError("Game title not found ('#title h1'); is this a game page?")
  title = title_element.text
  release_date = _text_or_none(sv.select_one('.sub-title a', soup))
  developers_element = sv.select_one('.col-auto.pl-lg-1.sub-title', soup)
  developers = [i.text.strip() for i in developers_element.select('a')] if developers_element else []
  summary = _text_or_none(sv.select_one('#collapseSummary', soup), strip=True)
  platforms = [i.text.strip() for i in sv.select('.game-page-platform', soup)]
  genres = [i.text.strip() for i in sv.select('.genre-tag', soup)]
  rating = _text_or_none(sv.select_one('#score > h1', soup))
  # Four counters are expected, in this order.
  plays, playing, backlogs, wishlist = _stripped_texts(sv.select('.col-auto.ml-auto.pl-0 p', soup), 4)
  # Each side card's text starts with its count; keep only that first token.
  lists, reviews = [_first_word(text) for text in _stripped_texts(sv.select('.game-page-sidecard', soup), 2)]

  return [title, release_date, developers, summary, platforms, genres, rating,
          plays, playing, backlogs, wishlist, lists, reviews]

In [None]:
NUMBER_OF_GAMES = 60000
ERROR_404_TEXT = 'Welp, this is awkward...'

# Resume support: if 'df_games' was already saved in Google Drive and reloaded,
# scraping continues from the link at offset len(df_games).
# NOTE(review): links skipped as "not found" add no row, so after such a skip the
# offset lags behind the real position and a resumed run re-scrapes (and
# duplicates) the most recent links.
for link in tqdm(game_links[len(df_games):NUMBER_OF_GAMES]):
  game_response = request(link)

  if game_response.status_code != 200:
    print(f"Link: {link}\nStatus: {game_response.status_code}")
    break

  # Responses containing the error text are skipped even though their status is
  # 200 (presumably the site's "not found" page); skip before parsing so no
  # work is spent on a page that is discarded.
  if ERROR_404_TEXT in game_response.text:
    continue

  soup = BeautifulSoup(game_response.content, "html.parser")
  # adding a new line of data into the data frame
  df_games.loc[len(df_games)] = scrap_game_data(soup)

  0%|          | 0/3 [00:00<?, ?it/s]

### Saving game data - Google Drive

In [20]:
# Pickle file on the mounted Drive that stores the scraped games data frame.
GAME_DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/ZRP_WebScraping/df_games.pkl'

In [None]:
# Uncomment to write the current `df_games` to GAME_DATA_PATH
# (this overwrites any previously saved file).
# with open(GAME_DATA_PATH, 'wb') as file:
#   pickle.dump(df_games, file)

In [21]:
# Reload the saved data frame instead of scraping the game pages again.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(GAME_DATA_PATH, 'rb') as file:
  df_games = pickle.load(file)

In [22]:
# Display the loaded data frame.
df_games

Unnamed: 0,Title,Release_Date,Developers,Summary,Platforms,Genres,Rating,Plays,Playing,Backlogs,Wishlist,Lists,Reviews
0,Elden Ring,"Feb 25, 2022","[FromSoftware, Bandai Namco Entertainment]","Elden Ring is a fantasy, action and open world...","[Windows PC, PlayStation 4, Xbox One, PlayStat...","[Adventure, RPG]",4.5,21K,4.1K,5.6K,5.5K,4.6K,3K
1,The Legend of Zelda: Breath of the Wild,"Mar 03, 2017","[Nintendo, Nintendo EPD Production Group No. 3]",The Legend of Zelda: Breath of the Wild is the...,"[Wii U, Nintendo Switch]","[Adventure, Puzzle]",4.4,35K,3.1K,5.6K,3K,5.1K,3K
2,Hades,"Dec 07, 2018",[Supergiant Games],A rogue-lite hack and slash dungeon crawler in...,"[Windows PC, Mac, PlayStation 4, Xbox One, Pla...","[Adventure, Brawler, Indie, RPG]",4.3,25K,3.5K,7.3K,4K,3.2K,2.1K
3,Hollow Knight,"Feb 24, 2017",[Team Cherry],A 2D metroidvania with an emphasis on close co...,"[Windows PC, Mac, Linux, Nintendo Switch]","[Adventure, Indie, Platform]",4.4,25K,2.7K,9.6K,2.6K,3.4K,2.1K
4,Undertale,"Sep 15, 2015","[tobyfox, 8-4]","A small child falls into the Underground, wher...","[Windows PC, Mac, Linux, PlayStation 4, Xbox O...","[Adventure, Indie, RPG, Turn Based Strategy]",4.2,32K,728,5.7K,2.1K,3.9K,2.5K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,Dragon Spirits,"Apr 25, 2023","[FHNBHJ, indienova]",While constantly working on a video game witho...,[Windows PC],[RPG],,2,0,1,5,3,0
59996,Pathfinder: Kingmaker - Definitive Edition,"Aug 18, 2020","[Owlcat Games, Deep Silver]",The Definitive Edition will have all of the DL...,"[PlayStation 4, Xbox One]","[RPG, Tactical]",3.7,9,0,19,1,7,1
59997,Sainth,"Apr 11, 2008",[Neo Kuriyo],"Years after the events of ""Wraith"", a young gi...",[Windows PC],[],,1,0,0,2,0,0
59998,Dragon Spirits,"Apr 25, 2023","[FHNBHJ, indienova]",While constantly working on a video game witho...,[Windows PC],[RPG],,2,0,1,5,3,0


## Transforming dataframe to CSV


In [None]:
# Export the data frame to a CSV file in the current working directory.
# NOTE(review): the row index is written as an extra, unnamed first column
# (to_csv defaults to index=True) — confirm whether that column is wanted.
df_games.to_csv("backloggd_games.csv")