In [16]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import regex as re
from lxml import etree as et
from itertools import repeat
import csv
import random
import time
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

In [None]:
'''
Un agente de usuario es una cadena que identifica la aplicación y la versión del software que está realizando la solicitud a un servidor web. 
En el contexto del código que estamos revisando, se utiliza para enviar solicitudes HTTP a la página web que estamos extrayendo.
Cada elemento de la lista header_list representa un agente de usuario simulado para un navegador web específico. 
El objetivo de usar múltiples agentes de usuario es simular el comportamiento de diferentes navegadores y evitar ser bloqueado por el servidor debido a solicitudes automatizadas o scraping.
En resumen, esta parte del código ayuda a realizar scraping de manera más amigable y a evitar posibles bloqueos por parte del servidor
'''

In [17]:
# Lista de agentes de usuario
header_list = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,                           like Gecko) Chrome/103.0.5060.66 Safari/537.36",
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0",
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"]

# URL base para la página de juegos
base_url = 'https://www.backloggd.com/games/lib/popular?page='
games_list = []
d = 0

In [18]:
# Lista para almacenar enlaces de juegos
game_links = []

for page_no in tqdm_notebook(range(4510)): # en range se indica el número de páginas sobre las que se quiere obtener enlaces, en cada página hay unos 36 juegos. En este caso añado el total de páginas disponibles
    page_url = base_url + str(page_no)
    #print(page_url)
    user_agent = random.choice(header_list)
    header = {"User-Agent": user_agent}
    webpage = requests.get(page_url, headers = header)
    if webpage.status_code == 200:
        soup1 = BeautifulSoup(webpage.content, 'html.parser')
        soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')
        g = soup2.find('div', {'class' : 'row show-release toggle-fade'})
        games = g.find_all('div', {'class' : 'col-2 my-2 px-1 px-md-2'})
        game_ref = [game.find('a').get('href') for game in games]
        game_links.extend(['https://www.backloggd.com' + ref for ref in game_ref])

  0%|          | 0/4510 [00:00<?, ?it/s]

In [19]:
# check del número de resultados (links de juegos) que se han scrapeado y almacenado correctamente
print(len(game_links))

162324


In [20]:
# Definir las columnas para el DataFrame
cols = ['Title', 'Release Date', 'Team', 'Rating', 'Times Listed', 'Number of Reviews', 'Genres', 'Summary', 'Reviews','Platforms', 'Plays', 'Playing', 'Backlogs', 'Wishlist']
backloggd = pd.DataFrame(columns=cols)

In [21]:
# Iterar sobre los enlaces de juegos para obtener información detallada
for link in tqdm_notebook(game_links): # tqdm_notebook es una herramienta que permite mostrar barra de progreso al trabajar con bucles grandes
    user_agent = random.choice(header_list)
    header = {"User-Agent": user_agent}
    webpage = requests.get(link, headers = header)
     # Verificar si la página se cargó correctamente
    if webpage.status_code == 200:
        soup1 = BeautifulSoup(webpage.content, 'html.parser')
        soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')
        
        # Obtener información básica del juego (título, fecha de lanzamiento, equipos, calificación, etc.)
        title = soup2.find('div', {'class' : 'col-auto pr-1'}).get_text().strip()
        
        release_date = ' '.join(soup2.find('div', {'class' : 'col-auto mt-auto pr-0'}).get_text().strip().split()[-3:])

        try:
            teams = soup2.find('div', {'class' : 'col-auto pl-lg-1 sub-title'})
            teams = teams.find_all('a')
            teams = [i.get_text().strip() for i in teams]
        except:
            teams = np.nan
            #print(teams)
        
        try:
            rating = float(soup2.find(id = 'score').get_text().strip()[-3:])
        except:
            rating = np.nan

        table = soup2.find_all('div', {'class' : 'col-12 mb-1'})#.get_text().strip()
        feats = [f.get_text().strip().split('\n')[0].strip() for f in table] # feats contiene las características de la tabla
        # las variables 'Plays', 'Playing', 'Backlogs', y 'Wishlist' se llenan con los valores de la lista results
        results = [r.get_text().strip().split('\n')[-1].strip() for r in table] # results recoge la info
        #print(feats,results)

        # dicted = {}
        # for i in range(len(feats)):
        #     dicted[feats[i]] = results[i]
        
        nlists = soup2.find('p', {'class' : 'game-page-sidecard'}).get_text().strip().split()[0]
        
        nreviews = soup2.find('p', {'class' : 'game-page-sidecard'}).get_text().strip().split()[0]

        genres = soup2.find_all('p',{'class' : 'genre-tag'})
        genres = [genre.get_text().strip() for genre in genres]

        summary = soup2.find(id = 'collapseSummary').get_text().strip()

        review_section = soup2.find(id = 'game-reviews-section')
        reviews = review_section.find_all('div', {'class' : 'row pt-2 pb-1 review-card'})
        reviews = [r.find('div', {'class' : 'formatted-text'}).get_text().strip() for r in reviews]

        # Obtener las plataformas desde el HTML
        platform_tags = soup2.find_all('a', class_='game-page-platform')
        platforms = ', '.join([platform.get_text(strip=True) for platform in platform_tags])
        platforms_row = [platforms]

        # 'row' contiene la lista de variables con la información correspondiente de cada variable hasta 'platform'
        # IMPORTANTE --> si se quieren añadir nuevas variables, colocarlas en la lista de la variable 'cols' de arriba del todo antes que 'plays' para que no se solapen los datos
        row = [title, release_date, teams, rating, nlists, nreviews, genres, summary, reviews, platforms_row]
        row.extend(results)
        backloggd.loc[len(backloggd.index)] = row

  0%|          | 0/162324 [00:00<?, ?it/s]

In [15]:
backloggd.head()

Unnamed: 0,Title,Release Date,Team,Rating,Times Listed,Number of Reviews,Genres,Summary,Reviews,Platforms,Plays,Playing,Backlogs,Wishlist
0,Elden Ring,"Feb 25, 2022","[FromSoftware, Bandai Namco Entertainment]",4.5,6.2K,6.2K,"[Adventure, RPG]","Elden Ring is a fantasy, action and open world...","[👍, A good effort from konami, but at the end ...","[Windows PC, PlayStation 4, Xbox One, PlayStat...",33K,5.2K,8.7K,7.3K
1,The Legend of Zelda: Tears of the Kingdom,"May 12, 2023","[Nintendo, Nintendo EPD Production Group No. 3]",4.5,4.2K,4.2K,"[Adventure, RPG]",The Legend of Zelda: Tears of the Kingdom is t...,"[nice addition to the first game, played only ...",[Nintendo Switch],16K,6.5K,6.3K,6.9K
2,The Legend of Zelda: Breath of the Wild,"Mar 03, 2017","[Nintendo EPD Production Group No. 3, Nintendo]",4.4,6.6K,6.6K,"[Adventure, Puzzle, RPG]",The Legend of Zelda: Breath of the Wild is the...,"[👍, very nice game, my first Legend of Zelda, ...","[Wii U, Nintendo Switch]",48K,3.7K,7.6K,3.9K
3,Hades,"Dec 07, 2018",[Supergiant Games],4.3,4.2K,4.2K,"[Adventure, Brawler, Indie, RPG]",A rogue-lite hack and slash dungeon crawler in...,"[I enjoyed it quite a bit, but with games like...","[Windows PC, Mac, PlayStation 4, Xbox One, Pla...",35K,4.4K,10K,5.3K
4,Hollow Knight,"Feb 24, 2017",[Team Cherry],4.4,4.6K,4.6K,"[Adventure, Indie, Platform]",A 2D metroidvania with an emphasis on close co...,"[didnt got the true ending, thats a hard metro...","[Windows PC, Mac, Wii U, Linux, Nintendo Switch]",35K,3.6K,13K,3.6K


In [None]:
backloggd.to_csv('backloggd.csv')