In [2]:
import requests
import pandas as pd
from time import sleep
import numpy as np
import re
import random
import os
from fuzzywuzzy import process, fuzz

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import WebDriverException



# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings such as SettingWithCopyWarning raised further down.
import warnings
warnings.filterwarnings('ignore')



### Web Scraping Genres

In [499]:
# Chrome launch settings: exclude the automation switch/extension and ask for a maximized,
# incognito window.
opciones = Options()
for nombre_opcion, valor_opcion in (
    ('excludeSwitches', ['enable-automation']),
    ('useAutomationExtension', False),
):
    opciones.add_experimental_option(nombre_opcion, valor_opcion)
for argumento in ('--start-maximized', 'user.data-dir=selenium', '--incognito'):
    opciones.add_argument(argumento)

In [2]:
# The genres of the games are obtained by scraping a web page. Because of the size of the
# project, only the games from the Twitch data that also appear in a second games file are scraped.

# Load the raw Twitch data and discard the "Unnamed: 0" column.
df = pd.read_csv("../data/twitch/twitch_raw").drop(columns=["Unnamed: 0"])

In [3]:
# Game names found in the Twitch data (duplicates are still present at this point).
juegos_twitch = df["Game"].tolist()

In [4]:
# Second list of game names; it is used to filter the Twitch games.
dff = pd.read_csv("../data/csv/all_games.csv").drop(columns=["Unnamed: 0"])
juegos_meta = dff["name"].tolist()

In [5]:
# Keep only the Twitch games whose exact name also appears in juegos_meta.
# A set makes each membership test O(1) instead of scanning the whole list for every game;
# the resulting list (order and duplicates included) is the same as before.
nombres_meta = set(juegos_meta)
juegos = [juego for juego in juegos_twitch if juego in nombres_meta]

In [6]:
# Collapse repeated names; note that a set has no guaranteed iteration order between sessions.
juegos = {*juegos}

In [7]:
# Number of distinct games kept for scraping.
len(juegos)

522

In [514]:
# To avoid "Not Found" pages, try to build the exact URL slug of each game.

def _slug_metacritic(nombre):
    """Return the URL slug of a game name.

    Apostrophes and colons are removed, the name is lowercased and every
    whitespace character is replaced by a hyphen.
    """
    limpio = nombre.replace("'", "").replace(":", "").lower()
    # Raw string: "\s" inside a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r"\s", "-", limpio)


# Built by iterating `juegos` once, so zip(juegos, juegos_url) pairs each game with its slug
# as long as the set is not modified in between.
juegos_url = [_slug_metacritic(juego) for juego in juegos]

In [517]:
# Scrape the Metacritic PC page of every game and keep the raw text of its details list.

opciones = Options()
opciones.add_experimental_option('excludeSwitches', ['enable-automation'])
opciones.add_experimental_option('useAutomationExtension', False)
opciones.add_argument('--start-maximized')
# NOTE(review): 'user.data-dir' looks like a typo for 'user-data-dir'; left untouched because
# fixing it would make Chrome start using a persistent profile directory -- confirm the intent.
opciones.add_argument('user.data-dir=selenium')
opciones.add_argument('--incognito')

ESPERA_CARGA = 6  # seconds to wait after each page load before reading the page
XPATH_DETALLES = '//*[@id="main"]/div/div[1]/div[1]/div[3]/div/div/div[2]/div[2]/div[2]/ul'

# The options have to be handed to the driver, otherwise none of the settings above is applied.
# NOTE(review): passing the driver path positionally requires a Selenium release that still
# accepts it (the original call had the same requirement).
driver = webdriver.Chrome(ChromeDriverManager().install(), options=opciones)

dicc = {"juego": [], "datos": []}

try:
    for juego, juego_url in zip(juegos, juegos_url):
        driver.get(f"https://www.metacritic.com/game/pc/{juego_url}")
        sleep(ESPERA_CARGA)
        try:
            datos = driver.find_element("xpath", XPATH_DETALLES).text
        except NoSuchElementException:
            # The page has no details list at that location: keep the game with a placeholder.
            datos = "no hay data"
        # Both lists are appended together so they always have the same length.
        dicc["juego"].append(juego)
        dicc["datos"].append(datos)
finally:
    # Always close the browser and the chromedriver process, even if the scrape fails midway.
    driver.quit()

In [518]:
# Tabulate the scraped results: one row per game with its raw details text.
df = pd.DataFrame(data=dicc)
df.head(5)

Unnamed: 0,juego,datos
0,Injustice 2,"Developer: QLOC\nGenre(s): Action, Fighting, 2..."
1,Saints Row,Developer: Volition Inc.\nGenre(s): Action Adv...
2,Shenmue III,"Developer: Ys Net\nGenre(s): Action Adventure,..."
3,L.A. Noire,no hay data
4,Assassin's Creed Valhalla,Developer: Ubisoft Montreal\nGenre(s): Action ...


In [519]:
# Save the raw scrape where the cleaning section reads it back from, so the notebook can be
# re-run top to bottom without moving the file by hand.
df.to_csv("../data/generos/generos_raw.csv")

### Data Wrangling & Data Cleaning

In [11]:
# Reload the raw scrape and discard the "Unnamed: 0" column.
df = pd.read_csv("../data/generos/generos_raw.csv").drop(columns=["Unnamed: 0"])

In [12]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,juego,datos
0,Injustice 2,"Developer: QLOC\nGenre(s): Action, Fighting, 2..."
1,Saints Row,Developer: Volition Inc.\nGenre(s): Action Adv...


In [13]:
# Break the raw details text into a list with one entry per line.
df = df.assign(datos=df["datos"].str.split("\n"))
df.head(5)

Unnamed: 0,juego,datos
0,Injustice 2,"[Developer: QLOC, Genre(s): Action, Fighting, ..."
1,Saints Row,"[Developer: Volition Inc., Genre(s): Action Ad..."
2,Shenmue III,"[Developer: Ys Net, Genre(s): Action Adventure..."
3,L.A. Noire,[no hay data]
4,Assassin's Creed Valhalla,"[Developer: Ubisoft Montreal, Genre(s): Action..."


In [14]:
# One empty column per field label that can appear in the scraped details text.
dicci = {
    campo: []
    for campo in (
        "Developer",
        "Genre(s)",
        "# of players",
        "Cheats",
        "Available On",
        "Rating",
        "More Details and Credits »",
    )
}

In [15]:
# Empty frame that only carries the field columns.
df_data = pd.DataFrame(data=dicci)
df_data.head(5)

Unnamed: 0,Developer,Genre(s),# of players,Cheats,Available On,Rating,More Details and Credits »


In [16]:
# Concatenating the empty frame adds its columns to df (empty for every game) without adding rows.
df = pd.concat((df, df_data), axis=0)
df.head()

Unnamed: 0,juego,datos,Developer,Genre(s),# of players,Cheats,Available On,Rating,More Details and Credits »
0,Injustice 2,"[Developer: QLOC, Genre(s): Action, Fighting, ...",,,,,,,
1,Saints Row,"[Developer: Volition Inc., Genre(s): Action Ad...",,,,,,,
2,Shenmue III,"[Developer: Ys Net, Genre(s): Action Adventure...",,,,,,,
3,L.A. Noire,[no hay data],,,,,,,
4,Assassin's Creed Valhalla,"[Developer: Ubisoft Montreal, Genre(s): Action...",,,,,,,


In [17]:
# Fill the field columns of df from the scraped "Field: value" entries. Fields that a game
# does not have stay as NaN.

# Only the field columns may be filled: "juego" and "datos" hold the inputs, and the credits
# entry is a link label without a value.
columnas_rellenables = set(df.columns) - {"juego", "datos", "More Details and Credits »"}

for index, datos in df["datos"].items():
    for dato in datos:
        # Split on the first colon only, so a value that itself contains ":" is not truncated.
        partes = dato.split(":", 1)
        # Entries without a colon (e.g. "no hay data") or with an unknown label are skipped.
        if len(partes) == 2 and partes[0] in columnas_rellenables:
            # The value is stored as scraped, i.e. it keeps the space that follows the colon.
            df.loc[index, partes[0]] = partes[1]

In [18]:
# Check the filled columns on the first two rows.
df.head(2)

Unnamed: 0,juego,datos,Developer,Genre(s),# of players,Cheats,Available On,Rating,More Details and Credits »
0,Injustice 2,"[Developer: QLOC, Genre(s): Action, Fighting, ...",QLOC,"Action, Fighting, 2D",Online Multiplayer,On GameFAQs,,T,
1,Saints Row,"[Developer: Volition Inc., Genre(s): Action Ad...",Volition Inc.,"Action Adventure, Open-World",,On GameFAQs,Stadia,M,


In [19]:
# Decide whether to keep the genres grouped in one cell or to split them. Grouped, there are
# 184 distinct values (a missing genre counts as one of them), so they are split in order to
# study each genre on its own.

df["Genre(s)"].nunique(dropna=False)

184

In [20]:
# Discard the raw text and the fields that are not used afterwards.
df = df.drop(columns=["datos", "More Details and Credits »", "Cheats", "Available On"])

In [21]:
# Check the remaining columns on the first row.
df.head(1)

Unnamed: 0,juego,Developer,Genre(s),# of players,Rating
0,Injustice 2,QLOC,"Action, Fighting, 2D",Online Multiplayer,T


In [22]:
# Genre table: game name plus its genre string. An explicit copy is taken so that the genre
# column can be reassigned later without pandas raising SettingWithCopyWarning.
df_generos = df[["juego", "Genre(s)"]].copy()
df_generos.head(10)

Unnamed: 0,juego,Genre(s)
0,Injustice 2,"Action, Fighting, 2D"
1,Saints Row,"Action Adventure, Open-World"
2,Shenmue III,"Action Adventure, Open-World"
3,L.A. Noire,
4,Assassin's Creed Valhalla,"Action Adventure, Open-World"
5,Monster Hunter Generations,
6,Planet Coaster,"Strategy, Management, Business / Tycoon"
7,DOOM Eternal,"Action, Shooter, First-Person, Arcade"
8,NBA 2K20,
9,Madden NFL 20,"Action, Sports, General, Team, Football, Sim"


In [23]:
# General information table. An explicit copy is taken so that rows can be dropped from it
# later without pandas raising SettingWithCopyWarning.
df_info = df[["juego", "Developer", "# of players", "Rating"]].copy()
df_info.head(1)

Unnamed: 0,juego,Developer,# of players,Rating
0,Injustice 2,QLOC,Online Multiplayer,T


In [24]:
# Turn the comma-separated genre string into a list of genres per game. assign() builds a new
# frame instead of assigning in place, which raises SettingWithCopyWarning when the frame is a
# selection of another one.
df_generos = df_generos.assign(**{"Genre(s)": df_generos["Genre(s)"].str.split(",")})

In [37]:
# One row per (game, genre) pair, with the whitespace around each genre removed.
df_generos = (
    df_generos
    .explode("Genre(s)")
    .assign(**{"Genre(s)": lambda tabla: tabla["Genre(s)"].str.strip()})
)
df_generos.head()

Unnamed: 0,juego,Genre(s)
0,Injustice 2,Action
0,Injustice 2,Fighting
0,Injustice 2,2D
1,Saints Row,Action Adventure
1,Saints Row,Open-World


In [38]:
# Column dtypes and non-null counts of the genre table.
df_generos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1384 entries, 0 to 521
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   juego     1384 non-null   object
 1   Genre(s)  1384 non-null   object
dtypes: object(2)
memory usage: 32.4+ KB


In [39]:
# Remove the rows that have a missing value (games for which no genre was scraped).
df_generos = df_generos.dropna()

In [40]:
# Summary statistics, transposed so that each column of the table becomes a row.
df_generos.describe().T

Unnamed: 0,count,unique,top,freq
juego,1384,372,World of Tanks,12
Genre(s),1384,86,Action,126


In [41]:
# (rows, columns) of the genre table.
df_generos.shape

(1384, 2)

In [42]:
# Save the genre table; the DataFrame index is written as the first, unnamed column.
df_generos.to_csv("../data/generos/generos.csv")

In [542]:
# Column dtypes and non-null counts of the general information table.
df_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   juego         522 non-null    object
 1   Developer     372 non-null    object
 2   # of players  320 non-null    object
 3   Rating        310 non-null    object
dtypes: object(4)
memory usage: 16.4+ KB


In [543]:
# Summary statistics, transposed so that each column of the table becomes a row.
df_info.describe().T

Unnamed: 0,count,unique,top,freq
juego,522,522,Injustice 2,1
Developer,372,257,Capcom,9
# of players,320,22,No Online Multiplayer,153
Rating,310,4,M,166


In [547]:
# Drop the games for which none of the three fields could be scraped. The result is rebound
# instead of using inplace=True, which raises SettingWithCopyWarning when the frame is a
# selection of another one.
df_info = df_info.dropna(subset=["Developer", "# of players", "Rating"], how="all")

In [548]:
# Save the general information table; the DataFrame index is written as the first, unnamed column.
# NOTE(review): this is written to the working directory, whereas the genre table is saved
# under ../data/generos/ -- confirm the intended location.
df_info.to_csv("info_general.csv")