# DATA102 Homework 1: Web Scraping

**Group Number**: 7 <br/>
**Members**:
- Jose Maria Angelo Guerra
- Kyle Carlo Lasala
- Katrina Bianca Roco
- Antonio Jose Maria Lorenzo
- Josh Angelo Theodore Borro
- Charles Joseph Hinolan

**Section**: S11

**Instructor**: Mr. Jude Michael Teves

### Import Libraries

In [1]:
import requests
import numpy as np
import pandas as pd
import time
from urllib3.exceptions import ReadTimeoutError
from threading import Thread
from IPython.display import clear_output

# Selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, TimeoutException
from selenium.webdriver.chrome.service import Service #microsft edge, change to webdriver.chrome.service for chrome

### Parse Using BeautifulSoup

In [3]:
from bs4 import BeautifulSoup
page = "https://itch.io/games"
#contact is my personal email
headers = {"User-Agent": "EducationalScraper/1.0 (contact: hinolancj@gmail.com)"}

# Disallowed paths based on itch.io/robots.txt
disallowed_paths = ["/embed/", "/embed-upload/", "/search", "/checkout/", "/game/download/", "/bundle/download/", "/register-for-purchase/", "/email-feedback/"]

# Function to check if URL is allowed
def is_allowed(url):
    for path in disallowed_paths:
        if path in url:
            return False
    return True


url = f"{page}"
if is_allowed(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        print(soup.prettify())
        time.sleep(2)
    except requests.exceptions.RequestException as e:
        print("Error: Unable to fetch the page.")
else:
    print(f"Skipping disallowed URL: {url}")

<!DOCTYPE HTML>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IBM HomePage Builder 2001 V5.0.0 for Windows" name="GENERATOR"/>
  <meta content="3BB4D18369B9C21326AF7A99FCCC5A09" name="msvalidate.01">
   <meta content="537395183072744" property="fb:app_id"/>
   <title>
    Top games - itch.io
   </title>
   <meta name="csrf_token" value="WyJQSjlRIiwxNzM4MDEyNjUyLCJvZ2VaaTRYcWw4U0hROUciXQ==.f0kCF7h+jvnkbFXpBSBe4ArJh1E="/>
   <meta content="Top games" property="og:title"/>
   <meta content="itch.io" property="og:site_name"/>
   <meta content="4503599627724030" property="twitter:account_id"/>
   <link href="?page=2" rel="next"/>
   <link href="/static/manifest.json" rel="manifest"/>
   <meta content="@itchio" name="twitter:creator"/>
   <meta content="Top games" name="twitter:title"/>
   <meta content="Explore games on itch.io" name="twitter:description"/>
   <meta content="@itchio" name="twitter:site"/>
   <meta content="summary_large_image" name="twitter:card"/>
   

### Setup Browser Automation

In [4]:
driver_path = "C:/Users/Kyle Carlo C. Lasala/Documents/CODING/Python/DATA102/driver/chromedriver.exe" #edit your driver's path
url = "https://itch.io/games" 

#service = Service(driver_path)
driver = webdriver.Chrome()

driver.get(url)

### Extracting the Data

data to extract (for now):
1. game id (class=game_cell has_cover)
2. game title (class=game_title)
3. genre (class=game_genre)
4. author (class=game_author)
5. game text (class=game_text)
6. link

-------------- need to click on the game to get the following data below--------------

7. status
8. average rating
9. rating count
10. tags 
11. average session time 
12. platforms 

#### Auto Scrolling Algorithm 

In [6]:
#auto scrolling algorithm 
#NOTE: max_game_count limits the number of games to parse through -> limits both the auto scrolling and the games list
pause = 0.5
lastHeight = driver.execute_script("return document.body.scrollHeight")

length = 0
max_game_count = 1500

while length < max_game_count:
    game_list = driver.find_elements(By.XPATH,"//div[@class='game_cell has_cover']")
    length = len(game_list)
    
    # checking progress
    clear_output(wait=True)
    print('Games Loaded:', length)
    
    if length >= max_game_count:
        break
        
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
print('DONE!')

Games Loaded: 1508
DONE!


#### Retrieving Game Info from Homepage

In [7]:
# truncate based on max_game_count
games = game_list[:max_game_count]

#extract game data
#NOTE: some games do not have all the data, so we need to account for that by adding N/A if the data is not present in order for the lists to be the same length
#NOTE: moved the code for obtaining game links here because for some reason it always got different lengths of game lists
def append_to_data(*args, game_id, link, data_list):
    data_list.append(game_id)
    for each in args:
        # check for null data
        data_list.append("N/A" if not each else each[0].text)
    data_list.append(link)
        
def retrieve_games_info(start_index, end_index, games_info):
    for game in games[start_index:end_index]:
        data = []
        # all games are guaranteed to have a game_id
        game_id = game.get_attribute("data-game_id")    
        title = game.find_elements(By.XPATH, ".//a[@class='title game_link']")
        genre = game.find_elements(By.XPATH, ".//div[@class='game_genre']")
        author = game.find_elements(By.XPATH, ".//div[@class='game_author']")
        text = game.find_elements(By.XPATH, ".//div[@class='game_text']")
        link = game.find_element(By.XPATH, ".//a[@class='title game_link']").get_attribute('href')
        
        # append the game_id, title, genre, author, and text to data array
        append_to_data(title, genre, author, text, game_id=game_id, link=link, data_list=data)
    
        # append the data array to games_info numpy array
        games_info = np.vstack((games_info, data))
    return games_info

In [8]:
# create a thread to retrieve the game info from scraped games
class RetrieveThread(Thread):
    def __init__(self, start_index, end_index): 
        Thread.__init__(self) 
        self.start_index = start_index
        self.end_index = end_index
        self.games_info = np.empty(shape=[0,6])

    def run(self):
        self.games_info = retrieve_games_info(self.start_index, self.end_index, games_info=self.games_info)

In [9]:
num_threads = 2
index_interval = max_game_count // num_threads
threads = []

# initializing the threads
for each in range(num_threads):
    start_index = each * index_interval
    # making sure the end_index is the last index
    end_index = max_game_count if each == num_threads - 1 else each * index_interval + index_interval
    t = RetrieveThread(start_index, end_index)
    t.start()
    threads.append(t)

# sync
for each in threads:
    each.join()

In [10]:
# combine all data from threads
games_info = np.empty(shape=[0,6])
for each in threads:
    games_info = np.vstack((games_info, each.games_info))

games_info_df = pd.DataFrame(games_info)
games_info_df.columns = ['game_id', 'title', 'genre', 'author', 'text', 'link']
games_info_df

Unnamed: 0,game_id,title,genre,author,text,link
0,2955066,Incredibox - Sprunki,,wolf_hal,,https://wolf-hal.itch.io/incredibox-sprunki
1,3224595,Little Bartmares,Adventure,David Mills,WHY YOU LITTLE!!!,https://shadowband.itch.io/little-bartmares
2,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...,https://infinity-entertainment.itch.io/the-apa...
3,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m...",https://dreadloom-studios.itch.io/pretend-its-...
4,2513640,Ignited Entry,Adventure,JordiBoi,The corpse is alive.,https://jordiboi.itch.io/ignited-entry
...,...,...,...,...,...,...
1495,1225066,Plactions,Platformer,rob1221,Plactions is a puzzle platformer with consumab...,https://rob1221.itch.io/plactions
1496,1453482,Thrombolo,Platformer,Autumn West,,https://autumnwest.itch.io/thrombolo
1497,127979,Dominique Pamplemousse and Dominique Pamplemou...,Adventure,Squinky,A disorienting interactive musical romp with y...,https://squinky.itch.io/dompam2
1498,855338,Baba Is You level editor beta,Puzzle,Hempuli,A limited-time beta version of the upcoming ed...,https://hempuli.itch.io/baba-is-you-level-edit...


In [11]:
# close the current instance of driver
driver.quit()

In [80]:
# checkpoint
games_info_df.to_csv('Games List 5 Feat.csv')

#### Retrieving More Game Info from Game Site

In [17]:
# load checkpoint if needed
games_info_df = pd.read_csv("Games List 5 Feat.csv")
max_game_count = len(games_info_df)
games_info_df.head()

Unnamed: 0.1,Unnamed: 0,game_id,title,genre,author,text,link
0,0,2955066,Incredibox - Sprunki,,wolf_hal,,https://wolf-hal.itch.io/incredibox-sprunki
1,1,3224595,Little Bartmares,Adventure,David Mills,WHY YOU LITTLE!!!,https://shadowband.itch.io/little-bartmares
2,2,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...,https://infinity-entertainment.itch.io/the-apa...
3,3,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m...",https://dreadloom-studios.itch.io/pretend-its-...
4,4,2513640,Ignited Entry,Adventure,JordiBoi,The corpse is alive.,https://jordiboi.itch.io/ignited-entry


In [3]:
# check the number of links
id_list = games_info_df['game_id']
link_list = games_info_df['link']
print(len(link_list))

1500


In [4]:
# extract more game data
def retrieve_more_games_info(start_index, end_index, more_info):
    driver = webdriver.Chrome()
    # extend page load timeout to 5 mins.
    driver.set_page_load_timeout(300)
    for game_id, url in zip(id_list[start_index:end_index], link_list[start_index:end_index]):
        data = []
        
        try:
            driver.get(url)
            
            # scroll and click 'more information' button 
            info_button = driver.find_element(By.XPATH, "//a[@class='toggle_info_btn']")
            driver.execute_script("arguments[0].scrollIntoView();", info_button)
            info_button.click()
            time.sleep(2) # pause for it load a bit

            status = driver.find_elements(By.XPATH, "//tr[td[text()='Status']]/td[2]")
            rating_row = driver.find_element(By.XPATH, "//tr[td[text()='Rating']]/td[2]")
            rating = rating_row.find_element(By.XPATH, "//div[@class='star_value']").get_attribute("content")
            rating_count = rating_row.find_element(By.XPATH, "//span[@class='rating_count']").get_attribute("content")
            tags = driver.find_elements(By.XPATH, "//tr[td[text()='Tags']]/td[2]")
            sesh_time = driver.find_elements(By.XPATH, "//tr[td[text()='Average session']]/td[2]")
            platforms = driver.find_elements(By.XPATH, "//tr[td[text()='Platforms']]/td[2]")
    
            # check if the element is empty
            data.append(game_id)
            data.append("N/A" if not status else status[0].text)
            data.append("N/A" if not rating else rating)
            data.append("N/A" if not rating_count else rating_count)
            data.append("N/A" if not tags else tags[0].text)
            data.append("N/A" if not sesh_time else sesh_time[0].text)
            data.append("N/A" if not platforms else platforms[0].text)

            # append the data array to games_info numpy array
            more_info = np.vstack([more_info, data])

        except NoSuchElementException:
            print("No Such Element Error for GAME ID:", game_id)
            data.extend([game_id, "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"])
            more_info = np.vstack([more_info, data])
            continue

        except ElementNotInteractableException:
            print("Element Not Interactable Error for GAME ID:", game_id)
            data.extend([game_id, "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"])
            more_info = np.vstack([more_info, data])
            continue

        except ReadTimeoutError:
            print("Read Timeout Error for GAME ID:", game_id)
            data.extend([game_id, "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"])
            more_info = np.vstack([more_info, data])
            continue

        except TimeoutException:
            print("Timeout Error for GAME ID:", game_id)
            data.extend([game_id, "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"])
            more_info = np.vstack([more_info, data])
            continue

        except Exception:
            print("Uknown Error for Game ID:", game_id)
            data.extend([game_id, "N/A", "N/A", "N/A", "N/A", "N/A", "N/A"])
            more_info = np.vstack([more_info, data])
            continue
    
    driver.close()
    return more_info

In [5]:
class RetrieveMoreThread(Thread):
    def __init__(self, start_index, end_index): 
        Thread.__init__(self) 
        self.start_index = start_index
        self.end_index = end_index
        self.more_info = np.empty(shape=[0,7])

    def run(self):
        self.more_info = retrieve_more_games_info(self.start_index, self.end_index, more_info=self.more_info)

In [6]:
num_threads = 8
index_interval = max_game_count // num_threads
threads = []

# initializing the threads
thread_count = 1
for each in range(num_threads):
    start_index = each * index_interval
    # making sure the end_index is the last index
    end_index = max_game_count if each == num_threads - 1 else each * index_interval + index_interval
    t = RetrieveMoreThread(start_index, end_index)
    t.start()
    threads.append(t)
    print(thread_count, start_index, end_index)
    thread_count += 1

# sync
for each in threads:
    each.join()

1 0 187
2 187 374
3 374 561
4 561 748
5 748 935
6 935 1122
7 1122 1309
8 1309 1500
No Such Element Error for GAME ID: 2869923
No Such Element Error for GAME ID: 1370318
No Such Element Error for GAME ID: 2384541
Read Timeout Error for GAME ID: 3079599
Read Timeout Error for GAME ID: 877352
Element Not Interactable Error for GAME ID: 589627
Read Timeout Error for GAME ID: 129425
Element Not Interactable Error for GAME ID: 1208403
Read Timeout Error for GAME ID: 65181
Read Timeout Error for GAME ID: 1948914
Read Timeout Error for GAME ID: 1559343
Read Timeout Error for GAME ID: 1881272
Read Timeout Error for GAME ID: 1511140
No Such Element Error for GAME ID: 1581512
No Such Element Error for GAME ID: 1975309
Read Timeout Error for GAME ID: 1365045
Read Timeout Error for GAME ID: 1109093
Read Timeout Error for GAME ID: 3223767
Read Timeout Error for GAME ID: 1022835
Read Timeout Error for GAME ID: 749912
Read Timeout Error for GAME ID: 857480
Uknown Error for Game ID: 117955
Read Timeout

In [14]:
# combine all data from threads
more_info = np.empty(shape=[0,7])
for each in threads:
    more_info = np.vstack((more_info, each.more_info))

more_info_df = pd.DataFrame(more_info)
more_info_df.columns = ['game_id', 'status','rating','rating_count','tags','ave_session_time','platforms']
more_info_df

Unnamed: 0,game_id,status,rating,rating_count,tags,ave_session_time,platforms
0,2955066,Released,4.6,1178,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,3224595,Released,4.5,35,"3D, Atmospheric, Horror, PSX (PlayStation), Ps...",,"Windows, macOS, Linux"
2,3216520,Released,4.1,92,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
3,3148668,Released,4.3,280,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
4,2513640,Released,4.8,301,"Atmospheric, Horror, Low-poly, PSX (PlayStatio...",About an hour,Windows
...,...,...,...,...,...,...,...
1495,1225066,Released,4.7,91,"2D, Puzzle-Platformer, Singleplayer, Stencyl",A few minutes,HTML5
1496,1453482,Released,4.8,11,"2D, Controller, Funny, Short, Singleplayer, weird",About an hour,Windows
1497,127979,Released,4.1,73,"artgame, brass, Detective, Gender, musical, No...",About an hour,"Windows, macOS, Linux"
1498,855338,In development,4.7,15,,,"Windows, macOS, Linux"


### Manually Check the Games where a Scraping Error Occurred.

In [187]:
error_list = [
    2869923,1370318,2384541,3079599,877352,589627,129425,1208403,65181,
    1948914,1559343,1881272,1511140,1581512,1975309,1365045,1109093,3223767,
    1022835,749912,857480,117955,1029510,583081,1534262,2362775,1522359,329428,2008749
]
index_error_list = []
for each in error_list:
    game_data = games_info_df[games_info_df['game_id'] == each]
    index = game_data.index[0]
    link = game_data.iloc[0,5]
    index_error_list.append(index)
    print(each,'\t', index, '\t', link)

2869923 	 407 	 https://drenindoku.itch.io/deep-snow-delivery
1370318 	 974 	 https://616games.itch.io/midnight-tales-vol1
2384541 	 431 	 https://endysis.itch.io/winterknight
3079599 	 50 	 https://capybaraforce.itch.io/meet-santa
877352 	 1190 	 https://figglewatts.itch.io/vignettes
589627 	 1389 	 https://bilgetbd.itch.io/gun-night
129425 	 1193 	 https://jasongodbey.itch.io/the-search
1208403 	 854 	 https://zahranworrell.itch.io/necrow-co
65181 	 1194 	 https://letigame.itch.io/up-until-the-end
1948914 	 478 	 https://diabolodev.itch.io/solitaire-battle
1559343 	 1195 	 https://carsonk.itch.io/crunky
1881272 	 479 	 https://thymos81.itch.io/iframe-frameborder0-srchttpsitchioembed1881272bg-colorbdf9fcampf
1511140 	 872 	 https://act-novel.itch.io/reality-layer-zero
1581512 	 1211 	 https://mhze.itch.io/the-true-ingredients
1975309 	 122 	 https://bbboxxx.itch.io/mad-room-no3
1365045 	 486 	 https://liquidream.itch.io/undune2
1109093 	 879 	 https://hakuenstudio.itch.io/hakuen-studi

In [188]:
"""
    Follow the order:
    [game_id, status, rating, rating_count, tags, sesh_time, platforms]
"""

retrieved_info = [
    ['2869923', 'In development', 'N/A', 'N/A','death-stranding, deliveries, Driving, Exploration, job, job-system, Post-apocalyptic, Tanks, upgrades, Working Simulator', 'N/A', 'Windows'],
    ['1370318','Released', 'N/A', 'N/A', '	3D, Creepy, First-Person, Gore, Horror, PSX (PlayStation), Psychological Horror, Retro, Unity', 'N/A', 'Windows'],
    ['2384541','In development', 'N/A', 'N/A', 'Atmospheric, Dark Fantasy, Fairy Tale, Fantasy, Ghosts, Gothic, Medieval, Mystery, Point & Click', 'A few minutes', 'N/A'],
    ['3079599','Released', '4.8', '67', '3D, Christmas, Comedy, Creepy, Dark, Dark Humor, Horror, Pixel Art, PSX (PlayStation)', 'A few minutes', 'Windows'],
    ['877352' ,'Released', '4.1', '27', 'Atmospheric, Dreams, Experimental, Exploration, non-eucledian, psychedelic', 'A few minutes', 'Windows, macOS, Linux'],
    ['589627','N/A','N/A','N/A','N/A','N/A','N/A'], # no additional information available
    ['129425', 'Released', '4.0', '181', 'artgame, Atmospheric, Casual, Experimental, Mystery, Photorealistic, Point & Click, Surreal, Voice Acting, Walking simulator', 'N/A', 'Windows'],
    ['1208403','N/A','N/A','N/A','N/A','N/A','N/A'], # no additional information available
    ['65181','Released', '4.8','36',"2D, Anime, Dating Sim, Multiple Endings, otoge, Otome, Ren'Py, Romance",'A few hours','Windows, macOS, Linux, Android'],
    ['1948914','In development', '4.8', '29','Anime, Arcade, Casual, Deck Building, Idle, Roguelike, Roguelite, solitaire','About a half-hour', 'Windows'],
    ['1559343', 'Released','4.7','29','Arcade, crank, Endless, jumping, Playdate','N/A','N/A'],
    ['1881272','In development', '4.0', '32','inflation, pregnant, Slime','N/A', 'Windows, macOS'],
    ['1511140','In development', '4.9', '32', '3D, debate, Horror, JRPG, Mystery, Pixel Art, Singleplayer, Thriller', 'About a half-hour', 'Windows'],
    ['1581512','Released', 'N/A', 'N/A', '3D, Funny, Horror, Short, shrek, silly, Singleplayer', 'A few seconds', 'Windows, macOS, Linux'],
    ['1975309', 'Released', 'N/A', 'N/A', '2D, Anime, Female Protagonist, Otome, Romance, Short, Singleplayer', 'A few seconds', 'HTML5, Windows, macOS'],
    ['1365045','Released', '4.8','64','8-Bit, Demake, dune2, PICO-8, Pixel Art, Real time strategy, Retro','About a half-hour', 'HTML5, Windows, macOS, Linux'],
    ['1109093','Released', '5.0','7','eli, eliaquim, hakuen-studio, rmmv, rmmz, RPG Maker, sample-project','A few seconds', 'N/A'],
    ['3223767','Released', '5.0', '2', 'N/A', 'N/A', 'Windows, macOS, Linux, Android'],
    ['1022835','Released', '4.7','137','Casual, Cute, Exploration, Mystery, Narrative, Point & Click, Short, Story Rich','About an hour', 'HTML5, Windows, macOS, Linux'],
    ['749912','Released', '5.0','20','2D, 8bits, mistery, MSX, Pixel Art, Retro, Singleplayer, ZX Spectrum','About a half-hour','Windows, macOS, Linux, Android'],
    ['857480','Released', '4.9','16','Exploration, model, Sandbox','A few minutes', 'HTML5'],
    ['117955','N/A','N/A','N/A','N/A','N/A','N/A'], # no additional information available
    ['1029510','Released', '4.3','56','Arcade, First-Person, FPS, Sci-fi, Short, Unity, Voxel','N/A','HTML5, Windows, Android'],
    ['583081','Released', '4.5','213','Atmospheric, Colorful, Cozy, First-Person, nature, photography, Procedural Generation, Relaxing, Short, Walking simulator','About a half-hour','Windows, macOS, Linux'],
    ['1534262','Released', 'N/A', 'N/A', '3D, Black and White, footage, Horror, PSX (PlayStation), Psychological Horror, Retro, Singleplayer, vhs', 'N/A', 'Windows'],
    ['2362775','Released', '4.6','61','Atmospheric, Christmas, Creepy, Dark, Horror, Indie, Low-poly, PSX (PlayStation), Retro, weird','N/A','Windows'],
    ['1522359','In development', '4.6', '23', "3D, analog, backrooms, Creepy, Five Nights at Freddy's, Horror, liminal, ps1, scp, Singleplayer", 'Days or more', 'HTML5, Windows'],
    ['329428','Released', '3.9','38','Co-op, Crafting, Massively multiplayer, Post-apocalyptic, Real-Time, Team-Based, Top-Down, upgrades','About a half-hour', 'HTML5'],
    ['2008749','In development', '4.7','174','N/A','N/A', 'HTML5']    
]

retrieved_info = np.array(retrieved_info)
retrieved_info_df = pd.DataFrame(retrieved_info)
retrieved_info_df.columns = ['game_id', 'status','rating','rating_count','tags','ave_session_time','platforms']
retrieved_info_df

Unnamed: 0,game_id,status,rating,rating_count,tags,ave_session_time,platforms
0,2869923,In development,,,"death-stranding, deliveries, Driving, Explorat...",,Windows
1,1370318,Released,,,"\t3D, Creepy, First-Person, Gore, Horror, PSX ...",,Windows
2,2384541,In development,,,"Atmospheric, Dark Fantasy, Fairy Tale, Fantasy...",A few minutes,
3,3079599,Released,4.8,67.0,"3D, Christmas, Comedy, Creepy, Dark, Dark Humo...",A few minutes,Windows
4,877352,Released,4.1,27.0,"Atmospheric, Dreams, Experimental, Exploration...",A few minutes,"Windows, macOS, Linux"
5,589627,,,,,,
6,129425,Released,4.0,181.0,"artgame, Atmospheric, Casual, Experimental, My...",,Windows
7,1208403,,,,,,
8,65181,Released,4.8,36.0,"2D, Anime, Dating Sim, Multiple Endings, otoge...",A few hours,"Windows, macOS, Linux, Android"
9,1948914,In development,4.8,29.0,"Anime, Arcade, Casual, Deck Building, Idle, Ro...",About a half-hour,Windows


In [210]:
# drop the error index list
more_info_df.drop(index_error_list, inplace=True)
more_info_df = pd.concat([more_info_df, retrieved_info_df])
more_info_df

Unnamed: 0.1,Unnamed: 0,game_id,status,rating,rating_count,tags,ave_session_time,platforms
0,0.0,2955066,Released,4.6,1178.0,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,1.0,3224595,Released,4.5,35.0,"3D, Atmospheric, Horror, PSX (PlayStation), Ps...",,"Windows, macOS, Linux"
2,2.0,3216520,Released,4.1,92.0,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
3,3.0,3148668,Released,4.3,280.0,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
4,4.0,2513640,Released,4.8,301.0,"Atmospheric, Horror, Low-poly, PSX (PlayStatio...",About an hour,Windows
...,...,...,...,...,...,...,...,...
24,,1534262,Released,,,"3D, Black and White, footage, Horror, PSX (Pla...",,Windows
25,,2362775,Released,4.6,61,"Atmospheric, Christmas, Creepy, Dark, Horror, ...",,Windows
26,,1522359,In development,4.6,23,"3D, analog, backrooms, Creepy, Five Nights at ...",Days or more,"HTML5, Windows"
27,,329428,Released,3.9,38,"Co-op, Crafting, Massively multiplayer, Post-a...",About a half-hour,HTML5


In [211]:
# checkpoint
more_info_df.drop('Unnamed: 0', axis=1, inplace=True)
more_info_df.to_csv('Games List 5 More Feat.csv')

### Merging the Data into 1 DataFrame

In [212]:
# load games info checkpoint if needed
games_info_df = pd.read_csv("Games List 5 Feat.csv")
if 'Unnamed: 0' in games_info_df.columns:
    games_info_df.drop('Unnamed: 0', axis=1, inplace=True)
games_info_df.head()

Unnamed: 0,game_id,title,genre,author,text,link
0,2955066,Incredibox - Sprunki,,wolf_hal,,https://wolf-hal.itch.io/incredibox-sprunki
1,3224595,Little Bartmares,Adventure,David Mills,WHY YOU LITTLE!!!,https://shadowband.itch.io/little-bartmares
2,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...,https://infinity-entertainment.itch.io/the-apa...
3,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m...",https://dreadloom-studios.itch.io/pretend-its-...
4,2513640,Ignited Entry,Adventure,JordiBoi,The corpse is alive.,https://jordiboi.itch.io/ignited-entry


In [213]:
# load games info checkpoint if needed
more_info_df = pd.read_csv("Games List 5 More Feat.csv")
if 'Unnamed: 0' in more_info_df.columns:
    more_info_df.drop('Unnamed: 0', axis=1, inplace=True)
more_info_df.head()

Unnamed: 0,game_id,status,rating,rating_count,tags,ave_session_time,platforms
0,2955066,Released,4.6,1178.0,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,3224595,Released,4.5,35.0,"3D, Atmospheric, Horror, PSX (PlayStation), Ps...",,"Windows, macOS, Linux"
2,3216520,Released,4.1,92.0,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
3,3148668,Released,4.3,280.0,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
4,2513640,Released,4.8,301.0,"Atmospheric, Horror, Low-poly, PSX (PlayStatio...",About an hour,Windows


In [233]:
games_df = games_info_df.merge(more_info_df, on='game_id', how='outer')
games_df.head()

Unnamed: 0,game_id,title,genre,author,text,link,status,rating,rating_count,tags,ave_session_time,platforms
0,397,F J O R D S,Adventure,KYLE REIMERGARTIN,▲ ▲ ▲ ▲ RUN ERRANDS AND MAKE DECISIONS!!,https://mooonmagic.itch.io/fjords,Released,4.7,100.0,"computer, door, ferry, Magic, Mountains, pizza...",,"Windows, macOS"
1,463,starseed pilgrim,,droqen,a pleasant mystery box. a soft puzzle game.,https://droqen.itch.io/starseedpilgrim,Released,4.3,90.0,"Exploration, Gardening, symphonic",About a half-hour,"Windows, macOS"
2,660,GUN GODZ,Shooter,Vlambeer,GUN GODZ is a first person shooter about gangs...,https://vlambeer.itch.io/gun-godz,Released,4.5,193.0,"gangster, hiphop, rap, vlambeer, yung-venuz",,"Windows, macOS"
3,673,Dreaming Sarah,Adventure,Asteristic Game Studio,A surreal adventure platformer.,https://asteristic.itch.io/dreamingsarah,Released,4.3,118.0,"alpha, Creepy, Horror",About a half-hour,Windows
4,796,Guppy,Simulation,Christiaan Moleman,2D watercolor fish simulation,https://ninjadodo.itch.io/guppy,Released,3.9,96.0,"Animals, Atmospheric, Colorful, Experimental, ...",A few minutes,"Windows, macOS, Linux, Android"


In [234]:
# data might be duplicated
print("Games Info\t", len(games_info_df))
print("More Info\t", len(more_info_df))
print("Merged\t\t", len(games_df))

Games Info	 1500
More Info	 1500
Merged		 1542


In [235]:
games_df.isnull().sum()

game_id               0
title                 0
genre               128
author                0
text                110
link                  0
status                3
rating                9
rating_count          9
tags                 53
ave_session_time    685
platforms           120
dtype: int64

In [236]:
na_count = (games_df == "N/A").sum()
na_count

game_id             0
title               0
genre               0
author              0
text                0
link                0
status              0
rating              0
rating_count        0
tags                0
ave_session_time    0
platforms           0
dtype: int64

### Exporting the DataFrame to CSV

In [237]:
games_df.to_csv("Homework 1 - itch.io Game List.csv",index=False)