In [1]:
import requests
from bs4 import BeautifulSoup
from rich import print
import pandas as pd
from rich.progress import track
import re
import numpy as np

In [2]:
URL = 'https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,desc'

In [3]:
def request_url(url:str)->BeautifulSoup:
    """
    Takes website URL and return soup object
    """
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html5lib")
    return soup

In [4]:
top_games_page = request_url(URL)

In [5]:
def get_data(soap:BeautifulSoup=None)->list:
    """
Parameters
----------
soap : a bs4.BeautifulSoup

Returns
----------
List
    """
    lst = list()
    games = soap.find('div', class_='lister-list').find_all('div', class_='lister-item-content')
    for game in games:
        dic = dict()
        try:
            dic['Title'] = game.find('a').text.strip()
        except AttributeError:
            dic['Title'] = np.nan
        try:
            dic['Year'] = int(re.findall('\d+', game.find('h3').find('span', class_='lister-item-year text-muted unbold').text)[0])
        except (AttributeError, IndexError) as e:
            dic['Year'] = np.nan
        try:
            dic['Genre'] = game.find('span', class_='genre').text.strip().split(', ')
        except AttributeError:
            dic['Genre'] = np.nan
        try:
            dic['Rating'] = float(game.find('div', class_='ratings-bar').find('strong').text)
        except AttributeError:
            dic['Rating'] = np.nan  
        try:
            dic['Director'] = game.find_all('p')[2].a.text
        except AttributeError:
            dic['Director'] = np.nan
        try:
            dic['Votes'] = int(game.find('span', attrs={'name':'nv'}).text.strip().replace(',', ''))
        except AttributeError:
            dic['Votes'] = np.nan
        lst.append(dic)
    return lst
# .Book.str.split(",")).explode('Book')

In [6]:
URL + top_games_page.find('div', class_='desc').find('a', class_='lister-page-next next-page')['href']

'https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,desc/search/title/?title_type=video_game&sort=user_rating,desc&start=51&ref_=adv_nxt'

In [7]:
df_list = list()
for start in track(range(1, 9991, 50)):
    soap_ = request_url(f'https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,desc&start={start}&ref_=adv_nxt')
    df_list.append(get_data(soap_))
# x.assign(Genre=x.Genre).explode('Genre')
# type(x.Genre[0])

Output()

In [8]:
df = pd.DataFrame([r for d in df_list for r in d])
df

Unnamed: 0,Title,Year,Genre,Rating,Director,Votes
0,Kingdom Hearts II,2005.0,"[Action, Adventure, Fantasy]",9.3,Tetsuya Nomura,6914
1,Zeruda no densetsu: Kamigami no toraifôsu,1991.0,"[Action, Adventure, Fantasy]",9.3,Takashi Tezuka,3149
2,The Secret of Monkey Island,1990.0,"[Adventure, Comedy, Fantasy]",9.3,Ron Gilbert,2982
3,Super Mario World,1990.0,"[Action, Adventure, Family]",9.3,Takashi Tezuka,4480
4,Spider-Man,2018.0,"[Action, Adventure, Comedy]",9.2,Ryan Smith,16367
...,...,...,...,...,...,...
9944,Ninja Combat,1990.0,"[Action, Fantasy]",6.2,Tsutomo Maruyama,19
9945,Ghostbusters,1986.0,"[Action, Fantasy, Horror]",6.2,David Crane,377
9946,Dracula,1993.0,"[Action, Fantasy, Horror]",6.2,Mike Simpson,31
9947,Might and Magic IX,2002.0,,6.2,Dan Woren,38


In [12]:
df.to_csv('data/imbd_games.csv')

In [None]:
df.assign(Genre=df.Genre).explode('Genre').reset_index(drop=True)