In [9]:
import re, os
import requests 
import subprocess
import time

import pandas as pd

from bs4 import BeautifulSoup
import chromedriver_autoinstaller
from selenium import webdriver

In [None]:
import logging

# Root-logger setup: INFO and above are written to ./log/app.log, each record
# prefixed with a "dd-Mon-yy HH:MM:SS" timestamp and its level name.
logging.basicConfig(
    filename='./log/app.log',
    level=logging.INFO,
    datefmt='%d-%b-%y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

In [6]:

def write_df(dataset:list[dict]) -> pd.DataFrame:
    """Build a DataFrame from a list of record dicts.

    Each dict becomes one row; the dict keys become the columns.
    """
    return pd.DataFrame(dataset)

def to_csv(df:pd.DataFrame, dir):
    """Write ``df`` to the CSV file at ``dir``.

    Args:
        df: DataFrame to serialise (written with ``DataFrame.to_csv`` defaults,
            so the index is included).
        dir: Destination path; must end with ``.csv`` (case-insensitive).

    Raises:
        ValueError: If ``dir`` does not end with ``.csv``.
    """
    # Check the suffix rather than "'.csv' in dir", which would also accept
    # paths such as "data.csv.bak".
    if not str(dir).lower().endswith('.csv'):
        # Raising a plain string is itself a TypeError in Python 3, so raise a
        # real exception that carries the message.
        raise ValueError(f"Invalid format: expected a '.csv' path, got {dir!r}")
    df.to_csv(dir)
    
def write_log(dir, content):
    """Append ``content`` followed by a newline to the text file at ``dir``."""
    line = content + '\n'
    with open(dir, 'a') as log_file:
        log_file.write(line)

In [7]:
# Chrome options shared by every webdriver.Chrome(...) instance created below.
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    '--headless',               # ensure GUI is off
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    chrome_options.add_argument(_chrome_flag)


# Let chromedriver_autoinstaller provide the driver binary instead of relying
# on a hand-configured chromedriver path.
chromedriver_autoinstaller.install()


# Request headers for the plain `requests` calls, and the site prefix used to
# recognise game links.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
BASE_URL = "https://www.instant-gaming.com/en/"

In [8]:
def get_game_url(start:int=1, end:int=141, timeout:float=30):
    """Collect game-page links from the instant-gaming trending listing.

    Args:
        start: First listing page to fetch (inclusive).
        end: Page number to stop at (exclusive, as in ``range(start, end)``).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list[str]: Every ``href`` on the fetched pages that starts with
        ``BASE_URL`` followed by a digit, in page order. Duplicates are kept.

    Raises:
        requests.exceptions.RequestException: If a listing page cannot be
            fetched (including when ``timeout`` elapses).
    """
    # Compile once instead of rebuilding the pattern for every anchor.
    # re.escape makes the dots in the host name match literally; "/+" keeps
    # the original tolerance for repeated slashes before the numeric game id.
    game_pattern = re.compile(re.escape(BASE_URL.rstrip('/')) + r"/+[0-9]")

    game_urls = []
    for page in range(start, end):
        # This is the trending page of instant-gaming
        URL = f"https://www.instant-gaming.com/en/search/?gametype=trending&version=2&page={page}"
        # Without a timeout a stalled connection would block the crawl forever.
        resource = requests.get(url=URL,
                                headers=headers,
                                timeout=timeout)
        soup = BeautifulSoup(resource.content, 'html5lib')

        game_urls.extend(
            a['href']
            for a in soup.find_all('a', href=True)
            if game_pattern.match(a['href'])
        )

    return game_urls

def get_page_content(url, pause:float=1):
    """Render ``url`` in headless Chrome, scroll to the bottom, and parse it.

    The page is scrolled repeatedly until ``document.body.scrollHeight`` stops
    growing, so content appended on scroll is present in the returned markup.

    Args:
        url: Page to load.
        pause: Seconds to wait after each scroll before re-measuring the page
            height.

    Returns:
        BeautifulSoup: The final page source parsed with ``html5lib``.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)

        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Height unchanged after a scroll: nothing more was appended.
                break
            last_height = new_height

        html = driver.page_source
        return BeautifulSoup(html, 'html5lib')
    finally:
        # Always shut the browser down; otherwise each call leaves a Chrome
        # process running for the lifetime of the kernel.
        driver.quit()


def get_game_card(url, count:int=0):
    """Scrape one game page into a dict, logging the outcome to ./data/log.txt.

    Args:
        url: Game page URL to scrape.
        count: Running number written to the log line for this page.

    Returns:
        dict | None: ``{'name', 'description', 'img_url', 'categories'}`` on
        success, or ``None`` if the page could not be fetched or parsed.
    """
    try:
        soup = get_page_content(url)
        game_name = soup.find('h1', 'game-title').text
        # The cover image is located through its alt text, which equals the title.
        img_url = soup.find('img', attrs={'alt': game_name})['src']
        description = soup.find('span', attrs={'itemprop': 'description'}).text
        # .get() so an anchor without a "content" attribute does not discard
        # the whole card with a KeyError.
        categories = [
            cat.text
            for cat in soup.find_all('a', attrs={'itemprop': 'applicationSubCategory'})
            if cat.get('content') != 'Publishers'
        ]

        game_card = {'name': game_name,
                     'description': description,
                     'img_url': img_url,
                     'categories': categories}
        log_data = f'Processing {url}, No. {count}'
        write_log(dir='./data/log.txt', content=log_data)
        return game_card

    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C / kernel interrupt
        # still stops the crawl, and record why this page failed.
        log_data = f'Fail to process {url}: {exc!r}'
        write_log(dir='./data/log.txt', content=log_data)
        return None

In [11]:
# End-to-end crawl: collect trending game URLs, scrape each page into a card,
# and save the successfully scraped cards as a CSV under ./data.
try:
    data_dir = './data'
    if not os.path.isdir(data_dir):
        os.mkdir(data_dir)
    write_log('./data/log.txt', "Start crawling")

    # Listing pages to crawl; `end` is exclusive inside get_game_url.
    start, end = 1, 140
    game_urls = get_game_url(start=start, end=end)

    game_cards = []
    count = 0  # number of cards scraped successfully so far
    for url in game_urls:
        card = get_game_card(url, count=count)
        if not isinstance(card, dict):
            # Scrape failed (already logged by get_game_card); skip this URL.
            continue
        game_cards.append(card)
        count += 1

    # Turn the crawled data into a .csv file
    dataset_name = 'instant_gaming.csv'
    dataset_dir = f'{data_dir}/{dataset_name}'
    df = write_df(game_cards)
    to_csv(df, dir=dataset_dir)
    print('Done')
except Exception as e:
    print(e)
    # Logs at ERROR level with the current traceback attached.
    logging.exception("Exception occurred")

In [5]:
import yaml 
def yaml_read(filename):
    """Load a YAML file and return its parsed content.

    Args:
        filename: Path to the file; must end with ``.yaml`` or ``.yml``
            (case-insensitive).

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.

    Raises:
        ValueError: If ``filename`` does not have a YAML extension.
    """
    # Check the suffix rather than "'.yaml' in filename", which would also
    # accept names such as "config.yaml.bak".
    if not str(filename).lower().endswith(('.yaml', '.yml')):
        # Raising a plain string is itself a TypeError in Python 3, so raise a
        # real exception that carries the message.
        raise ValueError(f"Invalid format: expected a '.yaml'/'.yml' file, got {filename!r}")
    with open(filename) as f:
        my_dict = yaml.safe_load(f)
    return my_dict


# Load the crawler settings and display the configured page range.
config = yaml_read("./configs/config.yaml")
config["pages"]

[1, 141]

In [6]:
import logging
## logging
# force=True removes *and closes* any handlers already attached to the root
# logger before reconfiguring, so re-running this cell neither stacks handlers
# nor leaves the previous app.log file handle open.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename= './log/app.log',
                    filemode='w',
                    force=True)

# Echo INFO-and-above records to the console in addition to the log file.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
# add the handler to the root logger
logging.getLogger().addHandler(console)
logging.info("\nParameters:")

# Smoke test: these records should appear both in the cell output and in app.log.
for i in range(10):
    logging.info(i)

logging.info("end!")


Parameters:
0
1
2
3
4
5
6
7
8
9
end!


In [None]:

        # dataset_name = 'instant_gaming.csv'
        # dataset_dir = f'{data_dir}/{dataset_name}'

        # Turns crawled data into .csv 
        # df = write_df(game_cards)
        # to_csv(df, dir=dataset_dir)


        # subprocess.run(['bash', 'commit.sh'], text=True)