In [9]:
import re, os
import requests 
import subprocess
import time

import pandas as pd

from bs4 import BeautifulSoup
import chromedriver_autoinstaller
from selenium import webdriver

In [None]:
import logging

logging.basicConfig(level=logging.INFO, 
                    filename='./log/app.log',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S')

In [6]:

def write_df(dataset:list[dict]) -> pd.DataFrame: 
    df = pd.DataFrame(dataset)
    return df

def to_csv(df:pd.DataFrame, dir): 
    if '.csv' in dir: 
        df.to_csv(dir)
    else: 
        raise "Invalid format"
    
def write_log(dir, content): 
    with open(dir, 'a') as f: 
        f.writelines(content + '\n')

In [7]:
# setup chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # ensure GUI is off
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')


# set path to chromedriver as per your configuration
chromedriver_autoinstaller.install()


headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} 
BASE_URL = "https://www.instant-gaming.com/en/"

In [8]:
def get_game_url(start:int=1, end:int=141): 
    game_urls = []

    for page in range(start, end): 
        # This is the trending page of instant-gaming
        URL = f"https://www.instant-gaming.com/en/search/?gametype=trending&version=2&page={page}" 
        resource = requests.get(url=URL,
                    headers=headers) 
        soup = BeautifulSoup(resource.content, 'html5lib')

        for a in soup.find_all('a', href=True):
            game_pattern = f"^{BASE_URL}+[0-9]"
            is_game_url = re.search(pattern=game_pattern, string=a['href'])
            if isinstance(is_game_url, re.Match): 
                # print("Game link: ", a['href'])
                game_urls.append(a['href'])

    return game_urls

def get_page_content(url): 
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)

    # scrolling

    lastHeight = driver.execute_script("return document.body.scrollHeight")
    #print(lastHeight)

    pause = 1
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
            break
        lastHeight = newHeight
        #print(lastHeight)

    # ---
    html = driver.page_source    
    bs4_obj = BeautifulSoup(html, 'html5lib')
    return bs4_obj


def get_game_card(url, count:int=0):
    try:
        soup = get_page_content(url)
        game_name = soup.find('h1', 'game-title').text
        img_url = soup.find('img', attrs={'alt': game_name})['src']
        description = soup.find('span', attrs={'itemprop': 'description'}).text
        categories = []
        for cat in soup.find_all('a', attrs={'itemprop': 'applicationSubCategory'}):
            if not cat['content'] == 'Publishers':
                categories.append(cat.text)

        game_card = {'name': game_name,
                    'description': description,
                    'img_url': img_url,
                     'categories': categories}
        log_data = f'Processing {url}, No. {count}'
        write_log(dir='./data/log.txt', content=log_data)
        return game_card
    
    except:
        log_data = f'Fail to prorcess {url}'
        write_log(dir='./data/log.txt', content=log_data)
        return None

In [11]:
try:
    data_dir = './data'

    if not os.path.isdir(data_dir): 
        os.mkdir(data_dir)
    write_log('./data/log.txt', "Start crawling")

    # Crawl its trending games
    start = 1
    end = 140
    game_urls = get_game_url(start=start, 
                            end=end)

    game_cards = []
    count = 0
    for url in game_urls: 
        card = get_game_card(url, 
                            count=count)
        if isinstance(card, dict): 
            game_cards.append(card)
            count += 1

    dataset_name = 'instant_gaming.csv'
    dataset_dir = f'{data_dir}/{dataset_name}'
    # Turns crawled data into .csv 
    df = write_df(game_cards)
    to_csv(df, dir=dataset_dir)
    print('Done')
except Exception as e: 
    print(e)
    logging.error("Exception occurred", exc_info=True)

In [5]:
import yaml 
def yaml_read(filename): 
    if '.yaml' not in filename: 
        raise "Invalid format"
    with open(filename) as f:
        my_dict = yaml.safe_load(f)
    return my_dict


config = yaml_read(filename="./configs/config.yaml")
config["pages"]

[1, 141]

In [7]:
import logging
##loging
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename= './log/app.log',
                    filemode='w')

console = logging.StreamHandler()
console.setLevel(logging.INFO)
# add the handler to the root logger
logging.getLogger().addHandler(console)
logging.info("\nParameters:")

for i in range(10):
    logging.info(i)

logging.info("end!")


Parameters:
0
1
2
3
4
5
6
7
8
9
end!


In [4]:
import json 


data_dir = './data/samples.json'


people = [
  {
    "name": "Alice Jones",
    "age": 30,
    "email": "alice.jones@example.com"
  },
  {
    "name": "Bob Smith",
    "age": 25,
    "email": "bob.smith@company.com"
  },
  {
    "name": "Charlie Brown",
    "age": 42,
    "email": "charlie.brown@email.net"
  },
  {
    "name": "Diana Garcia",
    "age": 18,
    "email": "diana.garcia@school.edu"
  },
  {
    "name": "Ethan Lee",
    "age": 55,
    "email": "ethan.lee@workplace.org"
  },
  {
    "name": "Fiona Miller",
    "age": 22,
    "email": "fiona.miller@gmail.com"
  },
  {
    "name": "Gabriel Hernandez",
    "age": 38,
    "email": "gabriel.hernandez@hotmail.com"
  },
  {
    "name": "Hannah Williams",
    "age": 60,
    "email": "hannah.williams@yahoo.com"
  },
  {
    "name": "Isaac Johnson",
    "age": 15,
    "email": "isaac.johnson@student.ac.uk"
  },
  {
    "name": "Jasmine Lopez",
    "age": 45,
    "email": "jasmine.lopez@companywebsite.com"
  }
]


def json_write(content: dict, dir=data_dir):
    if not isinstance(content, dict): 
        raise TypeError
    json_obj = json.dumps(content, indent=3)
    with open(dir, 'a') as outfile: 
        outfile.write(json_obj)
        outfile.write('\n')

def csv_write(content, dir): 
    with open(dir, 'a') as writefile: 
        writefile.write(content)

def csv_read(dir): 
    with open(dir, 'r') as readfile: 
        content = readfile.readlines(dir)
    return content

# .CSV Version
def write_data(content: dict, dir=data_dir): 
    keys = content.keys()
    values = content.values()

    keys = '::'.join(keys)
    values = "::".join(values)
    # Check if the .csv file has columns
    sample = csv_read(dir)
    if keys in sample: 
        csv_write(content=values, dir=dir)
    else: 
        csv_write(content=keys, dir=dir)
        csv_write(content=values, dir=dir)
            

for person in people: 
    write_data(person, dir="./data/samples.csv")

TypeError: sequence item 1: expected str instance, int found

In [30]:
products = [
  {
    "name": "Coffee Maker",
    "description": "Brews a delicious cup of coffee in minutes. Easy to use and clean.",
    "img_url": "https://example.com/coffee_maker.jpg",
    "categories": ["Appliances", "Kitchen"]
  },
  {
    "name": "Hiking Backpack",
    "description": "Durable and comfortable backpack for all your outdoor adventures. Features multiple compartments and adjustable straps.",
    "img_url": "https://example.com/hiking_backpack.jpg",
    "categories": ["Sports & Outdoors", "Travel"]
  },
  {
    "name": "Wireless Headphones",
    "description": "Enjoy high-quality sound with these wireless headphones. Noise-canceling technology for a distraction-free listening experience.",
    "img_url": "https://example.com/wireless_headphones.jpg",
    "categories": ["Electronics", "Audio"]
  },
  {
    "name": "Fitness Tracker",
    "description": "Tracks your steps, heart rate, and sleep patterns. Helps you stay motivated and reach your fitness goals.",
    "img_url": "https://example.com/fitness_tracker.jpg",
    "categories": ["Health & Wellness", "Electronics"]
  },
  {
    "name": "Novel",
    "description": "A captivating story that will transport you to another world. Perfect for book lovers of all ages.",
    "img_url": "https://example.com/novel.jpg",
    "categories": ["Books", "Fiction"]
  },
  {
    "name": "Cozy Blanket",
    "description": "Made from super soft fleece, this blanket will keep you warm and comfortable all year round.",
    "img_url": "https://example.com/cozy_blanket.jpg",
    "categories": ["Home & Living", "Bedding"]
  },
  {
    "name": "Mechanical Keyboard",
    "description": "Offers a satisfying typing experience with a classic clicky sound. Perfect for gamers and writers alike.",
    "img_url": "https://example.com/mechanical_keyboard.jpg",
    "categories": ["Electronics", "Computers"]
  },
  {
    "name": "Sunglasses",
    "description": "Protects your eyes from the sun while looking stylish. Available in a variety of colors and styles.",
    "img_url": "https://example.com/sunglasses.jpg",
    "categories": ["Fashion", "Accessories"]
  },
  {
    "name": "T-Shirt",
    "description": "Made from high-quality cotton for a comfortable fit. Perfect for everyday wear or a casual outing.",
    "img_url": "https://example.com/t_shirt.jpg",
    "categories": ["Fashion", "Clothing"]
  },
  {
    "name": "Laptop Stand",
    "description": "Elevates your laptop for improved ergonomics and better posture. Helps to keep your laptop cool.",
    "img_url": "https://example.com/laptop_stand.jpg",
    "categories": ["Electronics", "Computers"]
  }
]
from pydantic import BaseModel, Field

class GameCard(BaseModel): 
    name: str
    description: str
    img_url: str
    categories: list[str]

    @staticmethod
    def to_str(content:dict) -> str: 
        game_card = GameCard(**content)
        categories = ','.join(game_card.categories)
        result = "::".join([game_card.name, game_card.description, 
                            game_card.img_url, categories])
        return result

    
for product in products: 
    product_inf = GameCard.to_str(product)
    print(product_inf)
    # product_inf = product_inf.squeeze()

Coffee Maker::Brews a delicious cup of coffee in minutes. Easy to use and clean.::https://example.com/coffee_maker.jpg::Appliances,Kitchen
Hiking Backpack::Durable and comfortable backpack for all your outdoor adventures. Features multiple compartments and adjustable straps.::https://example.com/hiking_backpack.jpg::Sports & Outdoors,Travel
Wireless Headphones::Enjoy high-quality sound with these wireless headphones. Noise-canceling technology for a distraction-free listening experience.::https://example.com/wireless_headphones.jpg::Electronics,Audio
Fitness Tracker::Tracks your steps, heart rate, and sleep patterns. Helps you stay motivated and reach your fitness goals.::https://example.com/fitness_tracker.jpg::Health & Wellness,Electronics
Novel::A captivating story that will transport you to another world. Perfect for book lovers of all ages.::https://example.com/novel.jpg::Books,Fiction
Cozy Blanket::Made from super soft fleece, this blanket will keep you warm and comfortable all y

In [36]:
import requests, os

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"} 

def dir_exist(dir, create:bool=True) -> bool: 
    existence = os.path.exists(dir)
    if existence ==False and create == True: 
        os.mkdir(dir)
        existence = True
    return existence

def download_image(image_name, image_url, headers=headers, save_dir:str='./data/imgs'): 
    try: 
        img_data = requests.get(url=image_url, headers=headers).content
        image_name = '-'.join(image_name.strip().split())
        if dir_exist(save_dir, create=True): 
            image_name = image_name + ".png"
            img_dir = os.path.join(save_dir, image_name)
            with open(img_dir, 'wb') as img_handler: 
                img_handler.write(img_data)

    except Exception as e: 
        raise e

download_image(image_name="Dragon's Dogma 2 Xbox Series X|S", 
               image_url="https://gaming-cdn.com/images/products/15666/616x353/dragon-s-dogma-2-xbox-series-x-s-xbox-series-x-s-game-microsoft-store-cover.jpg?v=1704801871")

In [None]:

        # dataset_name = 'instant_gaming.csv'
        # dataset_dir = f'{data_dir}/{dataset_name}'

        # Turns crawled data into .csv 
        # df = write_df(game_cards)
        # to_csv(df, dir=dataset_dir)


        # subprocess.run(['bash', 'commit.sh'], text=True)

In [4]:
import os 


NameError: name '__file__' is not defined