In [1]:
import os
import json
from bs4 import BeautifulSoup
import pandas as pd
from joblib import Parallel, delayed
import numpy as np

# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [2]:
def get_soup(file_path: str):
    """Parse an HTML file from disk into a BeautifulSoup tree.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The document parsed by ``BeautifulSoup`` with the ``html.parser``
        backend.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly: without ``encoding`` Python uses the platform's locale
    # encoding, so the same page could parse differently (or fail to decode)
    # depending on the machine running the notebook.
    # NOTE(review): assumes the saved pages are UTF-8 — confirm with the scraper.
    with open(file_path, 'r', encoding='utf-8') as f:
        return BeautifulSoup(f, 'html.parser')


In [3]:
def _parse_int(text: str) -> int:
    """Convert a displayed integer such as ' 1,234 ' to an int."""
    return int(text.strip().replace(',', ''))


def _parse_file_size_mb(text: str) -> float:
    """Convert a displayed file size such as '512.000 KB' to megabytes."""
    # NOTE(review): assumes binary (1024-based) units — confirm against the site.
    unit_to_mb = {'B': 1 / (1024 * 1024), 'KB': 1 / 1024, 'MB': 1, 'GB': 1024}
    parts = text.replace(',', '').split()
    if not parts:
        raise ValueError('file size text is empty')
    # A bare number is taken to be megabytes, matching the field name.
    unit = parts[1].upper() if len(parts) > 1 else 'MB'
    if unit not in unit_to_mb:
        raise ValueError(f'unrecognised file size unit in {text!r}')
    return float(parts[0]) * unit_to_mb[unit]


def _add_section_tab_counts(item: dict, item_page_soup) -> None:
    """Overwrite the discussion/comment counts from the page's section tabs."""
    for section_tab in item_page_soup.select('.sectionTab'):
        tab_text = section_tab.text.strip()
        for count_name in ('Discussions', 'Comments'):
            if tab_text.startswith(count_name):
                remainder = tab_text[len(count_name):].strip()
                # A tab label without a number is treated as a zero count.
                item[f'{count_name.lower()}_count'] = (
                    _parse_int(remainder) if remainder else 0)


def _add_file_stats(item: dict, item_page_soup) -> None:
    """Store the file size and the posted/updated dates from the details panel."""
    item['file_size_mb'] = 0

    # The values are identified purely by position: size, posted, updated.
    stat_texts = [el.text for el in item_page_soup.select('.detailsStatRight')]
    if len(stat_texts) > 0:
        item['file_size_mb'] = _parse_file_size_mb(stat_texts[0])
    if len(stat_texts) > 1:
        item['posted_date'] = stat_texts[1]
    if len(stat_texts) > 2:
        item['updated_date'] = stat_texts[2]


def _add_stats_table_counts(item: dict, item_page_soup) -> None:
    """Store visitor and favorite counts read from the statistics table."""
    stats_table = item_page_soup.select_one('.stats_table')
    if stats_table is None:
        return

    for tr in stats_table.find_all('tr'):
        # Cells are joined in reverse order so the row reads "<label> <value>".
        tr_text = ' '.join(
            td.text.lower() for td in reversed(tr.find_all('td'))).strip()

        if tr_text.startswith('unique visitors'):
            item['visitors_count'] = _parse_int(
                tr_text[len('unique visitors '):])
        if tr_text.startswith('current favorites'):
            item['favorites_count'] = _parse_int(
                tr_text[len('current favorites '):])


def _add_workshop_tags(item: dict, item_page_soup) -> None:
    """Store each tag group as a list of lower-cased tag names."""
    for workshop_tag in item_page_soup.select('.workshopTags'):
        tag_key = workshop_tag.select_one(
            '.workshopTagsTitle').text.strip().lower().replace(' ', '_')
        # Drop only a trailing colon; never chop a real character off the key.
        if tag_key.endswith(':'):
            tag_key = tag_key[:-1]
        item[tag_key] = [a.text.strip().lower()
                         for a in workshop_tag.find_all('a', recursive=False)]


def get_item_data(item_page_soup: BeautifulSoup):
    """Extract the fields of one workshop item from its parsed HTML page.

    Args:
        item_page_soup: Parsed item page.

    Returns:
        A dict with, in insertion order: ``id``, ``name``,
        ``discussions_count``, ``comments_count``, ``submitter``,
        ``movie_count``, ``screenshot_count``, ``awards_count``,
        ``link_count``, ``file_size_mb``, ``change_notes``,
        ``certified_compatible`` (0 or 1), ``creator_count``,
        ``featured_collection_count``, one list-valued key per tag group and
        ``description``. ``posted_date``, ``updated_date``, ``visitors_count``
        and ``favorites_count`` are only present when the page provides them.

    Raises:
        AttributeError, IndexError: If the id input, the title or the
            breadcrumb links are missing from the page.
        ValueError: If a numeric field cannot be parsed.
    """
    item = {}

    item['id'] = item_page_soup.select_one(
        'input[type=hidden][name=id]').attrs['value']
    item['name'] = item_page_soup.select_one('.workshopItemTitle').text
    item['discussions_count'] = 0
    item['comments_count'] = 0

    # The last breadcrumb link reads "<submitter>'s Workshop".
    item['submitter'] = item_page_soup.select_one(
        '.breadcrumbs').select('a')[-1].text.replace("'s Workshop", '')

    _add_section_tab_counts(item, item_page_soup)

    item['movie_count'] = len(item_page_soup.select('.highlight_strip_movie'))
    item['screenshot_count'] = len(
        item_page_soup.select('.highlight_strip_screenshot'))

    item['awards_count'] = sum(
        int(award.attrs['data-reactioncount'])
        for award in item_page_soup.select('.review_award'))

    item['link_count'] = len(item_page_soup.select('.general_btn.panel_btn'))

    _add_file_stats(item, item_page_soup)

    num_change_notes_element = item_page_soup.select_one(
        '.detailsStatNumChangeNotes')
    change_note_tokens = (num_change_notes_element.text.split()
                          if num_change_notes_element is not None else [])
    item['change_notes'] = (
        _parse_int(change_note_tokens[0]) if change_note_tokens else 0)

    item['certified_compatible'] = int(any(
        t.text.strip().upper() == 'CERTIFIED COMPATIBLE'
        for t in item_page_soup.select('.title')))

    creators_block = item_page_soup.select_one('.creatorsBlock')
    item['creator_count'] = (
        len(creators_block.find_all(recursive=False))
        if creators_block is not None else 0)

    parent_collections_num_others_element = item_page_soup.select_one(
        '.parentCollectionsNumOthers')

    if parent_collections_num_others_element is None:
        item['featured_collection_count'] = 0
    elif parent_collections_num_others_element.a.text == 'collection':
        item['featured_collection_count'] = 1
    else:
        item['featured_collection_count'] = _parse_int(
            parent_collections_num_others_element.a.text.split(' ')[0])

    _add_stats_table_counts(item, item_page_soup)
    _add_workshop_tags(item, item_page_soup)

    description_element = item_page_soup.select_one('#highlightContent')
    item['description'] = (
        description_element.getText() if description_element is not None else '')

    return item


In [4]:
def create_item(file_path: str):
    """Build the item record for one saved page.

    Args:
        file_path: Path of the HTML file to process.

    Returns:
        Whatever ``get_item_data`` returns for the soup that ``get_soup``
        builds from ``file_path``.
    """
    return get_item_data(get_soup(file_path))


In [5]:
def create_folder_json(folder_name: str):
    """Parse every page in a folder and write the records to ``<folder_name>.json``.

    Files are processed in parallel on all available cores. The output list
    follows the sorted order of the file paths.

    Args:
        folder_name: Directory containing the saved item pages. The JSON file
            is written next to it, named after the folder.
    """
    # ``os.listdir`` returns entries in arbitrary order and also lists
    # sub-directories; sort for a reproducible output and keep regular files
    # only so directory entries are never handed to the parser.
    candidate_paths = (os.path.join(folder_name, file_name)
                       for file_name in os.listdir(folder_name))
    file_paths = sorted(path for path in candidate_paths
                        if os.path.isfile(path))

    items = Parallel(n_jobs=-1)(
        delayed(create_item)(file_path) for file_path in file_paths)

    with open(f'{folder_name}.json', 'w') as f:
        json.dump(items, f)


In [6]:
# Parse every page under 'accepted/' and write the records to 'accepted.json'.
create_folder_json('accepted')


In [7]:
# Parse every page under 'not_accepted/' and write the records to 'not_accepted.json'.
create_folder_json('not_accepted')
