In [None]:
import re
import os
import csv
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

# CSV caches written and re-read by this notebook, one per scraping stage.
file_categories, file_parts, file_colors = (
    'Categories_3.csv',
    'Parts_3.csv',
    'Colors_3.csv',
)

In [None]:
# Set to True to delete the three cached CSV files so every stage is scraped again.
force = False
if force:
    for file in [file_categories, file_parts, file_colors]:
        try:
            os.remove(file)
        except FileNotFoundError:
            # This stage has no cache yet, so there is nothing to delete.
            # Any other OSError (e.g. permissions) is left to surface.
            pass

In [None]:
# Request headers passed to every requests.get call in this notebook.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}

# Categories

In [None]:
# Resume from the cached category table when the file exists; otherwise start
# with an empty frame indexed by Cat_Num.
# os.path.isfile also works when the path contains a directory component,
# which a membership test against os.listdir() does not.
if os.path.isfile(file_categories):
    df_cat = pd.read_csv(file_categories, index_col='Cat_Num')
else:
    df_cat = pd.DataFrame(columns=['Cat_Num', 'Cat_Name', 'Parts']).set_index('Cat_Num')
print(df_cat.shape)
df_cat.tail(1)

In [None]:
# Download the catalog tree and record every category whose entry is new or
# whose listed part count differs from the cached one.
url = 'https://www.bricklink.com'
bs = BeautifulSoup(requests.get(url+'/catalogTree.asp?itemType=P&itemBrand=1000', headers=headers).text, 'lxml')

# Raw string: \d is a regex escape, not a Python string escape.
cat = re.compile(r'catString=(\d+)')

# Category numbers added or changed by this run; the parts cell re-scrapes them.
new = []

category_table = bs.find('table', {'class':'bg-color--white catalog-list__category-list--internal catalog-tree__category-list--internal'})
if category_table is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError('Category table not found in the catalog tree page; the page layout may have changed or the request was blocked.')

for link in category_table.find_all('a'):
    if 'href' not in link.attrs:
        continue
    match = cat.search(link['href'])
    if match is None:
        # Link inside the table that does not point at a category.
        continue

    category_name = link.get_text()
    category_number = int(match[1])
    # The part count is the text of the next <span> with its first and last
    # character stripped (presumably surrounding parentheses — confirm against the page).
    category_elements = int(link.next_element.find_next('span').get_text()[1:-1])

    if category_number not in df_cat.index or df_cat.loc[category_number, 'Parts'] != category_elements:
        df_cat.loc[category_number] = [category_name, category_elements]
        new.append(category_number)

df_cat.to_csv(file_categories)

# Parts

In [None]:
# Resume from the cached parts table when the file exists; otherwise start
# with an empty frame indexed by Part_Num.
# os.path.isfile also works when the path contains a directory component,
# which a membership test against os.listdir() does not.
# NOTE(review): read_csv infers the dtype of Part_Num, while part numbers
# scraped later in this notebook are always strings — confirm the two agree.
if os.path.isfile(file_parts):
    df_par = pd.read_csv(file_parts, index_col='Part_Num')
else:
    df_par = pd.DataFrame(columns=['Cat_Num', 'Cat_Name', 'Part_Num', 'Part_Name']).set_index('Part_Num')
print(df_par.shape)
df_par.tail(1)

In [None]:
# For every category that changed or is not fully cached, walk its paginated
# listing and store one row per part in df_par, then write the table to disk.
url = 'https://www.bricklink.com/catalogList.asp?v=0&pg={}&catString={}&itemBrand=1000&catType=P&v=1&viewPrint=Y'
# Raw strings keep the regex escapes (\?, \d, \n) out of Python's own string-escape handling.
num = re.compile(r'\?P=(.+)')
parts = re.compile(r'\n(\d+) Items')  # compiled here but not used by this cell

for category_number, category_name, category_elements in tqdm(df_cat.reset_index().values):
    # Skip a category only when it was not touched by the categories cell
    # and at least as many parts are cached as the category lists.
    cached = (df_par['Cat_Num'] == category_number).sum()
    if category_number not in new and cached >= category_elements:
        continue

    # Ceiling division; assumes 50 items per listing page — confirm against the site.
    pages = -(-int(category_elements)//50)
    for page in range(1, pages+1):
        bs = BeautifulSoup(requests.get(url.format(page, category_number), headers=headers).text, 'lxml')

        listing = bs.find('table', {'class':'bg-color--white catalog-list__body-main catalog-list__body-main--alternate-row'})
        if listing is None:
            # Keep going: the category stays incomplete and is retried on the next run.
            print('No part table for category {} page {}; skipped'.format(category_number, page))
            continue

        for tr in listing.find_all('tr'):
            link = tr.find('a')
            name_tag = tr.find('strong')
            if link is None or name_tag is None:
                # Row without a part link or a bold name: not a part row.
                continue
            match = num.search(link.get('href', ''))
            if match is None:
                continue
            # Genuine DataFrame errors are no longer swallowed here.
            df_par.loc[match[1]] = [category_number, category_name, name_tag.get_text()]
df_par.to_csv(file_parts)

# Colors

In [None]:
# Resume from the cached color table when the file exists; otherwise start
# with an empty frame indexed by (Part_Num, Col_Num).
# os.path.isfile also works when the path contains a directory component,
# which a membership test against os.listdir() does not.
# NOTE(review): read_csv infers the dtypes of Part_Num and Col_Num, while the
# values scraped later in this notebook are strings — confirm the two agree.
if os.path.isfile(file_colors):
    df_col = pd.read_csv(file_colors, index_col=['Part_Num', 'Col_Num'])
else:
    df_col = pd.DataFrame(columns=['Cat_Num', 'Cat_Name', 'Part_Num', 'Part_Name', 'Col_Num', 'Col_Name', 'Num_Sets', 'Num_Sold']).set_index(['Part_Num', 'Col_Num'])
print(df_col.shape)
df_col.tail(1)

In [None]:
def _color_links(cell):
    """Yield ``(col_num, col_name, count)`` for each ``<span>`` in ``cell`` that holds a link.

    ``col_num`` is cut out of the link's ``colorID=`` query parameter,
    ``col_name`` is the stripped link text, and ``count`` is the text after the
    last ``(`` of the span with its final character removed. Spans without an
    ``<a href>`` are skipped.
    """
    for span in cell.find_all('span'):
        link = span.find('a')
        if link is None or 'href' not in link.attrs:
            continue
        col_num = link['href'].split('colorID=')[-1].split('&')[0]
        col_name = link.get_text().strip()
        count = span.get_text().rsplit('(', 1)[-1].strip()[:-1]
        yield col_num, col_name, count


def scrape(part_num, cat_num, cat_name, part_name):
    """Fetch one part's item page and add a row per color to ``df_col``.

    Reads the module-level ``url`` (formatted with ``part_num``) and
    ``headers``, and writes into the module-level ``df_col``. ``url`` must hold
    the item-page template when this runs; other cells in this notebook bind
    the same name to different addresses.

    The third cell of the ``pciColorInfoTable`` table supplies the value stored
    as ``Num_Sold`` and the fourth cell supplies ``Col_Name`` and ``Num_Sets``.
    Counts are stored as the strings scraped from the page; a color missing
    from one of the two cells gets ``0`` for that count, and a color that only
    appears in the third cell keeps ``Col_Name`` as ``None``.

    Parameters
    ----------
    part_num : part number, used in the URL and as the first index level.
    cat_num, cat_name, part_name : copied unchanged into every row written.

    Returns
    -------
    None. A page without the expected table is reported and skipped.
    """
    bs = BeautifulSoup(requests.get(url.format(part_num), headers=headers).text, 'lxml')
    table = bs.find('table', {'class':'pciColorInfoTable'})
    td = table.find_all('td') if table is not None else []
    if len(td) < 4:
        # Do not abort the surrounding loop over one page lacking the table.
        print('scrape: no color table for part {!r}; skipped'.format(part_num))
        return

    col_dict = {}
    for col_num, _, sold in _color_links(td[2]):
        col_dict[col_num] = {'sold':sold, 'col_name':None, 'sets':0}

    for col_num, col_name, sets in _color_links(td[3]):
        entry = col_dict.setdefault(col_num, {'sold':0, 'col_name':None, 'sets':0})
        entry['col_name'] = col_name
        entry['sets'] = sets

    columns = ['Cat_Num', 'Cat_Name', 'Part_Name', 'Col_Name', 'Num_Sets', 'Num_Sold']
    try:
        for col_num, info in col_dict.items():
            df_col.loc[(part_num, col_num), columns] = [cat_num, cat_name, part_name, info['col_name'], info['sets'], info['sold']]
    except Exception as err:
        # Still best-effort (the remaining colors of this part are dropped, as
        # before), but the failure is now reported instead of hidden.
        print('scrape: could not store colors for part {!r}: {!r}'.format(part_num, err))
        

# Item-page URL template; scrape() reads this module-level name when it runs.
url = 'https://www.bricklink.com/v2/catalog/catalogitem.page?P={}#T=C'

# Visit only part numbers absent from the first level of df_col's index.
already_scraped = df_col.index.levels[0]
pending = set(df_par.index).difference(already_scraped)
for part_num in tqdm(pending):
    part_row = df_par.loc[part_num].values
    cat_num, cat_name, part_name = part_row
    scrape(part_num, cat_num, cat_name, part_name)

In [None]:
# Write the color table, including its (Part_Num, Col_Num) index, to the cache file.
df_col.to_csv(path_or_buf=file_colors)

In [None]:
# The colors CSV is written with a header row and with the (Part_Num, Col_Num)
# index as its first two columns. Reading it with header=None would turn the
# header into a data row, and assigning names by hand would mislabel the
# columns, so the names are taken from the file and the columns only reordered.
df = pd.read_csv(file_colors)
df = df[['Cat_Num', 'Cat_Name', 'Part_Num', 'Part_Name', 'Col_Num', 'Col_Name', 'Num_Sets', 'Num_Sold']]
print(df.shape)
df.head(10)