<a href="https://colab.research.google.com/github/JamesEBall/GMAC-Cambodia-DB-Scraper/blob/main/GMAC_DB_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!pip install tqdm progress

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting progress
  Downloading progress-1.6.tar.gz (7.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: progress
  Building wheel for progress (setup.py) ... [?25l[?25hdone
  Created wheel for progress: filename=progress-1.6-py3-none-any.whl size=9630 sha256=cc6c7a0314b7dbd2f0beefa8ace94f9e9dc97850b418c5814e2555d8488b635b
  Stored in directory: /root/.cache/pip/wheels/4c/9b/0a/a78ff56725af3ef70792f9ed0f8dbbc4c0315edc62cbc4a6b8
Successfully built progress
Installing collected packages: progress
Successfully installed progress-1.6


In [41]:
import concurrent.futures
import os
import sys
import time
import pandas as pd
from progress.bar import Bar
import urllib.parse

member_id = 1
all_data = []
num_threads = 10
empty_pages_in_succession = 0
max_empty_pages_in_succession = 100

def generate_google_maps_link(address):
    base_url = "https://www.google.com/maps/search/"
    encoded_address = urllib.parse.quote(address)
    return f"{base_url}{encoded_address}"

def get_individual_data(member_id):
    url = f'https://gmac-cambodia.org/member_front_detail/{member_id}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    company_li = soup.find('strong', string='Company:').parent
    owner_from_li = soup.find('strong', string='Owner From:').parent
    telephone_li = soup.find('strong', string='Telephone:').parent
    email_li = soup.find('strong', string='Email:').parent
    category_li = soup.find('strong', string='Category:').parent
    number_of_workers_li = soup.find('strong', string='Number of Workers:').parent
    product_li = soup.find('strong', string='Product:').parent
    location_li = soup.find('strong', string='Location:').parent
    last_updated_li = soup.find('strong', string='Last Updated:').parent

    if any(el is None for el in [company_li, owner_from_li, telephone_li, email_li, category_li, number_of_workers_li, product_li, location_li, last_updated_li]):
        return None

    data = {
        'company': company_li.contents[-1].strip(),
        'owner_from': owner_from_li.contents[-1].strip(),
        'telephone': telephone_li.contents[-1].strip(),
        'email': email_li.a['href'].replace('mailto:', ''),
        'category': category_li.span.text.strip(),
        'number_of_workers': int(number_of_workers_li.contents[-1].strip()),
        'product': ', '.join([badge.text.strip() for badge in product_li.find_all('a', class_='badge')]),
        'location': location_li.contents[-1].strip(),
        'last_updated': last_updated_li.contents[-1].strip()
    }

    print(data)
    return data

def process_member(member_id):
    print(f'Scraping member {member_id}')
    try:
        individual_data = get_individual_data(member_id)
        return individual_data
    except Exception as e:
        print(f'Error scraping member {member_id}: {e}')
        return None

total_bar = Bar('Total progress', max=num_threads * entries_per_thread)
progress_bars = [Bar(f'Thread {i + 1}: Scraping member {member_id + i * entries_per_thread}', max=entries_per_thread) for i in range(num_threads)]

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    while empty_pages_in_succession < max_empty_pages_in_succession:
        future_results = {executor.submit(process_member, member_id + i): member_id + i for i in range(num_threads)}
        results = [future.result() for future in concurrent.futures.as_completed(future_results)]

        if all(result is None for result in results):
            empty_pages_in_succession += num_threads
        else:
            all_data.extend(filter(None, results))
            empty_pages_in_succession = 0

        member_id += num_threads

df = pd.DataFrame(all_data)
df['Google Maps Link'] = df['location'].apply(generate_google_maps_link)
df.to_csv('gmac_cambodia_data.csv', index=False)

print('Data saved to gmac_cambodia_data.csv')

Scraping member 1
Scraping member 2
Scraping member 3
Scraping member 4
Scraping member 5
Scraping member 6
Scraping member 7Scraping member 8

Scraping member 9
Scraping member 10
Error scraping member 2: 'NoneType' object has no attribute 'parent'
Error scraping member 1: 'NoneType' object has no attribute 'parent'Error scraping member 6: 'NoneType' object has no attribute 'parent'

Error scraping member 5: 'NoneType' object has no attribute 'parent'
Error scraping member 4: 'NoneType' object has no attribute 'parent'Error scraping member 3: 'NoneType' object has no attribute 'parent'Error scraping member 7: 'NoneType' object has no attribute 'parent'Error scraping member 8: 'NoneType' object has no attribute 'parent'



{'company': 'Evergreen Garment Co., Ltd.', 'owner_from': 'Korea', 'telephone': '(855)-23-722571', 'email': 'evergreen@junkwang.com', 'category': 'Garment', 'number_of_workers': 804, 'product': 'Polo-shirt, Pants, T-Shirt, Jacket', 'location': 'NR 5,                  