# Testing

Target website for the scraping tests below: [Thomasnet](https://www.thomasnet.com/)

In [1]:
import requests
from bs4 import BeautifulSoup
from traceback import print_exc

from datetime import datetime
import time

import numpy as np 
import pandas as pd
import math

In [2]:
# Search endpoint queried by the exploratory cells.
BASE_URL = "https://www.thomasnet.com/nsearch.html"

# Browser-like User-Agent header (Chrome 91 on Windows 10).
headers = dict([
    (
        "user-agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    ),
])

# Query-string parameters for page 1 of the "hydraulic cylinders" supplier search.
# Parameters left disabled: "WTZO" ("Find Suppliers"), "searchx" ("true"), "which" ("prod").
payload = dict(
    cov="NA",
    heading=21650809,
    searchsource="suppliers",
    searchterm="hydraulic cylinders",
    what="hydraulic cylinders",
    pg=1,
)

In [3]:
# Fetch the first results page. The browser-like `headers` defined in the
# configuration cell are sent along, the wait is bounded so a stalled
# connection cannot hang the notebook, and HTTP errors surface immediately
# instead of being parsed as an empty result page.
page = requests.get(BASE_URL, params=payload, headers=headers, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, "lxml")

# The last <b> element of the sub-header carries the overall supplier count.
total_suppliers = soup.find('p', class_="supplier-search-results__subheader").find_all('b')[-1].text
print(f"{total_suppliers} suppliers found")

n_suppliers = len(soup.find_all('div', class_="supplier-search-results__card"))
print(f"{n_suppliers} found on this page")

# Drop thousands separators before converting, and report zero pages rather
# than dividing by zero when no supplier card was found on the page.
total_count = float(total_suppliers.replace(',', '').strip())
number_of_pages = math.ceil(total_count / n_suppliers) if n_suppliers else 0
print(f"Total Pages: {number_of_pages}")

772 suppliers found
25 found on this page
Total Pages: 31


In [4]:
def generate_payloads(keyword: str, page_num: int, heading: int = 21650809):
    """Build one search payload per results page.

    Args:
        keyword: Search phrase; sent as both ``searchterm`` and ``what``.
        page_num: Number of result pages. Payloads are numbered 1..page_num.
        heading: Value of the ``heading`` query parameter. Defaults to
            21650809, the value that used to be hard-coded here.

    Returns:
        A list of ``page_num`` independent dicts, ordered by page number.
        The list is empty when ``page_num`` is zero or negative.
    """
    return [
        {
            "cov": "NA",
            "heading": heading,
            "searchsource": "suppliers",
            "searchterm": keyword,
            "what": keyword,
            "pg": page,
        }
        for page in range(1, page_num + 1)
    ]

# One payload per results page of the 'hydraulic cylinders' search.
x = generate_payloads(
    keyword='hydraulic cylinders',
    page_num=number_of_pages,
)

In [8]:
import ast
import json


def _text_or_blank(node):
    """Return the stripped text of a parsed node, or "" when the node is missing."""
    return node.text.strip() if node is not None else ""


def _href_or_blank(node):
    """Return the ``href`` attribute of a parsed tag, or "" when tag or attribute is missing."""
    if node is None:
        return ""
    return node.get('href') or ""


def _parse_tracking(raw):
    """Decode the ``data-impression-tracking`` attribute without executing it.

    The attribute comes from a downloaded page, so it is parsed as data
    (JSON first, then a plain Python literal) instead of being passed to
    ``eval``, which would run whatever expression the page contained.
    """
    try:
        return json.loads(raw)
    except ValueError:
        return ast.literal_eval(raw)


collected_data = []

keyword = 'hydraulic cylinders'
# Only the last results page is fetched here, as a check of the card parser.
page = requests.get(BASE_URL, params=x[-1], headers=headers, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, "lxml")

for sup in soup.find_all('div', class_="supplier-search-results__card"):
    card_data = {
        "company_id": "",
        "company_name": "",
        "company_type": "",
        "annual_revenue": "",
        "year_founded": "",
        "num_employees": "",
        "location": "",
        "company_url": "",
        "brands": "",
        "description": "",
        "url": "",
        "telephone": "",
        "searchterm": keyword
    }
    try:
        # Mandatory fields: a missing element raises and is reported below,
        # and the partially filled record is still kept.
        header = sup.find('header', class_='profile-card__header')
        card_data['company_id'] = _parse_tracking(sup.get('data-impression-tracking'))['company_id']
        title = header.find('h2', class_='profile-card__title')
        card_data['company_name'] = title.text.strip()
        card_data['url'] = 'https://www.thomasnet.com' + title.find('a').get('href')

        # Optional fields stay "" when the element is absent.
        card_data['telephone'] = _href_or_blank(sup.find('a', {'data-conversion_action': 'Call'}))

        sup_data = sup.find('div', class_='profile-card__supplier-data')
        card_data['location'] = sup_data.find('span', class_='profile-card__location').text.replace('ico-map', '').strip()
        card_data['company_type'] = sup_data.find('span', {'data-content': 'Company Type'}).text.strip()
        card_data['annual_revenue'] = _text_or_blank(sup_data.find('span', {'data-content': 'Annual Revenue'}))
        card_data['num_employees'] = _text_or_blank(sup_data.find('span', {'data-content': 'Number of Employees'}))
        card_data['year_founded'] = _text_or_blank(sup_data.find('span', {'data-content': 'Year Founded'}))

        content = sup.find('div', class_='profile-card__content')
        if content is not None:
            card_data['description'] = _text_or_blank(content.find('p'))
            # NOTE(review): every other selector uses the `profile-card__` prefix;
            # `profile-card_web-link-wrap` (single underscore) may be a typo, and the
            # recorded output shows company_url empty for every row. Confirm against
            # the page markup before changing it.
            web_link_wrap = content.find('p', {'class': 'profile-card_web-link-wrap'})
            if web_link_wrap is not None:
                card_data['company_url'] = _href_or_blank(web_link_wrap.find('a'))
            card_data['brands'] = _text_or_blank(content.find('p', {'class': 'profile-card__brands__body'}))
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be passed to print().
        print("Error encountered while extraction of data")
        print_exc()

    collected_data.append(card_data)

df = pd.DataFrame(collected_data)
df

Unnamed: 0,company_id,company_name,company_type,annual_revenue,year_founded,num_employees,location,company_url,brands,description,url,telephone,searchterm
0,41982,"Lincoln Machine Co., Inc.",Manufacturer*,$5 - 9.9 Mil,1939.0,10-49,"Salem, OH",,,Manufacturer Of Air & Hydraulic Cylinders,https://www.thomasnet.com/profile/00041982/lin...,tel://330-332-4689,hydraulic cylinders
1,10090665,Columbia Chrome Industries,"Manufacturer*, Service Company",,1998.0,10-49,"Sparwood, BC",,,,https://www.thomasnet.com/profile/10090665/col...,tel://250-425-2818,hydraulic cylinders
2,10088636,Diotte's Hydraulics,"Manufacturer*, Service Company",,1978.0,,"Ottawa, ON",,,,https://www.thomasnet.com/profile/10088636/dio...,tel://613-244-4735,hydraulic cylinders
3,162303,"Precision Machine & Hydraulic, Inc.",Custom Manufacturer*,,,10-49,"Worthington, WV",,,Custom manufacturer of hydraulic components. P...,https://www.thomasnet.com/profile/00162303/pre...,tel://304-287-2117,hydraulic cylinders
4,30682980,Waterview Machine Works Ltd.,"Custom Manufacturer*, Manufacturer",$1 - 4.9 Mil,1984.0,10-49,"Yarmouth, NS",,,,https://www.thomasnet.com/profile/30682980/wat...,tel://902-742-1476,hydraulic cylinders
5,20096639,HMC Inc.,"Custom Manufacturer*, Manufacturer",$1 - 4.9 Mil,1983.0,10-49,"Mount Ida, AR",,,"Manufacturer of hydraulic cylinder parts, flui...",https://www.thomasnet.com/profile/20096639/hmc...,tel://870-867-4143,hydraulic cylinders
6,30690018,Parker-Hannifin Canada - Fluid Connectors Divi...,Manufacturer*,$25 - 49.9 Mil,1962.0,200-499,"Beamsville, ON",,,,https://www.thomasnet.com/profile/30690018/par...,tel://905-945-2274,hydraulic cylinders
7,30162771,Hydra Kinetics,Manufacturers' Rep*,Under $1 Mil,,1-9,"Heidenheimer, TX",,,"Mfr's. Rep. Of Cutom Made Hydraulic Cylinders,...",https://www.thomasnet.com/profile/30162771/hyd...,tel://254-983-1067,hydraulic cylinders
8,10104899,Morrison Hydraulics Ltd.,"Manufacturer*, Service Company",,,1-9,"Langley, BC",,,,https://www.thomasnet.com/profile/10104899/mor...,tel://604-856-6810,hydraulic cylinders
9,30701756,Granite Industries,"Custom Manufacturer*, Manufacturer",$1 - 4.9 Mil,2000.0,1-9,"McAdam, NB",,,,https://www.thomasnet.com/profile/30701756/gra...,tel://506-784-2211,hydraulic cylinders


# Meta-Harvester Class

In [57]:
import os, re, time
import traceback

import requests
from bs4 import BeautifulSoup

import math
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore')

from multiprocessing import Pool


class ThomasnetMetaDataScraper:
    """Collect supplier metadata from Thomasnet search result pages.

    The constructor expects a ``config`` keyword argument: a dict with the
    keys ``keyword`` (search phrase), ``heading`` (value of the ``heading``
    query parameter) and ``saving_path`` (CSV destination used by
    ``save_data``).

    Attributes are documented inline in ``__init__``.
    """

    # Search endpoint shared by the instance methods and the static worker.
    BASE_URL = "https://www.thomasnet.com/nsearch.html"
    # Upper bound, in seconds, on any single HTTP request.
    REQUEST_TIMEOUT = 30

    def __init__(self, **kwargs):
        """Store the configuration and prepare the page-1 payload."""
        self.config = kwargs.get('config')
        # Payload used to probe the first page and derive the page count.
        self.base_payload = {
            "cov": "NA",
            "heading": self.config['heading'],
            "searchsource": "suppliers",
            "searchterm": self.config['keyword'],
            "what": self.config['keyword'],
            "pg": 1
        }
        # Supplier records (dicts) accumulated by run().
        self.collected_data = []
        # DataFrame view of collected_data; filled by run() and save_data().
        self.metadata = None

    @staticmethod
    def _pages_needed(total_text, per_page):
        """Return the page count for ``total_text`` results at ``per_page`` per page.

        ``total_text`` may contain thousands separators. Zero is returned
        when ``per_page`` is zero, so an empty result page does not divide
        by zero.
        """
        if not per_page:
            return 0
        total = float(str(total_text).replace(',', '').strip())
        return math.ceil(total / float(per_page))

    @staticmethod
    def _text_or_blank(node):
        """Return the stripped text of a parsed node, or "" when the node is missing."""
        return node.text.strip() if node is not None else ""

    @staticmethod
    def _href_or_blank(node):
        """Return the ``href`` of a parsed tag, or "" when tag or attribute is missing."""
        if node is None:
            return ""
        return node.get('href') or ""

    @staticmethod
    def _parse_tracking(raw):
        """Decode the ``data-impression-tracking`` attribute without executing it.

        The value comes from a downloaded page, so it is parsed as data
        (JSON first, then a plain Python literal) rather than handed to
        ``eval``.
        """
        import ast
        import json

        try:
            return json.loads(raw)
        except ValueError:
            return ast.literal_eval(raw)

    @staticmethod
    def _fetch_page(params, attempts=5, delay=5):
        """Download and parse one results page, retrying on any failure.

        Returns the parsed page, or None once ``attempts`` tries have failed.
        """
        for attempt in range(1, attempts + 1):
            try:
                response = requests.get(
                    ThomasnetMetaDataScraper.BASE_URL,
                    params=params,
                    timeout=ThomasnetMetaDataScraper.REQUEST_TIMEOUT,
                )
                # Treat HTTP error statuses as failures worth retrying.
                response.raise_for_status()
                return BeautifulSoup(response.text, 'lxml')
            except Exception:
                print(f'Retrying {attempt}/{attempts} times...')
                if attempt < attempts:
                    time.sleep(delay)
        return None

    @staticmethod
    def _parse_card(sup, searchterm):
        """Turn one supplier card element into a flat record.

        Company id, name, profile url, location and company type are
        mandatory: if any of their elements is missing the resulting
        exception propagates to the caller. All other fields are optional
        and stay "" when absent.
        """
        cls = ThomasnetMetaDataScraper
        card_data = {
            "company_id": "",
            "company_name": "",
            "company_type": "",
            "annual_revenue": "",
            "year_founded": "",
            "num_employees": "",
            "location": "",
            "company_url": "",
            "brands": "",
            "description": "",
            "url": "",
            "telephone": "",
            "searchterm": searchterm
        }

        header = sup.find('header', class_='profile-card__header')
        card_data['company_id'] = cls._parse_tracking(sup.get('data-impression-tracking'))['company_id']
        title = header.find('h2', class_='profile-card__title')
        card_data['company_name'] = title.text.strip()
        card_data['url'] = 'https://www.thomasnet.com' + title.find('a').get('href')
        card_data['telephone'] = cls._href_or_blank(sup.find('a', {'data-conversion_action': 'Call'}))

        sup_data = sup.find('div', class_='profile-card__supplier-data')
        card_data['location'] = sup_data.find('span', class_='profile-card__location').text.replace('ico-map', '').strip()
        card_data['company_type'] = sup_data.find('span', {'data-content': 'Company Type'}).text.strip()
        card_data['annual_revenue'] = cls._text_or_blank(sup_data.find('span', {'data-content': 'Annual Revenue'}))
        card_data['num_employees'] = cls._text_or_blank(sup_data.find('span', {'data-content': 'Number of Employees'}))
        card_data['year_founded'] = cls._text_or_blank(sup_data.find('span', {'data-content': 'Year Founded'}))

        content = sup.find('div', class_='profile-card__content')
        if content is not None:
            card_data['description'] = cls._text_or_blank(content.find('p'))
            # NOTE(review): the other selectors use the `profile-card__` prefix;
            # `profile-card_web-link-wrap` (single underscore) may be a typo.
            # Confirm against the page markup before changing it.
            web_link_wrap = content.find('p', {'class': 'profile-card_web-link-wrap'})
            if web_link_wrap is not None:
                card_data['company_url'] = cls._href_or_blank(web_link_wrap.find('a'))
            card_data['brands'] = cls._text_or_blank(content.find('p', {'class': 'profile-card__brands__body'}))
        return card_data

    def find_num_pages(self, payload):
        """Fetch the page described by ``payload`` and derive the total page count.

        The count is the advertised supplier total divided by the number of
        cards on the fetched page, rounded up. Progress is printed.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            AttributeError: if the results sub-header is not present.
        """
        page = requests.get(self.BASE_URL, params=payload, timeout=self.REQUEST_TIMEOUT)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "lxml")

        # The last <b> element of the sub-header carries the overall count.
        total_suppliers = soup.find('p', class_="supplier-search-results__subheader").find_all('b')[-1].text
        print(f"{total_suppliers} suppliers found")

        n_suppliers = len(soup.find_all('div', class_="supplier-search-results__card"))
        print(f"{n_suppliers} found on this page")

        number_of_pages = self._pages_needed(total_suppliers, n_suppliers)
        print(f"Total Pages: {number_of_pages}")
        return number_of_pages

    def generate_payload(self, num_pages: int, keyword: str):
        """Return one search payload per page, numbered 1..num_pages.

        ``keyword`` is sent as both ``searchterm`` and ``what``; ``heading``
        comes from the instance configuration. The list is empty when
        ``num_pages`` is zero or negative.
        """
        return [
            {
                "cov": "NA",
                "heading": self.config['heading'],
                "searchsource": "suppliers",
                "searchterm": keyword,
                "what": keyword,
                "pg": page_number
            }
            for page_number in range(1, num_pages + 1)
        ]

    @staticmethod
    def extract_data(payload):
        """Scrape every supplier card on the page described by ``payload``.

        Returns:
            ``{"page_data": [...], "success": bool}``. On a page-level
            failure (for example the download kept failing) the list is
            empty and ``success`` is False. Cards whose mandatory fields
            cannot be read are reported and skipped.
        """
        # Bound up front so a failure before parsing still yields a result.
        result = {"page_data": [], "success": False}
        try:
            soup = ThomasnetMetaDataScraper._fetch_page(payload)
            if soup is None:
                raise RuntimeError("page could not be downloaded")

            suppliers_list = []
            for sup in soup.find_all('div', class_="supplier-search-results__card"):
                searchterm = payload['searchterm']
                try:
                    suppliers_list.append(
                        ThomasnetMetaDataScraper._parse_card(sup, searchterm)
                    )
                except Exception:
                    # print_exc() writes the traceback itself and returns None,
                    # so it must not be passed to print().
                    print("Error encountered while extraction of data")
                    traceback.print_exc()

            result = {"page_data": suppliers_list, "success": True}
            print(f"Successfully scraped page {payload.get('pg')}")
        except Exception as e:
            print(f"Error encountered scraping page {payload.get('pg')}:\n{e}")
        return result

    def save_data(self):
        """Write the collected records to ``config['saving_path']`` as CSV.

        The parent directory is created when missing. Errors while building
        or writing the frame are printed, not raised.
        """
        saving_path = self.config['saving_path']
        target_dir = os.path.dirname(saving_path)
        # A bare filename has no directory part; an existing directory is fine.
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        try:
            self.metadata = pd.DataFrame(self.collected_data)
            self.metadata.to_csv(saving_path, index=False)
        except Exception as e:
            print(f"Error encountered saving metadata:\n\t{e}")

    def run(self):
        """Scrape all result pages for the configured keyword.

        Pages are fetched by a pool of 10 worker processes; records from
        successful pages are appended to ``collected_data``. The records
        are then exposed as ``metadata`` and printed.
        """
        num_pages = self.find_num_pages(self.base_payload)
        list_of_payloads = self.generate_payload(num_pages, self.config['keyword'])
        try:
            # NOTE(review): worker processes must be able to import this class;
            # with the 'spawn' start method that can fail for classes defined
            # in an interactive session. Confirm on the target platform.
            with Pool(processes=10) as pool:
                final_result = pool.map(self.extract_data, list_of_payloads)
            for result in final_result:
                if result["success"]:
                    self.collected_data.extend(result["page_data"])
        except Exception as e:
            print(f"Error occurred. Closing scraping process.\n{str(e)}")
            traceback.print_exc()
        finally:
            # Writing to disk stays disabled here (the save_data() call was
            # commented out); build the frame so the summary is not None.
            self.metadata = pd.DataFrame(self.collected_data)
            print(self.metadata)


if __name__ == '__main__':
    # Settings for the "hydraulic cylinders" supplier search: the search
    # phrase, the heading query parameter, and the CSV path used by save_data.
    config = dict(
        keyword="hydraulic cylinders",
        heading=21650809,
        saving_path="data/hydraulic_cylinders/hydraulic_cylinders_suppliers_metadata.csv",
    )
    scraper = ThomasnetMetaDataScraper(config=config)
    scraper.run()

772 suppliers found
25 found on this page
Total Pages: 31
