# Testing

Data source: [thomasnet.com](https://www.thomasnet.com/), the supplier search scraped in this notebook.

In [3]:
import ast
import json
import math
import time
from datetime import datetime
from traceback import print_exc

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [5]:
# Endpoint that serves the supplier search result pages.
BASE_URL = "https://www.thomasnet.com/nsearch.html"

# Browser-like request headers.
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"}

# Query-string parameters for the first results page of the search.
# Parameters that were tried and left disabled:
#   WTZO="Find Suppliers", searchx="true", which="prod"
payload = dict(
    cov="NA",
    heading=21650809,
    searchsource="suppliers",
    searchterm="hydraulic cylinders",
    what="hydraulic cylinders",
    pg=1,
)

In [6]:
# Fetch the first results page to learn how many suppliers match the search
# and how many cards one page holds; together they give the page count.
# The request now sends the `headers` dict defined in the config cell and
# has a timeout so a stalled connection cannot hang the notebook.
page = requests.get(BASE_URL, params=payload, headers=headers, timeout=30)
page.raise_for_status()  # stop here rather than parse an HTTP error page
soup = BeautifulSoup(page.text, "lxml")

subheader = soup.find('p', class_="supplier-search-results__subheader")
bold_tags = subheader.find_all('b') if subheader is not None else []
if not bold_tags:
    raise ValueError("Could not locate the supplier count on the results page")

# The last <b> inside the subheader is read as the total number of matches.
total_suppliers = bold_tags[-1].text
print(f"{total_suppliers} suppliers found")

n_suppliers = len(soup.find_all('div', class_="supplier-search-results__card"))
print(f"{n_suppliers} found on this page")

if n_suppliers == 0:
    raise ValueError("No supplier cards found on the first page; cannot derive the page count")

# Drop thousands separators so a count such as "1,234" still parses.
total_count = float(total_suppliers.replace(',', '').strip())
number_of_pages = math.ceil(total_count / n_suppliers)
print(f"Total Pages: {number_of_pages}")

772 suppliers found
25 found on this page
Total Pages: 31


In [7]:
def generate_payloads(keyword: str, page_num: int, heading: int = 21650809):
    """Build the query parameters for every results page of a search.

    Args:
        keyword: Search phrase; sent as both ``searchterm`` and ``what``.
        page_num: Number of result pages to cover. Values of zero or less
            yield an empty list.
        heading: Value sent as the ``heading`` query parameter. Defaults to
            the value that was previously hard-coded, so existing calls
            behave exactly as before.

    Returns:
        A list of ``page_num`` dicts, one per page, whose ``pg`` entry runs
        from 1 to ``page_num``.
    """
    return [
        {
            "cov": "NA",
            "heading": heading,
            "searchsource": "suppliers",
            "searchterm": keyword,
            "what": keyword,
            "pg": page_index,
        }
        for page_index in range(1, page_num + 1)
    ]

# One query payload per results page for the keyword being scraped.
x = generate_payloads(keyword='hydraulic cylinders', page_num=number_of_pages)

In [8]:
collected_data = []
keyword = 'hydraulic cylinders'
heading = 21650809


def _parse_tracking(raw):
    """Decode a card's ``data-impression-tracking`` attribute.

    The attribute comes from a remote page, so it is parsed as data (JSON
    first, then a Python literal) instead of being executed with ``eval``.
    """
    try:
        return json.loads(raw)
    except ValueError:
        return ast.literal_eval(raw)


def _stripped_text(node):
    """Return the stripped text of ``node``, or '' when the node is missing."""
    return node.text.strip() if node is not None else ""


def _extract_card(sup, searchterm):
    """Turn one supplier card element into a flat record.

    Every field starts as '' so the record always has the same keys.
    Company id, name, profile url, location and company type are treated as
    required: if any of them cannot be read, the traceback is printed and
    the record is returned with whatever was filled in up to that point.
    All other fields are optional and simply stay '' when absent.
    """
    card_data = {
        "company_id": "",
        "company_name": "",
        "company_type": "",
        "annual_revenue": "",
        "year_founded": "",
        "num_employees": "",
        "location": "",
        "company_url": "",
        "brands": "",
        "description": "",
        "url": "",
        "telephone": "",
        "searchterm": searchterm,
    }
    try:
        card_header = sup.find('header', class_='profile-card__header')
        card_data['company_id'] = _parse_tracking(sup.get('data-impression-tracking'))['company_id']
        title = card_header.find('h2', class_='profile-card__title')
        card_data['company_name'] = title.text.strip()
        card_data['url'] = 'https://www.thomasnet.com' + title.find('a').get('href')

        call_link = sup.find('a', {'data-conversion_action': 'Call'})
        if call_link is not None:
            card_data['telephone'] = call_link.get('href')

        sup_data = sup.find('div', class_='profile-card__supplier-data')
        card_data['location'] = sup_data.find('span', class_='profile-card__location').text.replace('ico-map', '').strip()
        card_data['company_type'] = sup_data.find('span', {'data-content': 'Company Type'}).text.strip()
        card_data['annual_revenue'] = _stripped_text(sup_data.find('span', {'data-content': 'Annual Revenue'}))
        card_data['num_employees'] = _stripped_text(sup_data.find('span', {'data-content': 'Number of Employees'}))
        card_data['year_founded'] = _stripped_text(sup_data.find('span', {'data-content': 'Year Founded'}))

        content = sup.find('div', class_='profile-card__content')
        if content is not None:
            # The first paragraph of the content area is taken as the description.
            card_data['description'] = _stripped_text(content.find('p'))
            # NOTE(review): this class name has a single underscore, unlike the
            # other 'profile-card__...' selectors, and company_url is empty in
            # the sample output - confirm against the page markup.
            link_wrap = content.find('p', {'class': 'profile-card_web-link-wrap'})
            web_link = link_wrap.find('a') if link_wrap is not None else None
            if web_link is not None:
                card_data['company_url'] = web_link.get('href')
            card_data['brands'] = _stripped_text(content.find('p', {'class': 'profile-card__brands__body'}))
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be passed to print() as an argument.
        print("Error encountered while extraction of data")
        print_exc()
    return card_data


for page_payload in tqdm(x):
    try:
        page = requests.get(BASE_URL, params=page_payload, headers=headers, timeout=30)
        page.raise_for_status()
    except requests.RequestException as exc:
        # Keep what has been scraped so far and report the page that failed.
        print(f"Skipping page {page_payload['pg']}: {exc}")
        continue
    soup = BeautifulSoup(page.text, "lxml")

    suppliers = soup.find_all('div', class_="supplier-search-results__card")
    for sup in suppliers:
        collected_data.append(_extract_card(sup, keyword))

df = pd.DataFrame(collected_data)
print(df.shape)
df.tail()

100%|██████████| 31/31 [01:55<00:00,  3.73s/it](772, 13)



Unnamed: 0,company_id,company_name,company_type,annual_revenue,year_founded,num_employees,location,company_url,brands,description,url,telephone,searchterm
767,10075759,Sharpe,Manufacturer*,,1987.0,,"Delta, BC",,,Manufacturer of servo actuators.,https://www.thomasnet.com/profile/10075759/sha...,,hydraulic cylinders
768,30701351,Norcan Fluid Power Ltd.,Distributor*,$5 - 9.9 Mil,,10-49,"Calgary, AB",,,,https://www.thomasnet.com/profile/30701351/nor...,tel://403-236-8392,hydraulic cylinders
769,10087352,Progressive Fluid Power Inc.,"Manufacturer*, Service Company",,,1-9,"Kitchener, ON",,,"Accumulators, Controls, Cylinders, Drives, Fit...",https://www.thomasnet.com/profile/10087352/pro...,tel://519-748-5886,hydraulic cylinders
770,418858,"Eastern Welding, Inc.",Manufacturer*,,1945.0,10-49,"Riverhead, NY",,,"Manufacturer of truck bodies, marine & hydraul...",https://www.thomasnet.com/profile/00418858/eas...,tel://800-772-2529,hydraulic cylinders
771,30691080,Eastern Industries & Hydraulics Ltd.,Manufacturer*,$1 - 4.9 Mil,1982.0,1-9,"Mount Pearl, NL",,,,https://www.thomasnet.com/profile/30691080/eas...,tel://709-747-3333,hydraulic cylinders


In [12]:
# Write the scraped supplier table to disk without the positional index column.
output_csv = 'hydraulic_cylinders_suppliers_metadata.csv'
df.to_csv(output_csv, index=False)

# Meta-Harvester Class

772 suppliers found
25 found on this page
Total Pages: 31
