# Testing

Source website: [thomasnet.com](https://www.thomasnet.com/)

In [6]:
import ast
import math
import time
from datetime import datetime
from pathlib import Path
from traceback import print_exc

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [7]:
# Endpoint that serves the supplier search results.
BASE_URL = "https://www.thomasnet.com/nsearch.html"

# Browser-like request headers (a desktop Chrome user-agent string).
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.114 Safari/537.36"
    )
}

# Query-string parameters for the first results page of the search.
# Parameters the author left disabled: "WTZO": "Find Suppliers",
# "searchx": "true", "which": "prod".
payload = dict(
    cov="NA",
    heading=21650809,
    searchsource="suppliers",
    searchterm="hydraulic cylinders",
    what="hydraulic cylinders",
    pg=1,
)

In [8]:
# Fetch the first results page to learn how many suppliers match and how many
# cards are shown per page. The `headers` dict from the config cell is sent
# with the request, and a timeout keeps the cell from hanging indefinitely.
page = requests.get(BASE_URL, params=payload, headers=headers, timeout=30)
page.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(page.text, "lxml")

subheader = soup.find('p', class_="supplier-search-results__subheader")
if subheader is None:
    raise RuntimeError(
        "Supplier count not found on the results page; "
        "the page layout may have changed or the request was blocked"
    )

# The last <b> inside the subheader holds the total number of matching suppliers.
total_suppliers = subheader.find_all('b')[-1].text
print(f"{total_suppliers} suppliers found")

n_suppliers = len(soup.find_all('div', class_="supplier-search-results__card"))
print(f"{n_suppliers} found on this page")

# Tolerate thousands separators ("1,234") and avoid dividing by zero when the
# page shows no supplier cards.
total_count = float(total_suppliers.replace(',', '').strip())
number_of_pages = math.ceil(total_count / n_suppliers) if n_suppliers else 0
print(f"Total Pages: {number_of_pages}")

772 suppliers found
25 found on this page
Total Pages: 31


In [9]:
def generate_payloads(keyword: str, page_num: int, heading: int = 21650809):
    """Build one query-parameter dict per results page.

    Args:
        keyword: Search phrase, used for both the "searchterm" and "what"
            parameters.
        page_num: Number of result pages to cover. Pages are numbered from 1,
            so the payloads carry "pg" values 1..page_num.
        heading: Value of the "heading" query parameter. Defaults to the value
            previously hard-coded in this function.

    Returns:
        A list of ``page_num`` dicts (empty when ``page_num`` is 0 or negative),
        ordered by page number.
    """
    return [
        {
            "cov": "NA",
            "heading": heading,
            "searchsource": "suppliers",
            "searchterm": keyword,
            "what": keyword,
            "pg": page_number,
        }
        for page_number in range(1, page_num + 1)
    ]

# One payload per results page; number_of_pages was derived from the first
# results page fetched earlier in the notebook.
x = generate_payloads('hydraulic cylinders',number_of_pages)

In [10]:
collected_data = []
keyword = 'hydraulic cylinders'
heading = 21650809


def _text_or_blank(node):
    """Return the stripped text of a parsed HTML node, or '' when the node is missing."""
    return node.text.strip() if node is not None else ""


def _href_or_blank(node):
    """Return the node's href attribute, or '' when the node or the attribute is missing."""
    return node.get('href', "") if node is not None else ""


# Walk every results page and turn each supplier card into one record.
for page_payload in tqdm(x):
    try:
        # Send the browser-like headers from the config cell and bound the wait time.
        page = requests.get(BASE_URL, params=page_payload, headers=headers, timeout=30)
    except requests.RequestException:
        # A single failed page should not abort the whole crawl.
        print(f"Request failed for page {page_payload.get('pg')}")
        print_exc()
        continue
    if not page.ok:
        print(f"Skipping page {page_payload.get('pg')}: HTTP {page.status_code}")
        continue

    soup = BeautifulSoup(page.text, "lxml")

    suppliers = soup.find_all('div', class_="supplier-search-results__card")
    for sup in suppliers:
        # Every field defaults to '' so all records share the same columns.
        card_data = {
            "company_id": "",
            "company_name": "",
            "company_type": "",
            "annual_revenue": "",
            "year_founded": "",
            "num_employees": "",
            "location": "",
            "company_url": "",
            "brands": "",
            "description": "",
            "url": "",
            "telephone": "",
            "searchterm": keyword
        }
        try:
            # Required fields: a missing element raises here and is reported below.
            header = sup.find('header', class_='profile-card__header')
            # SECURITY: this attribute comes from a remote page, so it is parsed
            # with ast.literal_eval (literals only) rather than eval().
            tracking = ast.literal_eval(sup.get('data-impression-tracking'))
            card_data['company_id'] = tracking['company_id']
            title = header.find('h2', class_='profile-card__title')
            card_data['company_name'] = title.text.strip()
            card_data['url'] = 'https://www.thomasnet.com' + title.find('a').get('href')

            # Optional field: left blank when the card has no call link.
            card_data['telephone'] = _href_or_blank(sup.find('a', {'data-conversion_action': 'Call'}))

            sup_data = sup.find('div', class_='profile-card__supplier-data')
            card_data['location'] = sup_data.find('span', class_='profile-card__location').text.replace('ico-map', '').strip()
            card_data['company_type'] = sup_data.find('span', {'data-content': 'Company Type'}).text.strip()

            # Optional supplier facts: left blank when the span is absent.
            card_data['annual_revenue'] = _text_or_blank(sup_data.find('span', {'data-content': 'Annual Revenue'}))
            card_data['num_employees'] = _text_or_blank(sup_data.find('span', {'data-content': 'Number of Employees'}))
            card_data['year_founded'] = _text_or_blank(sup_data.find('span', {'data-content': 'Year Founded'}))

            # Optional content section: description, website link and brands.
            content = sup.find('div', class_='profile-card__content')
            if content is not None:
                card_data['description'] = _text_or_blank(content.find('p'))
                link_wrap = content.find('p', {'class': 'profile-card_web-link-wrap'})
                if link_wrap is not None:
                    card_data['company_url'] = _href_or_blank(link_wrap.find('a'))
                card_data['brands'] = _text_or_blank(content.find('p', {'class': 'profile-card__brands__body'}))
        except Exception:
            # print_exc() writes the traceback itself and returns None, so it
            # must not be passed to print() as an argument.
            print("Error encountered while extraction of data")
            print_exc()

        # Partially filled records are kept so a broken card is still visible in the output.
        collected_data.append(card_data)

df = pd.DataFrame(collected_data)
print(df.shape)
df.tail()

100%|██████████| 31/31 [02:18<00:00,  4.47s/it](772, 13)



Unnamed: 0,company_id,company_name,company_type,annual_revenue,year_founded,num_employees,location,company_url,brands,description,url,telephone,searchterm
767,10100771,Motion Hydraulics,"Manufacturer*, Service Company",,1974.0,,"Burlington, ON",,,,https://www.thomasnet.com/profile/10100771/mot...,tel://905-335-1171,hydraulic cylinders
768,30701202,Gaspe Machine Works Inc.,Manufacturer*,Under $1 Mil,1984.0,1-9,"Gaspe, QC",,,,https://www.thomasnet.com/profile/30701202/gas...,tel://418-368-6574,hydraulic cylinders
769,30164910,"T-Mac Cylinders, Inc",Manufacturer*,Under $1 Mil,1996.0,1-9,"Roscoe, IL",,,Manufacturer Of Hydraulic & Pneumatic Cylinder...,https://www.thomasnet.com/profile/30164910/tma...,tel://815-877-7090,hydraulic cylinders
770,30162771,Hydra Kinetics,Manufacturers' Rep*,Under $1 Mil,,1-9,"Heidenheimer, TX",,,"Mfr's. Rep. Of Cutom Made Hydraulic Cylinders,...",https://www.thomasnet.com/profile/30162771/hyd...,tel://254-983-1067,hydraulic cylinders
771,180836,"Production Engineering, Inc.","Custom Manufacturer*, Manufacturer, Service Co...",,1974.0,50-99,"Jackson, MI",,,Manufacturer of precision flexible machines & ...,https://www.thomasnet.com/profile/00180836/pro...,tel://517-788-6800,hydraulic cylinders


In [12]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout may not have it).
Path('data').mkdir(parents=True, exist_ok=True)

# Full supplier metadata, one row per scraped card.
df.to_csv('data/hydraulic_cylinders_suppliers_metadata.csv', index=False)
# Slim id -> profile URL listing.
df.loc[:, ['company_id', 'url']].to_csv('data/suppliers_url.csv', index=False)

# Meta-Harvester

In [17]:
# Forward slashes work on Windows and POSIX alike. The previous backslash path
# relied on invalid escape sequences ("\d", "\h") in a non-raw string, which
# Python warns about, and it only resolved on Windows.
meta = pd.read_csv('../data/hydraulic_cylinders/hydraulic_cylinders_suppliers_metadata.csv')
meta.tail()

Unnamed: 0,company_id,company_name,company_type,annual_revenue,year_founded,num_employees,location,company_url,brands,description,url,telephone,searchterm
767,10100771,Motion Hydraulics,"Manufacturer*, Service Company",,1974.0,,"Burlington, ON",,,,https://www.thomasnet.com/profile/10100771/mot...,tel://905-335-1171,hydraulic cylinders
768,30701202,Gaspe Machine Works Inc.,Manufacturer*,Under $1 Mil,1984.0,1-9,"Gaspe, QC",,,,https://www.thomasnet.com/profile/30701202/gas...,tel://418-368-6574,hydraulic cylinders
769,30164910,"T-Mac Cylinders, Inc",Manufacturer*,Under $1 Mil,1996.0,1-9,"Roscoe, IL",,,Manufacturer Of Hydraulic & Pneumatic Cylinder...,https://www.thomasnet.com/profile/30164910/tma...,tel://815-877-7090,hydraulic cylinders
770,30162771,Hydra Kinetics,Manufacturers' Rep*,Under $1 Mil,,1-9,"Heidenheimer, TX",,,"Mfr's. Rep. Of Cutom Made Hydraulic Cylinders,...",https://www.thomasnet.com/profile/30162771/hyd...,tel://254-983-1067,hydraulic cylinders
771,180836,"Production Engineering, Inc.","Custom Manufacturer*, Manufacturer, Service Co...",,1974.0,50-99,"Jackson, MI",,,Manufacturer of precision flexible machines & ...,https://www.thomasnet.com/profile/00180836/pro...,tel://517-788-6800,hydraulic cylinders


In [16]:
# Print the column dtypes and non-null counts of the loaded metadata frame.
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   company_id      747 non-null    int64  
 1   company_name    747 non-null    object 
 2   company_type    747 non-null    object 
 3   annual_revenue  533 non-null    object 
 4   year_founded    624 non-null    float64
 5   num_employees   639 non-null    object 
 6   location        746 non-null    object 
 7   company_url     61 non-null     object 
 8   brands          132 non-null    object 
 9   description     658 non-null    object 
 10  url             747 non-null    object 
 11  telephone       743 non-null    object 
 12  searchterm      747 non-null    object 
dtypes: float64(1), int64(1), object(11)
memory usage: 76.0+ KB


# Master Scraper

In [18]:
# Show only the company id and the profile URL columns.
meta[['company_id', 'url']]

Unnamed: 0,company_id,url
0,1315677,https://www.thomasnet.com/profile/01315677/wor...
1,10055126,https://www.thomasnet.com/profile/10055126/ram...
2,1010077,https://www.thomasnet.com/profile/01010077/flu...
3,30082354,https://www.thomasnet.com/profile/30082354/ari...
4,1150392,https://www.thomasnet.com/branchloc.html?cid=1...
...,...,...
767,10100771,https://www.thomasnet.com/profile/10100771/mot...
768,30701202,https://www.thomasnet.com/profile/30701202/gas...
769,30164910,https://www.thomasnet.com/profile/30164910/tma...
770,30162771,https://www.thomasnet.com/profile/30162771/hyd...
