# Testing

Data source: [Thomasnet](https://www.thomasnet.com/), the supplier directory scraped in this notebook.

In [1]:
import ast
import json
import math
import time
from datetime import datetime
from traceback import print_exc

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Browser-style User-Agent header (a desktop Chrome identification string).
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.114 Safari/537.36"
    )
}

# Query-string parameters for page 1 of the supplier search.
# Parameters left disabled: WTZO="Find Suppliers", searchx="true", which="prod".
payload = dict(
    cov="NA",
    heading=21650809,
    searchsource="suppliers",
    searchterm="hydraulic cylinders",
    what="hydraulic cylinders",
    pg=1,
)

# Search endpoint the query parameters are sent to.
BASE_URL = "https://www.thomasnet.com/nsearch.html"

In [3]:
# Fetch the first results page to learn how many suppliers and pages exist.
# The browser-like `headers` are sent with the request, and the timeout keeps
# a stalled connection from hanging the notebook.
page = requests.get(BASE_URL, params=payload, headers=headers, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, "lxml")

subheader = soup.find('p', class_="supplier-search-results__subheader")
if subheader is None:
    raise RuntimeError(
        "Results subheader not found; the page layout may have changed "
        "or the request was blocked"
    )

# The last <b> inside the subheader carries the total supplier count.
total_suppliers = subheader.find_all('b')[-1].text.strip()
print(f"{total_suppliers} suppliers found")

n_suppliers = len(soup.find_all('div', class_="supplier-search-results__card"))
print(f"{n_suppliers} found on this page")

if n_suppliers == 0:
    raise RuntimeError("No supplier cards on the first page; cannot derive the page count")

# Drop thousands separators ("1,234") before converting, then round up so a
# partially filled last page is still counted.
number_of_pages = math.ceil(float(total_suppliers.replace(',', '')) / n_suppliers)
print(f"Total Pages: {number_of_pages}")

772 suppliers found
25 found on this page
Total Pages: 31


In [4]:
def generate_payloads(keyword: str, page_num: int, heading: int = 21650809) -> list:
    """Build one search query payload per results page.

    Args:
        keyword: Search phrase; sent as both ``searchterm`` and ``what``.
        page_num: Number of result pages to cover. Pages are numbered from 1.
        heading: Value for the ``heading`` query parameter. The default is the
            value this notebook uses for its hydraulic-cylinders search.

    Returns:
        A list of ``page_num`` dicts, one per page, in ascending page order.
        The list is empty when ``page_num`` is zero or negative.
    """
    return [
        {
            "cov": "NA",
            "heading": heading,
            "searchsource": "suppliers",
            "searchterm": keyword,
            "what": keyword,
            "pg": page,
        }
        for page in range(1, page_num + 1)
    ]

# One query payload per results page of the hydraulic-cylinders search.
x = generate_payloads(keyword='hydraulic cylinders', page_num=number_of_pages)

In [10]:
collected_data = []
keyword = 'hydraulic cylinders'
heading = 21650809


def _parse_tracking(raw):
    """Decode the ``data-impression-tracking`` attribute without executing it.

    SECURITY: this value comes from a remote page. It used to be passed to
    ``eval``, which would run arbitrary code embedded in the attribute. JSON is
    tried first; ``ast.literal_eval`` covers a Python-literal style payload.
    """
    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        return ast.literal_eval(raw)


def _clean_text(tag):
    """Return the stripped text of a tag, or '' when the tag was not found."""
    return tag.text.strip() if tag is not None else ""


for page_params in tqdm(x):
    # Send the browser-like headers and bound the wait; a failed page is
    # reported and skipped so the rest of the run is not lost.
    try:
        page = requests.get(BASE_URL, params=page_params, headers=headers, timeout=30)
        page.raise_for_status()
    except requests.RequestException:
        print(f"Request failed for results page {page_params.get('pg')}")
        print_exc()
        continue
    soup = BeautifulSoup(page.text, "lxml")

    for sup in soup.find_all('div', class_="supplier-search-results__card"):
        # Every field starts blank so a partially parsed card still yields a row.
        card_data = {
            "company_id": "",
            "company_name": "",
            "company_type": "",
            "annual_revenue": "",
            "year_founded": "",
            "num_employees": "",
            "location": "",
            "company_url": "",
            "brands": "",
            "description": "",
            "url": "",
            "telephone": "",
            "searchterm": keyword
        }
        try:
            # --- Required fields: a missing tag aborts this card (logged below).
            header = sup.find('header', class_='profile-card__header')
            card_data['company_id'] = _parse_tracking(sup.get('data-impression-tracking'))['company_id']
            title = header.find('h2', class_='profile-card__title')
            card_data['company_name'] = title.text.strip()
            card_data['url'] = 'https://www.thomasnet.com' + title.find('a').get('href')

            # --- Optional: phone link.
            call_link = sup.find('a', {'data-conversion_action': 'Call'})
            if call_link is not None:
                card_data['telephone'] = call_link.get('href')

            sup_data = sup.find('div', class_='profile-card__supplier-data')
            card_data['location'] = sup_data.find('span', class_='profile-card__location').text.replace('ico-map', '').strip()
            card_data['company_type'] = sup_data.find('span', {'data-content': 'Company Type'}).text.strip()

            # --- Optional company facts: left blank when the span is absent.
            card_data['annual_revenue'] = _clean_text(sup_data.find('span', {'data-content': 'Annual Revenue'}))
            card_data['num_employees'] = _clean_text(sup_data.find('span', {'data-content': 'Number of Employees'}))
            card_data['year_founded'] = _clean_text(sup_data.find('span', {'data-content': 'Year Founded'}))

            # --- Optional card body: description, website link, brands.
            content = sup.find('div', class_='profile-card__content')
            if content is not None:
                card_data['description'] = _clean_text(content.find('p'))

                web_wrap = content.find('p', {'class': 'profile-card_web-link-wrap'})
                web_link = web_wrap.find('a') if web_wrap is not None else None
                if web_link is not None:
                    card_data['company_url'] = web_link.get('href')

                card_data['brands'] = _clean_text(content.find('p', {'class': 'profile-card__brands__body'}))
        except Exception:
            # print_exc() returns None, so it must not be passed to print().
            print("Error encountered while extraction of data")
            print_exc()

        collected_data.append(card_data)

df = pd.DataFrame(collected_data)
print(df.shape)
df.tail()

100%|██████████| 31/31 [02:18<00:00,  4.47s/it](772, 13)



Unnamed: 0,company_id,company_name,company_type,annual_revenue,year_founded,num_employees,location,company_url,brands,description,url,telephone,searchterm
767,10100771,Motion Hydraulics,"Manufacturer*, Service Company",,1974.0,,"Burlington, ON",,,,https://www.thomasnet.com/profile/10100771/mot...,tel://905-335-1171,hydraulic cylinders
768,30701202,Gaspe Machine Works Inc.,Manufacturer*,Under $1 Mil,1984.0,1-9,"Gaspe, QC",,,,https://www.thomasnet.com/profile/30701202/gas...,tel://418-368-6574,hydraulic cylinders
769,30164910,"T-Mac Cylinders, Inc",Manufacturer*,Under $1 Mil,1996.0,1-9,"Roscoe, IL",,,Manufacturer Of Hydraulic & Pneumatic Cylinder...,https://www.thomasnet.com/profile/30164910/tma...,tel://815-877-7090,hydraulic cylinders
770,30162771,Hydra Kinetics,Manufacturers' Rep*,Under $1 Mil,,1-9,"Heidenheimer, TX",,,"Mfr's. Rep. Of Cutom Made Hydraulic Cylinders,...",https://www.thomasnet.com/profile/30162771/hyd...,tel://254-983-1067,hydraulic cylinders
771,180836,"Production Engineering, Inc.","Custom Manufacturer*, Manufacturer, Service Co...",,1974.0,50-99,"Jackson, MI",,,Manufacturer of precision flexible machines & ...,https://www.thomasnet.com/profile/00180836/pro...,tel://517-788-6800,hydraulic cylinders


In [12]:
# Save the full listing table, then a slim company_id/url lookup file.
df.to_csv(
    'data/hydraulic_cylinders_suppliers_metadata.csv',
    index=False,
)
df[['company_id', 'url']].to_csv(
    'data/suppliers_url.csv',
    index=False,
)

# Meta-Harvester

In [5]:
# Forward slashes resolve on every OS. The earlier backslash-separated path
# only worked on Windows and relied on invalid string escapes (\d, \h), which
# newer Python versions warn about.
meta = pd.read_csv('../data/hydraulic_cylinders/hydraulic_cylinders_suppliers_metadata.csv')
meta.tail()

Unnamed: 0,company_id,company_name,company_type,annual_revenue,year_founded,num_employees,location,company_url,brands,description,url,telephone,searchterm
767,10100771,Motion Hydraulics,"Manufacturer*, Service Company",,1974.0,,"Burlington, ON",,,,https://www.thomasnet.com/profile/10100771/mot...,tel://905-335-1171,hydraulic cylinders
768,30701202,Gaspe Machine Works Inc.,Manufacturer*,Under $1 Mil,1984.0,1-9,"Gaspe, QC",,,,https://www.thomasnet.com/profile/30701202/gas...,tel://418-368-6574,hydraulic cylinders
769,30164910,"T-Mac Cylinders, Inc",Manufacturer*,Under $1 Mil,1996.0,1-9,"Roscoe, IL",,,Manufacturer Of Hydraulic & Pneumatic Cylinder...,https://www.thomasnet.com/profile/30164910/tma...,tel://815-877-7090,hydraulic cylinders
770,30162771,Hydra Kinetics,Manufacturers' Rep*,Under $1 Mil,,1-9,"Heidenheimer, TX",,,"Mfr's. Rep. Of Cutom Made Hydraulic Cylinders,...",https://www.thomasnet.com/profile/30162771/hyd...,tel://254-983-1067,hydraulic cylinders
771,180836,"Production Engineering, Inc.","Custom Manufacturer*, Manufacturer, Service Co...",,1974.0,50-99,"Jackson, MI",,,Manufacturer of precision flexible machines & ...,https://www.thomasnet.com/profile/00180836/pro...,tel://517-788-6800,hydraulic cylinders


In [7]:
# Show each column's dtype and non-null count for the loaded supplier metadata.
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 772 entries, 0 to 771
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   company_id      772 non-null    int64  
 1   company_name    772 non-null    object 
 2   company_type    772 non-null    object 
 3   annual_revenue  552 non-null    object 
 4   year_founded    645 non-null    float64
 5   num_employees   659 non-null    object 
 6   location        771 non-null    object 
 7   company_url     61 non-null     object 
 8   brands          138 non-null    object 
 9   description     683 non-null    object 
 10  url             772 non-null    object 
 11  telephone       768 non-null    object 
 12  searchterm      772 non-null    object 
dtypes: float64(1), int64(1), object(11)
memory usage: 78.5+ KB


In [8]:
# Keep only the supplier identifier and its Thomasnet link.
ref_urls = meta[['company_id', 'url']]
ref_urls.head()

Unnamed: 0,company_id,url
0,1315677,https://www.thomasnet.com/profile/01315677/wor...
1,10055126,https://www.thomasnet.com/profile/10055126/ram...
2,1010077,https://www.thomasnet.com/profile/01010077/flu...
3,30082354,https://www.thomasnet.com/profile/30082354/ari...
4,1150392,https://www.thomasnet.com/branchloc.html?cid=1...


# Master Scraper

In [10]:
# Convert the two-column frame into a plain list of [company_id, url] rows.
refs = ref_urls.values.tolist()

In [11]:
collected_data = []

# NOTE: only the first 50 suppliers are scraped here; widen the slice for a full run.
for company_id, profile_url in tqdm(refs[:50]):
    # Every field starts blank; list-valued fields are replaced by lists when found.
    page_data = {
        'company_id': company_id,
        'company_name': '',
        'company_type': '',
        'company_url': '',
        'location': '',
        'telephone': '',
        'product_description': '',
        'about_company': '',
        'product_types': '',
        'all_product_services': '',
        'link_prod_services': '',
        'brands': '',
        'link_all_brands': '',
        'primary_company_type': '',
        'additional_activities': '',
        'key_personnel': '',
        'social_media': '',
        'other_locations': '',
        "annual_revenue": "",
        "year_founded": "",
        "num_employees": "",
        'url': profile_url
    }
    try:
        # Send the browser-like headers and bound the wait. A network or HTTP
        # error is handled like any other failure for this supplier: it is
        # logged below and the supplier is skipped.
        response = requests.get(profile_url, headers=headers, timeout=30)
        response.raise_for_status()
        page = response.text
        soup = BeautifulSoup(page, 'lxml')

        ################# General Information ####################
        gen_info = soup.find('div', {'class': 'copro_naft'})
        codetail = gen_info.find('div', {'class': 'codetail'})
        title_link = codetail.find('h1').find('a')
        page_data["company_name"] = title_link.text.strip()
        page_data["company_url"] = title_link.get('href')
        # NOTE(review): the positional span index looks brittle. Some rows in the
        # saved output carry 'PathCOVID-19 Response' as company_type, which
        # suggests an extra badge span shifts the index. Confirm against the HTML.
        page_data["company_type"] = codetail.find('p').find_all('span')[2].text.strip()
        page_data["location"] = gen_info.find('p', {'class': 'addrline'}).text.split('|')[0].strip()
        page_data["telephone"] = gen_info.find('a', {'data-conversion_action': 'Call'}).get('href')

        ################# Business Description ####################
        business_desc = soup.find('div', {'id': 'copro_description'})
        page_data['product_description'] = business_desc.find('div', {'id': 'copro_pdm'}).text.strip()
        page_data['about_company'] = business_desc.find('div', {'id': 'copro_about'}).text.strip()

        ################# Products and Services/Brands ####################
        # Both sub-sections are optional. A missing tag (AttributeError), a
        # missing group (IndexError) or a link without href (TypeError on the
        # string concatenation) leaves the remaining fields of that sub-section blank.
        prod_serv = soup.find('div', {'id': 'copro_prodserv'})
        try:
            prod_cats = prod_serv.find('div', {'id': 'copro_prodserv_cats'})
            groups = prod_cats.find_all('div', {'class': 'prodserv_group'})
            prod_types = groups[0]
            page_data['product_types'] = [li.text.strip() for li in prod_types.find_all('li')]
            all_prodserv = groups[1]
            page_data['all_product_services'] = [li.text.strip() for li in all_prodserv.find_all('li')]
            page_data['link_prod_services'] = "https://www.thomasnet.com" + all_prodserv.find('a').get('href')
        except (AttributeError, IndexError, TypeError):
            pass

        try:
            prod_brands = prod_serv.find('div', {'id': 'copro_prodserv_brands'})
            brands = prod_brands.find('div', {'class': 'prodserv_group'})
            page_data['brands'] = [li.text.strip() for li in brands.find_all('li')]
            page_data['link_all_brands'] = "https://www.thomasnet.com" + brands.find('a').get('href')
        except (AttributeError, IndexError, TypeError):
            pass

        ################# Business Details ####################
        bus_det = soup.find('div', {'id': 'copro_bizdetails'})

        # Column 1: company type, activities, personnel, locations, social links.
        col1 = bus_det.find('div', {'class': 'bdcol1'})
        for div in col1.find_all('div', {'class': 'bizdetail'}):
            label_tag = div.find('div', {'class': 'label'})
            if label_tag is None:
                # Same guard as column 2: an unlabeled block is skipped.
                continue
            label = label_tag.text.strip()
            if label == "Primary Company Type:":
                page_data['primary_company_type'] = div.find('li').text.strip()
            elif label == "Additional Activities:":
                page_data['additional_activities'] = div.find('li').text.strip()
            elif label == "Key Personnel:":
                page_data['key_personnel'] = [li.text.strip() for li in div.find_all('li')]
            elif label == "Locations:":
                page_data['other_locations'] = "https://www.thomasnet.com" + div.find('a').get('href')
            elif label == "Social:":
                page_data['social_media'] = [a.get('href') for a in div.find_all('a')]

        # Column 2: annual sales, headcount, founding year.
        # TODO: the 'certblock' divs in this column (Diverse / Small Bus. Status,
        # Quality Certifications, Registrations) are not extracted yet.
        col2 = bus_det.find('div', {'class': 'bdcol2'})
        for div in col2.find_all('div', {'class': 'bizdetail'}):
            label_tag = div.find('div', {'class': 'label'})
            if label_tag is None:
                continue
            label = label_tag.text.strip()
            if label == "Annual Sales:":
                page_data['annual_revenue'] = div.find('li').text.strip()
            elif label == "No of Employees:":
                page_data['num_employees'] = div.find('li').text.strip()
            elif label == "Year Founded:":
                page_data['year_founded'] = div.find('li').text.strip()

        ################# Consolidation of Data ####################
        # Only fully parsed pages are kept; any failure above drops the record.
        collected_data.append(page_data)
    except Exception:
        # print_exc() returns None, so it is called on its own rather than
        # interpolated into the message.
        print(f'\nError encountered while scraping {profile_url}')
        print_exc()

master_df = pd.DataFrame(collected_data)

100%|██████████| 50/50 [02:43<00:00,  3.27s/it]


In [14]:
# Write the scraped profile data to disk, then preview the last rows.
master_df.to_csv(
    'data/hydraulic_cylinder_master_data.csv',
    index=False,
)
master_df.tail(5)

Unnamed: 0,company_id,company_name,company_type,company_url,location,telephone,product_description,about_company,product_types,all_product_services,...,link_all_brands,primary_company_type,additional_activities,key_personnel,social_media,other_locations,annual_revenue,year_founded,num_employees,url
45,10084727,MISUMI USA,PathCOVID-19 Response,https://us.misumi-ec.com/?mvisits=thomas,"Schaumburg, IL 60173",tel://866-984-9864,Custom manufacturer of cylinders including hyd...,Company Description by Thomasnet\nCustom manuf...,"[Cylinders: Hydraulic, Cylinders, Cylinders: A...","[Actuators, Aluminum, Belts, Extrusion Service...",...,,Manufacturer,"Distributor, Custom Manufacturer, Finishing Se...",[Not Available],[https://www.linkedin.com/company/157102?trk=v...,https://www.thomasnet.com/branches.html?what=h...,$250 Mil. and over,1963,500-999,https://www.thomasnet.com/branchloc.html?cid=1...
46,10056878,Marsh Bellofram & Marshalltown Dist. - M & M C...,Distributor,https://www.mmcontrol.com/marsh.php,"Lake Villa, IL 60046",tel://877-820-4900,"A complete selection of pressure gauges, needl...",Company Description by Thomasnet\nA complete s...,"[Cylinders: Hydraulic, Cylinders, Cylinders: A...","[Gages, Actuators, Controls and Controllers, C...",...,,Distributor,"Manufacturer, Custom Manufacturer, Service Com...",[Not Available],[https://www.linkedin.com/company/591670?trk=t...,,$5 - 9.9 Mil,1984,10-49,https://www.thomasnet.com/profile/10056878/mar...
47,10031196,"TOX Pressotechnik, LLC",PathCOVID-19 Response,https://us.tox-pressotechnik.com/,"Warrenville, IL 60555",tel://855-790-6004,Manufacturer of hydro-pneumatic cylinders. Ava...,Company Description by Thomasnet\nManufacturer...,"[Cylinders: Hydraulic, Cylinders, Cylinders: A...","[Presses, Joiner Systems, Assembly Machinery, ...",...,,Manufacturer,Custom Manufacturer,"[Stefan Gnade, Vice President of Engineering, ...",[https://www.linkedin.com/company/tox-pressote...,,$25 - 49.9 Mil,1985,50-99,https://www.thomasnet.com/profile/10031196/tox...
48,20023170,Metro Hydraulic Jack Co.,Distributor,https://www.metrohydraulic.com/,"Newark, NJ 07104",tel://800-633-8234,Distributor of air & hydraulic cylinders inclu...,Company Description by Thomasnet\nDistributor ...,"[Cylinders: Hydraulic, Cylinders, Cylinders: A...","[Hydraulic Equipment, Power Units, Pumps, Actu...",...,,Distributor,Custom Manufacturer,[Not Available],[http://www.metrohydraulic.com/blog/],,Not Available,Not Available,Not Available,https://www.thomasnet.com/profile/20023170/met...
49,1222066,M & M Control,Distributor,https://www.mmcontrol.com/,"Lake Villa, IL 60046",tel://877-820-4900,Master distributor of a wide variety of electr...,Company Description by Thomasnet\nMaster distr...,"[Cylinders: Hydraulic, Cylinders, Cylinders: A...","[Valves, Conditioners, Heaters, HVAC Equipment...",...,,Distributor,"Custom Manufacturer, Service Company",[Not Available],[https://www.linkedin.com/company/591670?trk=t...,,$5 - 9.9 Mil,1984,10-49,https://www.thomasnet.com/profile/01222066/m-m...
