# Web-scraping functions to look up company information by CNPJ

In [1]:
import os
import psycopg2

In [2]:
def GetCNPJFromDB(conn):
    """Return every distinct ``cnpj`` value stored in ``regulartrips``.

    Args:
        conn: An open database connection (psycopg2 in this notebook).

    Returns:
        The rows returned by ``fetchall()``; each row holds a single
        ``cnpj`` column.
    """
    query = """
        SELECT 
            DISTINCT(cnpj)
        FROM 
            regulartrips
        """

    # The context manager closes the cursor once the rows have been read;
    # previously it stayed open for the lifetime of the connection.
    with conn.cursor() as cur:
        cur.execute(query)
        return cur.fetchall()

In [3]:
# Open a short-lived connection just to read the list of CNPJs.
conn = psycopg2.connect(
    host='localhost',
    port=5432,
    dbname='ANTT',
    user=os.environ["PGDBUSER"],
    password=os.environ["PGDBPASSWORD"],
)
try:
    cnpjs = GetCNPJFromDB(conn)
finally:
    # Release the connection even if the query raises.
    conn.close()

## Create pickle file for checkpoint

In [49]:
import pickle

pickleFilePath = "../data/processedCNPJS.pickle"

# Seed the checkpoint on the first run so that later loads always find a set.
checkpointMissing = not os.path.exists(pickleFilePath)
if checkpointMissing:
    with open(pickleFilePath, 'wb') as checkpointFile:
        pickle.dump({'placeholder'}, checkpointFile)

## Set socket timeout to avoid [hanging connection](https://github.com/psf/requests/issues/3353)

In [20]:
import socket

# Seconds a blocking socket operation may wait before raising a timeout.
# Passing None (the previous value) means "no timeout", which cannot protect
# against the hanging connections this cell is meant to avoid.
socketTimeoutSeconds = 60
socket.setdefaulttimeout(socketTimeoutSeconds)

## Define user agents to avoid automation detection

In [7]:
import random
import itertools

# Desktop browser User-Agent strings rotated between requests.
# Every entry carries the complete "Safari/537.36" token (and the complete
# "Edg/134.0.0.0" token for Edge). The previous values were cut short
# ("Safari/537.3", "Edg/134.0.0."), which does not match the format sent by
# real Chrome/Edge builds and makes the requests easier to single out.
userAgents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Trailer/93.3.8652.5",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36 Edg/134.0.0.0",
]

def GetUserAgentRotator(userAgents):
    """Return an endless iterator over ``userAgents`` in random order.

    The agents are shuffled once; that order then repeats forever.

    Args:
        userAgents: Iterable of User-Agent strings. It is left unmodified.

    Returns:
        An ``itertools.cycle`` over a shuffled copy of ``userAgents``.
    """
    # Shuffle a private copy: random.shuffle works in place and would
    # otherwise reorder the caller's list as a hidden side effect.
    shuffledAgents = list(userAgents)
    random.shuffle(shuffledAgents)
    return itertools.cycle(shuffledAgents)

## Initialize variables for scraping

In [47]:
def GetProcessedCNPJs(filePath):
    """Load the checkpoint of already-processed CNPJs from a pickle file.

    Args:
        filePath: Path of the pickle file to read.

    Returns:
        The object stored in the file (this notebook stores a ``set`` of
        CNPJ strings).

    Raises:
        OSError: If the file cannot be opened, e.g. it does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code while deserializing; only
    # point this at checkpoint files written by this notebook.
    with open(filePath, 'rb') as checkpointFile:
        return pickle.load(checkpointFile)

In [59]:
# Site root; each company page is requested at baseUrl + <cnpj>.
baseUrl = "https://cnpj.biz/"
# Class name of the page element whose text is read as the company name.
className = "post-title.empresa-title"
# (cnpj, companyName) tuples scraped since the last checkpoint.
result = []
# CNPJs already recorded in the checkpoint file.
processedCNPJs = GetProcessedCNPJs(pickleFilePath)

In [60]:
# Show the checkpoint contents that were just loaded.
print(processedCNPJs)

{'02026255000159', '28812022000175', '01016179000138', '91873372000188', '20526371000119', '18538045000180', '01543354000145', '08374919000157', '07175375000131', '07549414000113', '03932339000114', '27177468000102', '88446869000105', '28789982000161', '27175975000107', '04693576000132', '32404063000108', '20146015000170', '03456707000103', '06789401000159', '01745523000120', '04801028000189', '03641223000126', '04229706000180', '05921606000183', '76544501000109', '32285454000142', '01751730000197', '78586674000107', '12191409000111', '18449504000159', '23338155000138', '80544885000129', '98593668000194', '07811161000104', '19350180000160', '41379983000104', '92667948000113', '06044464000186', '30910717000131', '01502456000112', '17063703000161', '11884579000119', '12766454000157', '07241838000116', '21642756000104', '59163162000193', '82647884000135', '79111779000172', '01016989003290', '15474486000177', '16345282000107', '01718370000121', '12423586000186', '19125863000113', '13406285

In [61]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Endless, shuffled iterator of User-Agent strings shared by the whole run.
userAgentsCycle = GetUserAgentRotator(userAgents)

options = uc.ChromeOptions()
options.headless = True
# NOTE(review): Selenium documents a `timeouts` (plural) option; confirm that
# this `timeout` attribute has any effect with the installed versions.
options.timeout = { 'script': 1800 }
# User agent used at browser launch; the scraping loop overrides it per request.
options.add_argument(f"--user-agent={next(userAgentsCycle)}")

driver = uc.Chrome(options=options)

## Save names to DB

In [38]:
def SaveBatchToDB(conn, tableName, data):
    """Insert a batch of two-column rows into ``tableName`` and commit.

    If the insert fails, the transaction is rolled back so the connection
    stays usable, and the original exception is re-raised.

    Args:
        conn: Open database connection.
        tableName: Target table. It is interpolated into the SQL text as-is,
            so only pass trusted, hard-coded names.
        data: Sequence of 2-item rows, one per ``VALUES (%s, %s)`` insert.
    """
    query = f"""
        INSERT INTO
            {tableName}
        VALUES (%s, %s)
        """
    try:
        # The context manager closes the cursor whether or not the insert works.
        with conn.cursor() as cur:
            cur.executemany(query, data)
    except Exception:
        # Leave no aborted transaction behind for later statements.
        conn.rollback()
        raise
    conn.commit()

def SaveCNPJsToPickle(filePath, cnpjs):
    """Write ``cnpjs`` to ``filePath`` as a pickle, replacing any old file.

    The data is written to a temporary sibling file first and then moved
    into place, so an interruption mid-write cannot leave a truncated
    checkpoint behind.

    Args:
        filePath: Destination path of the pickle file.
        cnpjs: Object to serialize (this notebook passes a ``set`` of CNPJs).
    """
    tempPath = f"{filePath}.tmp"
    with open(tempPath, 'wb') as p:
        pickle.dump(cnpjs, p)
    # os.replace swaps the new file in, overwriting the previous checkpoint.
    os.replace(tempPath, filePath)

## Scrape for each CNPJ

In [62]:
import time

conn = psycopg2.connect(host='localhost', port=5432, dbname='ANTT', user=os.environ["PGDBUSER"], password=os.environ["PGDBPASSWORD"])

count = 0
checkpointSize = 10
processedCNPJs = GetProcessedCNPJs(pickleFilePath)


def _SaveCheckpoint(conn, batch, processed):
    """Store ``batch`` in the DB, mark its CNPJs as processed, then empty it."""
    SaveBatchToDB(conn, 'companyinfo', batch)
    processed.update(key for key, _ in batch)
    SaveCNPJsToPickle(pickleFilePath, processed)
    batch.clear()


try:
    for row in cnpjs:
        cnpj = row[0]
        if cnpj in processedCNPJs:
            continue

        finalUrl = baseUrl + cnpj
        count += 1

        try:
            # Present a different user agent on every request.
            driver.execute_cdp_cmd('Network.setUserAgentOverride', {
                "userAgent": next(userAgentsCycle)
            })
            driver.get(finalUrl)
            element = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, className))
            )

            companyName = element.text
            result.append((cnpj, companyName))
            print(f"Successfully fetched name from {cnpj}: {companyName}")
        except Exception as e:
            # Best effort: report and move on. A failed CNPJ is not added to
            # the checkpoint, so a later run will try it again.
            print(e)
            print(f"Failed to fetch company name from cnpj: {cnpj}, {finalUrl}")

        if count % checkpointSize == 0:
            _SaveCheckpoint(conn, result, processedCNPJs)
            print(f"CNPJs processed: {count*100/len(cnpjs)}%")

        # Pause between requests.
        time.sleep(1)

    # Flush whatever was scraped after the last full checkpoint.
    _SaveCheckpoint(conn, result, processedCNPJs)
finally:
    # Always release the DB connection and the browser, even on errors.
    # quit() ends the whole driver session; close() only closed the window.
    try:
        conn.close()
    finally:
        driver.quit()

## Remove the CNPJ suffix from company names

In [31]:
import os
import psycopg2

# Reconnect for the name clean-up step; the cells below reuse this cursor.
conn = psycopg2.connect(
    host='localhost',
    port=5432,
    dbname='ANTT',
    user=os.environ["PGDBUSER"],
    password=os.environ["PGDBPASSWORD"],
)
cur = conn.cursor()

In [32]:
# Fetch every (cnpj, name) pair currently stored in companyinfo.
query = """
    SELECT
        cnpj, name
    FROM
        companyinfo
"""
cur.execute(query)
# Rows come back in SELECT column order: (cnpj, name).
result = cur.fetchall()

In [33]:
def PreprocessName(name):
    """Strip the trailing suffix from a scraped company name.

    The last 19 characters are always removed. If what remains ends with
    ``-``, that dash and the character before it are removed as well.

    NOTE(review): 19 presumably matches a suffix such as
    `` 12.345.678/0001-90`` -- confirm against the scraped titles. The cut is
    unconditional, so calling this on an already-cleaned name shortens it
    further.

    Args:
        name: Raw name string as scraped.

    Returns:
        The shortened name; an empty string when ``name`` has 19 or fewer
        characters.
    """
    suffixLength = 19
    name = name[:-suffixLength]

    # endswith is safe on an empty string, whereas name[-1] raised
    # IndexError for inputs of 19 characters or fewer.
    if name.endswith('-'):
        name = name[:-2]

    return name

In [28]:
# One {'cnpj', 'name'} dict per row, with the name already cleaned up.
processedNames = [
    {'cnpj': cnpj, 'name': PreprocessName(rawName)}
    for cnpj, rawName in result
]

In [35]:
# The statement is the same for every row, so build it once outside the loop.
query = """
        UPDATE
            companyinfo
        SET 
            name = (%(name)s)
        WHERE
            cnpj = (%(cnpj)s);
    """

try:
    # Each dict supplies the named parameters %(name)s and %(cnpj)s.
    for entry in processedNames:
        cur.execute(query, entry)
    conn.commit()
finally:
    # Close even if an UPDATE fails; uncommitted changes are then discarded.
    conn.close()