# Imports

In [1]:
from buspkg.ProcessData import GetPickleCheckpoint, SavePickleCheckpoint, StripCNPJFromName
from buspkg.WebDriver import WebDriver
from buspkg import DBFunctions as db

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Connect to DB

In [2]:
# Open the project's default database connection and a cursor on it.
# Both objects are reused by the cells below; the connection is closed in
# the final cell of the notebook.
conn = db.GetDefaultConnection()
cur = conn.cursor()

## Retrieve all CNPJs from the database

Raw data from ANTT only contains the CNPJ of each company. For future analysis, we will use the companyinfo table to store various information about registered companies. For now, this notebook is responsible for web scraping the name of each company based on its CNPJ, so that visualizations are cleaner.

In [3]:
# Collect every distinct CNPJ that appears in the RegularTrips table.
query = "SELECT DISTINCT(cnpj) FROM RegularTrips"
db.ExecuteQuery(query, cur, conn)
# Each fetched row is later indexed with [0] to obtain the CNPJ value itself.
cnpjs = cur.fetchall()

## Set socket timeout to avoid [hanging connection](https://github.com/psf/requests/issues/3353)

There is still a little bit of confusion regarding what is causing hanging connections in my implementations, but this works for now.

In [4]:
import socket

# Give every newly created socket a finite timeout so that a stalled
# connection raises an error instead of blocking forever.
# The previous value, None, means "no timeout", which is already Python's
# default, so the call did nothing to prevent hanging connections.
SOCKET_TIMEOUT_SECONDS = 30
socket.setdefaulttimeout(SOCKET_TIMEOUT_SECONDS)

## Web Scrape for company names based on their CNPJ

For each CNPJ registered in the database, we will scrape the company name from [this website](https://cnpj.biz). When searching for CNPJs, company names are in a text field with the class `post-title empresa-title`. As in other notebooks, we use a pickle file as a checkpoint.

In [5]:
# Pickle file holding the CNPJs already handled, used as a checkpoint.
pickleFilePath = "../data/checkpoints/ProcessedCNPJsCheckpoint.pickle"
# The CNPJ is appended directly to this URL to build each lookup page.
baseUrl = "https://cnpj.biz/"
# The two classes of the company-name element, joined with a dot; this
# string is passed to By.CLASS_NAME when locating the element.
className = "post-title.empresa-title"

# Destination table for the (cnpj, company name) rows.
tableName = "CompanyInfo"
# Project-provided browser wrapper used to load the lookup pages.
webdriver = WebDriver()

In [6]:
# Resume from the previous run: CNPJs found in the checkpoint are skipped.
processedCNPJs = GetPickleCheckpoint(pickleFilePath)
if not processedCNPJs:
    processedCNPJs = set()

# The checkpoint is written in a `finally` clause so that progress made during
# this run is kept even when the loop is interrupted (e.g. KeyboardInterrupt)
# or aborted by an error raised outside the per-CNPJ handler below.
try:
    for row in cnpjs:
        # fetchall() yields one row per CNPJ; the value is its first column.
        cnpj = row[0]
        if cnpj in processedCNPJs:
            continue

        finalUrl = baseUrl + cnpj

        try:
            webdriver.get(finalUrl)
            # Wait up to 5 seconds for the company-name element to show up.
            element = WebDriverWait(webdriver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, className))
            )

            companyName = StripCNPJFromName(element.text)
            db.WriteBatchToDB([(cnpj, companyName)], tableName, cur, conn)
        except Exception as e:
            # Deliberate best effort: report the failure and move on. The CNPJ
            # is not marked as processed, so a later run will retry it.
            print(e)
            print(f"Failed to fetch company name from cnpj: {cnpj}, {finalUrl}")
        else:
            # Only mark the CNPJ as done once it was scraped and stored.
            processedCNPJs.add(cnpj)
finally:
    SavePickleCheckpoint(pickleFilePath, processedCNPJs)

In [7]:
# Release the database connection opened at the top of the notebook.
# NOTE(review): the cursor and the WebDriver instance are not explicitly
# released here; confirm whether they need their own cleanup.
conn.close()