# In [1]:
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent
#from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
import time
import requests  # For making HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML content
from fake_useragent import UserAgent  # For generating random user agents
import pandas as pd  # For data manipulation and creating DataFrames
import numpy as np
import re
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

# Load the company list, order it by ID, and render each ID as a
# zero-padded 9-character string (the form used in the lookup URL).
companies = (
    pd.read_csv('all_companies.csv')
    .sort_values(by='ID Number')
    .assign(**{'ID Number': lambda frame: frame['ID Number'].astype(str).str.zfill(9)})
)
companies

# One accumulator list per scraped field; get_more_details appends one value
# to each of them for every company page it parses. The generator yields a
# fresh, independent list for every name on the left-hand side.
(
    exact_names,
    fictitious_names,
    entity_types,
    identification_numbers,
    date_of_incorporations,
    effective_dates,
    loc_principal_offices,
    agent_resigned,
    address_maintained,
    name_address_reg_agents,
    off_dir_corps,
    name_business_add_GPs,
    LLC_managers,
    stocks,
    purposes,
    NIACS,
    business_entities,
) = ([] for _ in range(17))

# Function to scrape data
import threading

# A record is spread across 17 parallel module-level lists. Worker threads
# must append a whole record atomically, otherwise two threads can interleave
# their appends and the columns of different companies get mixed in one row.
_RESULTS_LOCK = threading.Lock()


def get_more_details(id_num):
    """Fetch one company's summary page and append its fields to the result lists.

    The page is requested from the RI SOS CorpSummary endpoint using ``id_num``
    as the ``FEIN`` query parameter. Every field is extracted first; the
    complete record is then appended to the module-level lists
    (``exact_names`` ... ``business_entities``) under a lock so that index
    ``i`` of every list refers to the same company even when this function
    runs in several threads.

    Args:
        id_num: Company ID as it should appear in the URL.

    Returns:
        None. Nothing is appended when the request fails or the response
        status is not 200.
    """
    url = f'https://business.sos.ri.gov/CorpWeb/CorpSearch/CorpSummary.aspx?FEIN={id_num}&SEARCH_TYPE=3'
    print(id_num)
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Without a timeout a stalled connection would block a worker forever.
        page = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Exceptions raised inside executor.map workers are never surfaced
        # unless the results are consumed, so report and skip explicitly.
        print(f'{id_num}: request failed ({exc})')
        return

    if page.status_code != 200:
        return

    soup = BeautifulSoup(page.content, "html.parser")

    def get_text_or_none(tag, attrs):
        """Return the text of the first matching element, or the string 'None'."""
        element = soup.find(tag, attrs=attrs)
        return element.text if element else 'None'

    def squash(text):
        """Collapse every run of whitespace into a single space."""
        return re.sub(r'\s+', ' ', text)

    # An <input> element has no text content; its data lives in the
    # ``value`` attribute, so read that instead of ``.text``.
    niacs_input = soup.find('input', attrs={'id': "MainContent_txtNIACS", "value": True})
    niacs_value = niacs_input['value'] if niacs_input else 'None'

    # Values are listed in the same order as ``columns`` below.
    record = (
        get_text_or_none('span', {'id': "MainContent_lblEntityName"}),
        # NOTE(review): 'he fictitious name of: ' looks like a truncated
        # 'The ...' and may leave a stray 'T' behind; kept as-is, confirm
        # against the live page before changing.
        squash(
            get_text_or_none('table', {'id': "MainContent_tblFictitiousName"})
            .replace('he fictitious name of: ', '')
        ),
        get_text_or_none('span', {'id': "MainContent_lblEntityType"}),
        get_text_or_none('span', {'id': "MainContent_lblIDNumber"})
        .replace('Identification Number: ', ''),
        get_text_or_none('span', {'id': "MainContent_lblOrganisationDate"}),
        get_text_or_none('span', {'id': "MainContent_lblEffectiveDate"}),
        squash(
            get_text_or_none('table', {'id': "MainContent_tblPrincipleOffice"})
            .replace('\xa0', '')
            .replace('The location of the Principal Office:', '')
            .replace('Address:', '')
            .replace('City or Town, State, Zip, Country:', ',')
        ),
        get_text_or_none('span', {'id': "MainContent_lblResidentAgentFlag"}),
        get_text_or_none('span', {'id': "MainContent_lblConsentFlag"}),
        squash(
            get_text_or_none('table', {'id': "MainContent_tblResident"})
            .replace('Name:', '')
            .replace('The name and address of the Registered Agent:', '')
            .replace('Address:', ',')
            .replace('City or Town, State, Zip, Country:', ',')
            .replace('The name and address of the Resident Agent:', '')
        ),
        squash(
            get_text_or_none('table', {'id': "MainContent_tblOfficers"})
            .replace('TitleIndividual NameAddress', '')
            .replace('The Officers and Directors of the Corporation:', '')
            .replace('Address:', ',')
            .replace('City or Town, State, Zip, Country:', ',')
        ),
        squash(
            get_text_or_none('table', {'id': "MainContent_grdManagers"})
            .replace('TitleIndividual nameAddress', '')
        ),
        squash(
            get_text_or_none('table', {'id': "MainContent_tblManagers"})
            .replace('The limited liability company is to be managed by its ManagersThe name and business address of each Manager:', '')
            .replace('TitleIndividual nameAddress', '')
        ),
        squash(
            get_text_or_none('table', {'id': "MainContent_tblStocks"})
            .replace('The total number of shares and the par value, if any, of each class of stock which this business entity is authorized to issue:', '')
            .replace('Class of Stock Series Par value per share Total AuthorizedTotal issued and outstandingNo. of sharesNo. of shares', '')
        ),
        squash(
            get_text_or_none('table', {'id': "MainContent_tblComments"})
            .replace('Purpose:', '')
        ),
        niacs_value,
        squash(
            get_text_or_none('tr', {'id': "MainContent_tdFilingList"})
            .replace('\n', ',')
            .strip(',')
        ),
    )

    columns = (
        exact_names,
        fictitious_names,
        entity_types,
        identification_numbers,
        date_of_incorporations,
        effective_dates,
        loc_principal_offices,
        agent_resigned,
        address_maintained,
        name_address_reg_agents,
        off_dir_corps,
        name_business_add_GPs,
        LLC_managers,
        stocks,
        purposes,
        NIACS,
        business_entities,
    )

    # Append the whole record in one critical section so rows stay aligned.
    with _RESULTS_LOCK:
        for column, value in zip(columns, record):
            column.append(value)

# IDs to scrape


# Fan the lookups out over a pool of 10 worker threads; leaving the `with`
# block waits for every submitted task to finish. Results are collected via
# the side effects of get_more_details, so the returned futures are not kept.
with ThreadPoolExecutor(max_workers=10) as executor:
    ####change this index below like companies['ID Number'][150000:155000]
    for company_id in companies['ID Number'][80000:100000]:
        executor.submit(get_more_details, company_id)

# Create DataFrame
# Each (column label, scraped list) pair becomes one column, in this order.
extra_comp_data = dict((
    ("Exact Names", exact_names),
    ("Fictitious Names and Filed Date", fictitious_names),
    ("Entity Types", entity_types),
    ("Identification Numbers", identification_numbers),
    ("Date of Incorporations", date_of_incorporations),
    ("Effective Dates", effective_dates),
    ("Location of Principal Offices", loc_principal_offices),
    ("Agent Resigned", agent_resigned),
    ("Address Maintained", address_maintained),
    ("Name and Address of Registered Agents", name_address_reg_agents),
    ("Officers, Directors, and Corporations", off_dir_corps),
    ("Name and Business Address of General Partners", name_business_add_GPs),
    ("LLC Managers", LLC_managers),
    ("Stocks", stocks),
    ("Purposes", purposes),
    ("NIACS", NIACS),
    ("Business Entities", business_entities),
))

full_comp_df = pd.DataFrame(data=extra_comp_data)
####change to csv filename
full_comp_df.to_csv(path_or_buf='red80000.csv')


000103054
000103055
000103061
000103062
000103063
000103064
000103064
000103065
000103066
000103067
000103068
000103069
000103070
000103071
000103074
000103074
000103074
000103075
000103076
000103077
000103078
000103078
000103079
000103080
000103081
000103082
000103083
000103084
000103085
000103086
000103086
000103087
000103088
000103089
000103090
000103091
000103092
000103093
000103095
000103096
000103097
000103098
000103098
000103099
000103100
000103101
000103102
000103103
000103103
000103104
000103105
000103107
000103108
000103109
000103109
000103110
000103111
000103112000103113

000103114
000103115
000103116
000103118
000103119
000103120
000103122000103123

000103124
000103126
000103128
000103129
000103130
000103131
000103132
000103134
000103135
000103135
000103135
000103136
000103137
000103138000103139
000103140
000103141

000103142
000103143000103144

000103145
000103146
000103147000103148

000103148
000103148
000103150
000103150
000103150
000103150
000103151
000103151000103152

