In [6]:
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from fake_useragent import UserAgent
#from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
import time
import requests  # For making HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML content
from fake_useragent import UserAgent  # For generating random user agents
import pandas as pd  # For data manipulation and creating DataFrames
import numpy as np
import re
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

In [3]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
CHROMEDRIVER_PATH = r'C:\Users\nakki\Downloads\Compressed\b\chromedriver-win64\chromedriver.exe'
URL = 'https://business.sos.ri.gov/CorpWeb/CorpSearch/CorpSearch.aspx'
ENTITY_TYPES = ['Corp', 'LLC', 'INC', 'LLP', 'LP']

def setup_driver():
    service = Service(executable_path=CHROMEDRIVER_PATH)
    options = Options()
    ua = UserAgent()
    options.add_argument(f'user-agent={ua.random}')
    options.add_argument("--headless")
    options.add_argument("--window-size=1920,1080")
    return webdriver.Chrome(service=service, options=options)

def scrape_entities(entity_type, status):
    driver = setup_driver()
    try:
        driver.get(URL)
        
        # Select status (active or inactive)
        status_id = "MainContent_rdbActive" if status == "active" else "MainContent_rdbInactive"
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, status_id))).click()
        
        # Enter entity type and set search options
        text_field = driver.find_element(By.ID, "MainContent_txtEntityName")
        text_field.clear()
        text_field.send_keys(entity_type)
        
        Select(driver.find_element(By.ID, "MainContent_ddBeginsWithEntityName")).select_by_visible_text("Full text")
        Select(driver.find_element(By.ID, "MainContent_ddRecordsPerPage")).select_by_visible_text("All items")
        
        # Scroll and wait
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(10)
        
        # Parse the page
        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.find(id="MainContent_SearchControl_grdSearchResultsEntity")
        
        if not table:
            logging.warning(f"No data found for {entity_type} ({status})")
            return None
        
        headers = [th.text for th in table.find_all('th')]
        rows = []
        for row in table.find_all('tr')[1:]:
            rows.append([td.text for td in row.find_all('td')])
        
        df = pd.DataFrame(rows, columns=headers)
        df['status'] = status
        df['entity_type'] = entity_type
        
        df.to_csv(f"companies_{status}_{entity_type}.csv", index=False)
        logging.info(f"Successfully scraped {status} {entity_type}")
        return df
    except Exception as e:
        logging.error(f"Error scraping {status} {entity_type}: {str(e)}")
        return None
    finally:
        driver.quit()

def scrape_entity_type(entity_type):
    active_df = scrape_entities(entity_type, "active")
    inactive_df = scrape_entities(entity_type, "inactive")
    return pd.concat([active_df, inactive_df]) if active_df is not None and inactive_df is not None else None

if __name__ == "__main__":
    with ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(scrape_entity_type, ENTITY_TYPES))
    
    # Combine all results
    final_df = pd.concat([df for df in results if df is not None])
    final_df.to_csv("all_companies.csv", index=False)
    logging.info("Scraping completed. Data saved to all_companies.csv")

2024-06-28 19:46:49,857 - INFO - Successfully scraped active LLP
2024-06-28 19:46:52,224 - INFO - Successfully scraped active LP
2024-06-28 19:46:55,156 - INFO - Successfully scraped active Corp
2024-06-28 19:47:23,425 - INFO - Successfully scraped inactive LP
2024-06-28 19:47:29,494 - INFO - Successfully scraped inactive LLP
2024-06-28 19:48:05,259 - INFO - Successfully scraped inactive Corp
2024-06-28 20:03:33,989 - INFO - Successfully scraped active INC
2024-06-28 20:56:29,967 - INFO - Successfully scraped active LLC
2024-06-28 22:19:33,731 - INFO - Successfully scraped inactive INC
2024-06-28 22:27:32,161 - INFO - Successfully scraped inactive LLC
2024-06-28 22:27:40,302 - INFO - Scraping completed. Data saved to all_companies.csv


Unnamed: 0,Entity Name,ID Number,NAICS,Inactive status,Address,status,entity_type
0,1177 E 82 ST GROUP CORP,001761799,,Revocation Notice,"805 PINEWOOD DR SMITHFIELD, RI 02917 USA",active,Corp
1,141 Corp.,000089282,722511,,"260 NEWPORT AVENUE EAST PROVIDENCE, RI 02916 ...",active,Corp
2,141 Westminster Corp.,000151557,722513,,"383 SMITHFIELD AVENUE PAWTUCKET, RI 02860 USA",active,Corp
3,1537 Corp.,000005981,722511,,"1537 NEWPORT AVENUE PAWTUCKET, RI 02861 USA",active,Corp
4,1836 REALTY CORP.,000076285,447110,,"1850 WARWICK ACE WARWICK, RI 02889 USA",active,Corp
...,...,...,...,...,...,...,...
299180,"WM Associates, l.p.",000050043,,,"C/O JOHN ASSALONE 6 VERONICA COURTCOVENTRY, RI...",inactive,LP
299181,WOODS FAMILY L.P.,000148347,,Cancelled,"575 EAST MAIN ROAD/WYATT SQUARE MIDDLETOWN, RI...",inactive,LP
299182,"WORLDWIDE CORPORATE HOUSING, L.P.",000144654,,Cancelled,"222 JEFFERSON BOULEVARD, SUITE 200 WARWICK, RI...",inactive,LP
299183,XM Express Trucking LP,001723174,,Conversion,"45 LIBERTY ST. CENTRAL FALLS, RI 02863 USA",inactive,LP


# Scraping per identification number page for all companies scraped

### Date of incorporation not qualification

### The exact name: <span style="color: green;">✅</span>

### The fictitious name and filed date: <span style="color: green;">✅</span>

### Entity type: <span style="color: green;">✅</span>

### Identification Number: <span style="color: green;">✅</span>

### Date of Incorporation in Rhode Island: <span style="color: green;">✅</span>

### Effective Date: <span style="color: green;">✅</span>

### The location of the Principal Office: <span style="color: green;">✅</span>

### Agent Resigned: <span style="color: green;">✅</span>

### Address Maintained: <span style="color: green;">✅</span>

### The name and address of the Registered Agent: <span style="color: green;">✅</span>

### The Officers and Directors of the Corporation (if INC or Corp): <span style="color: green;">✅</span>

### The name and business address of each General Partner (if LP): <span style="color: green;">✅</span>

### The name and business address of each Manager (if LLC): <span style="color: green;">✅</span>

### The name and business address of each Partner who is authorized to execute, acknowledge, deliver, and record any recordable instrument purporting to affect any interest in real property: <span style="color: red;">❌</span> Not Available

### The total number of shares and the par value, if any, of each class of stock which this business entity is authorized to issue: <span style="color: green;">✅</span>

### Purpose: <span style="color: green;">✅</span>

### North American Industry Classification System Code(NAICS): <span style="color: green;">✅</span>

### View filings for this business entity: <span style="color: green;">✅</span>


In [None]:
companies = pd.read_csv('all_companies.csv')
companies['ID Number'] = companies['ID Number'].astype(str).str.zfill(9)
companies

In [12]:

exact_names = []
fictitious_names = []
entity_types = []
identification_numbers = []
date_of_incorporations = []
effective_dates = []
loc_principal_offices = []
agent_resigned = []
address_maintained = []
name_address_reg_agents = []
off_dir_corps = []
name_business_add_GPs = []
LLC_managers = []
stocks = []
purposes = []
NIACS = []
business_entities = []

# service = Service(executable_path=r'C:\Users\H i - G E O R G E\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe')
# options = Options()
# ua = UserAgent()
# userAgent = ua.random
# options.add_argument(f'user-agent={userAgent}')

# driver = webdriver.Chrome(service=service,options=options)
# driver.get(f'https://business.sos.ri.gov/CorpWeb/CorpSearch/CorpSummary.aspx?FEIN={id_num}&SEARCH_TYPE=3')

# time.sleep(2)
# page = driver.page_source


def get_more_details(id_num):
    
    url = f'https://business.sos.ri.gov/CorpWeb/CorpSearch/CorpSummary.aspx?FEIN={id_num}&SEARCH_TYPE=3'
    print(id_num)
    ua = UserAgent()
    userAgent = ua.random
    headers = {'User-Agent': userAgent}
    page = requests.get(url, headers = headers)
    
    soup = BeautifulSoup(page.content, "html.parser")
    
    #exact_name
    if soup.find('span',attrs={'id': "MainContent_lblEntityName"}) is not None:
        exact_name = soup.find('span',attrs={'id': "MainContent_lblEntityName"}).text
        exact_names.append(exact_name)
    else:
        exact_names.append('None')

    #The fictitious name and filed date
    if soup.find('table',attrs={'id': "MainContent_tblFictitiousName"}) is not None:
        fictitious_name = soup.find('table',attrs={'id': "MainContent_tblFictitiousName"}).text.replace('he fictitious name of: ','')
        fictitious_name = re.sub(r'\s+', ' ', fictitious_name)
        fictitious_names.append(fictitious_name)
    else:
        fictitious_names.append('None')
        
    #entity_type
    if soup.find('span',attrs={'id': "MainContent_lblEntityType"}) is not None:
        entity_type = soup.find('span',attrs={'id': "MainContent_lblEntityType"}).text
        entity_types.append(entity_type)
    else:
        entity_types.append('None')
    
    #identification number
    if soup.find('span',attrs={'id': "MainContent_lblIDNumber"}) is not None:
        identification_number = soup.find('span',attrs={'id': "MainContent_lblIDNumber"}).text.replace('Identification Number: ','')
        identification_numbers.append(identification_number)
    else:
        identification_numbers.append('None')

    #date of incorporation
    if soup.find('span',attrs={'id': "MainContent_lblOrganisationDate"}) is not None:
        date_of_incorporation = soup.find('span',attrs={'id': "MainContent_lblOrganisationDate"}).text
        date_of_incorporations.append(date_of_incorporation)
    else:
        date_of_incorporations.append('None')
    
    #effective date
    if soup.find('span',attrs={'id': "MainContent_lblEffectiveDate"}) is not None:
        effective_date = soup.find('span',attrs={'id': "MainContent_lblEffectiveDate"}).text
        effective_dates.append(effective_date)
    else:
        effective_dates.append('None')
    
    #location of principal office
    if soup.find('table',attrs={'id': "MainContent_tblPrincipleOffice"}) is not None:
        loc_principal_office = soup.find('table',attrs={'id': "MainContent_tblPrincipleOffice"}).text.replace('\xa0','').replace('\n','').replace('The location of the Principal Office:','').replace('Address:','').replace('City or Town, State, Zip, Country:',',')
        loc_principal_office = re.sub(r'\s+', ' ', loc_principal_office)
        loc_principal_offices.append(loc_principal_office)
    else:
        loc_principal_offices.append('None')


    #agent resigned
    if soup.find('span',attrs={'id': "MainContent_lblResidentAgentFlag"}) is not None:
        agent_resign = soup.find('span',attrs={'id': "MainContent_lblResidentAgentFlag"}).text
        agent_resigned.append(agent_resign)
    else:
        agent_resigned.append('None')
        
    
    #address maintained
    if soup.find('span',attrs={'id': "MainContent_lblConsentFlag"}) is not None:
        address_maintain = soup.find('span',attrs={'id': "MainContent_lblConsentFlag"}).text
        address_maintained.append(address_maintain)
    else:
        address_maintained.append('None')
    
    
    #The name and address of the Registered Agent
    if soup.find('table',attrs={'id': "MainContent_tblResident"}) is not None:
        name_address_reg_agent = soup.find('table',attrs={'id': "MainContent_tblResident"}).text.replace('Name:','').replace('\n','').replace('The name and address of the Registered Agent:','').replace('Address:',',').replace('City or Town, State, Zip, Country:',',').replace('The name and address of the Resident Agent:','')
        name_address_reg_agent = re.sub(r'\s+', ' ', name_address_reg_agent)
        name_address_reg_agents.append(name_address_reg_agent)
    else:
        name_address_reg_agents.append('None')
    
    
    #The Officers and Directors of the Corporation
    if soup.find('table',attrs={'id': "MainContent_tblOfficers"}) is not None:
        off_dir_corp = soup.find('table',attrs={'id': "MainContent_tblOfficers"}).text.replace('TitleIndividual NameAddress','').replace('\n','').replace('The Officers and Directors of the Corporation:','').replace('Address:',',').replace('City or Town, State, Zip, Country:',',')
        off_dir_corp = re.sub(r'\s+', ' ', off_dir_corp)
        off_dir_corps.append(off_dir_corp)
    else:
        off_dir_corps.append('None')
        
    #The name and business address of each General Partner
    if soup.find('table',attrs={'id': "MainContent_grdManagers"}) is not None:
        name_business_add_GP = soup.find('table',attrs={'id': "MainContent_grdManagers"}).text.replace('TitleIndividual nameAddress','').replace('\n','')
        name_business_add_GP = re.sub(r'\s+', ' ', name_business_add_GP)
        name_business_add_GPs.append(name_business_add_GP)
    else:
        name_business_add_GPs.append('None')



    #The limited liability company is to be managed by its Managers
    if soup.find('table',attrs={'id': "MainContent_tblManagers"}) is not None:
        LLC_manager = soup.find('table',attrs={'id': "MainContent_tblManagers"}).text.replace('The limited liability company is to be managed by its ManagersThe name and business address of each Manager:','').replace('\n','').replace('TitleIndividual nameAddress','').replace('The limited liability company is to be managed by its MembersThe name and business address of each Manager:','')
        LLC_manager = re.sub(r'\s+', ' ', LLC_manager)
        LLC_managers.append(LLC_manager)
    else:
        LLC_managers.append('None')
    
    
    
    #The total number of shares and the par value, if any, of each class of stock which this business entity is authorized to issue
    if soup.find('table',attrs={'id': "MainContent_tblStocks"}) is not None:
        stock = soup.find('table',attrs={'id': "MainContent_tblStocks"}).text.replace('The total number of shares and the par value, if any, of each class of stock which this business entity is authorized to issue:','').replace('\n','').replace('TitleIndividual nameAddress','').replace('Class of Stock Series Par value per share Total AuthorizedTotal issued and outstandingNo. of sharesNo. of shares','')
        stock = re.sub(r'\s+', ' ', stock)
        stocks.append(stock)
    else:
        stocks.append('None')
    
    
    #Purpose
    if soup.find('table',attrs={'id': "MainContent_tblComments"}) is not None:
        purpose = soup.find('table',attrs={'id': "MainContent_tblComments"}).text.replace('Purpose:','').replace('\n','')
        purpose = re.sub(r'\s+', ' ', purpose)
        purposes.append(purpose)
    else:
        purposes.append('None')



    #North American Industry Classification System Code(NAICS)
    if soup.find('input',attrs={'id': "MainContent_txtNIACS","value": True}) is not None:
        NIAC = soup.find('input',attrs={'id': "MainContent_txtNIACS","value": True})
        NIAC = NIAC["value"]
        NIACS.append(NIAC)
    else:
        NIACS.append('None')
    
    
    
    #View filings for this business entity
    if soup.find('tr',attrs={'id': "MainContent_tdFilingList"}) is not None:
        business_entity = soup.find('tr',attrs={'id': "MainContent_tdFilingList"}).text.replace('\n',',').strip(',')
        business_entity = re.sub(r'\s+', ' ', business_entity)
        business_entities.append(business_entity)
    else:
        business_entities.append('None')

for i in companies['ID Number'][1:5]:
    get_more_details(i)
 


extra_comp_data = {
    "Exact Names": exact_names,
    "Fictitious Names and Filed Date": fictitious_names,
    "Entity Types": entity_types,
    "Identification Numbers": identification_numbers,
    "Date of Incorporations": date_of_incorporations,
    "Effective Dates": effective_dates,
    "Location of Principal Offices": loc_principal_offices,
    "Agent Resigned": agent_resigned,
    "Address Maintained": address_maintained,
    "Name and Address of Registered Agents": name_address_reg_agents,
    "Officers, Directors, and Corporations": off_dir_corps,
    "Name and Business Address of General Partners": name_business_add_GPs,
    "LLC Managers": LLC_managers,
    "Stocks": stocks,
    "Purposes": purposes,
    "NIACS": NIACS,
    "Business Entities": business_entities
}

full_comp_df = pd.DataFrame(extra_comp_data)

full_comp_df

000089282
000151557
000005981
000076285


Unnamed: 0,Exact Names,Fictitious Names and Filed Date,Entity Types,Identification Numbers,Date of Incorporations,Effective Dates,Location of Principal Offices,Agent Resigned,Address Maintained,Name and Address of Registered Agents,"Officers, Directors, and Corporations",Name and Business Address of General Partners,LLC Managers,Stocks,Purposes,NIACS,Business Entities
0,141 Corp.,,Domestic Profit Corporation,89282,04-25-1996,04-25-1996,"260 NEWPORT AVENUE ,EAST PROVIDENCE, RI 02916 USA",N,Y,"MICHAEL A. MOSCO, ESQ.,33 RUXTON ST. ,CRANSTO...",PRESIDENTSTEPHEN RICARD 3 EMERALD LANE JOHNSTO...,,,Class of Stock Series Par value per share Tota...,REAL ESTATE OWNERSHIP. TITLE: 7-1.1-51,722511 Full-Service Restaurants,"ALL FILINGS,Annual Report,Annual Report - Amen..."
1,141 Westminster Corp.,,Domestic Profit Corporation,151557,11-02-2005,11-02-2005,"383 SMITHFIELD AVENUE ,PAWTUCKET, RI 02860 USA",N,Y,"BRUCE A. WOLPERT, ESQ.,235 PROMENADE STREET, ...",PRESIDENTDAVID A. PETROSINELLI 337 EAST 62ND S...,,,Class of Stock Series Par value per share Tota...,OPERATION OF A DUNKIN' DONUTS FRANCHISE TITLE...,722513 Limited-Service Restaurants,"ALL FILINGS,Annual Report,Annual Report - Amen..."
2,1537 Corp.,TSPUMONI'S RESTAURANT was filed on 09-21-1999,Domestic Profit Corporation,5981,04-24-1975,04-24-1975,"1537 NEWPORT AVENUE ,PAWTUCKET, RI 02861 USA",N,Y,"GEORGE A. PANAS,1537 NEWPORT AVENUE ,PAWTUCKE...","PRESIDENTGEORGE PANAS 165 FAIRWAY DR SEEKONK, ...",,,Class of Stock Series Par value per share Tota...,RESTAURANT TITLE: 7-1.1-51,722511 Full-Service Restaurants,"ALL FILINGS,Annual Report,Annual Report - Amen..."
3,1836 REALTY CORP.,,Domestic Profit Corporation,76285,03-31-1994,03-31-1994,"1850 WARWICK ACE ,WARWICK, RI 02889 USA",N,Y,"CHRISTIAN C. POTTER, ESQ.,1850 WARWICK AVENUE...",TREASURERSARA CAITLIN POTTER 39 BRIARWOOD DR. ...,,,Class of Stock Series Par value per share Tota...,"BUYING, SELLING, LEASING, HOLDING AND OTHERWI...",447110 Gasoline Stations with Convenience Stores,"ALL FILINGS,Annual Report,Annual Report - Amen..."


In [10]:
for i in companies['ID Number']:
    print(i)

1761799
89282
151557
5981
76285
76283
115782
115783
65408
487970
1731359
791798
75075
1698885
1737982
75278
1690061
10227
789962
88725
1761540
906409
1670100
1752136
163473
151555
1271
87492
1765705
522037
1740355
1688187
1767573
830351
118376
118376
1680383
1723685
1736763
95225
118376
118376
1763258
35416
90193
90981
118315
1685268
557420
241
1683530
1767754
293567
68312
682435
1700208
1730918
1730909
883001
567302
1772649
1775633
506278
516372
1685835
923522
1767988
45546
90193
131487
120298
550604
443186
88176
45508
45508
45508
61591
1761425
1682453
1751437
1762367
157003
114469
487606
558
304843
1750123
129830
41942
1745544
1718605
99804
100751
69056
1769757
546599
1757014
113902
571487
142338
1769833
602104
1756220
130275
710710
143176
123487
881
33501
797511
1671121
1756323
1715576
521216
1775546
654187
1038
869992
1743949
138048
1755605
103760
1725573
1660844
135905
135905
135905
1768343
1773236
1735329
47860
1736744
171835
1732903
1732903
1701152
1763244
1754818
1680717
799193