In [2]:
#This is where we import the necessary modules.
#If a package is missing, install it from your Anaconda terminal with: pip install <package name>
#time, random, re, json and warnings ship with Python, so they do not need to be installed.
import pandas as pd
from tqdm import tqdm
import requests 
from bs4 import BeautifulSoup as BS
import re
import json
import time
import random
import warnings
# Silences every warning for the rest of the session.
# NOTE(review): a blanket filter can also hide useful pandas warnings (e.g. SettingWithCopyWarning);
# consider narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')
#This registers tqdm's pandas integration (progress bars for pandas apply-style calls)
tqdm.pandas()

In [3]:
#Missing Data Read.
#This block reads the input CSV file into the MDF DataFrame.
#!!!!!!!!!!!!!! YOU MAY NEED TO ADJUST THE PATH FOR YOUR PERSONAL MACHINE
#FOR NON VPN TESTING:
# NOTE(review): this is an absolute, user-specific path; it must be edited before anyone else can run the notebook.

MDF_path=r"C:\Users\khat\OneDrive - PENNSYLVANIA COMPENSATION RATING BUREAU\Desktop\Copy of List_Combined_threshold_0.5_Chunk_10000.csv"
MDF=pd.read_csv(MDF_path)

In [4]:
#Just a quick check of the dimensions of the dataset: (number of rows, number of columns)
MDF.shape

(6845, 3)

In [5]:
#Helper Functions
#General Functions -> See docstrings (the text under each function name)
def parse_business_info(bid):
    '''
    This function pulls the business name and the agent/address out of one search result
    Input: A (key, record) pair, i.e. one item of the 'rows' mapping in the search response.
           The record must provide 'TITLE' (a sequence whose first element is the name text)
           and 'AGENT'. No other field is read, so records missing other fields still parse.
    Output: A list with two items: [Name, Address]
    '''
    record = bid[1]
    # Cleaning the ID out of the name: keep only the text before the first '(' and trim it.
    Name = record['TITLE'][0].split('(')[0].strip()
    # The 'AGENT' field is what this notebook stores as the address.
    Address = record['AGENT']
    return [Name, Address]

#========================== 
def Choose_Result(response_data):
    '''
    This function takes the decoded response from the Penn Gov website and picks the one result we keep
    Input: The decoded .json response (a dict). Its 'rows' entry maps a result key to a result record.
    Output: A list with two items: [Name, Address] as given by the PA gov. Returns None for no results

    Selection rule: results are scanned from last to first, and the first one that is neither a
    'Fictitious Name' nor inactive (STATUS other than 'Active') is chosen. If no result
    qualifies, the first result in the response is used as the fallback.
    '''
    # `or {}` also covers a response that carries "rows": null.
    rows = response_data.get('rows') or {}
    if not rows:
        return None

    # Materialise the items once: reversing a list works on every Python 3 version and
    # gives direct access to the fallback entry.
    results = list(rows.items())
    for bid in reversed(results):
        record = bid[1]
        if record['ENTITY_TYPE'] != 'Fictitious Name' and record['STATUS'] == 'Active':
            return parse_business_info(bid)

    # No preferred result was found: fall back to the first listed result.
    return parse_business_info(results[0])
            
            

#============================================
def create_cols(DF,index, Opti_List):
    '''
    This function makes sure the result columns exist and writes one row's search result into them
    Input: A pandas dataframe (modified in place), an index label (for looping), and the search
           result for that row: a [Name, Address] sequence, or None when nothing was found
    Output: The same DF, with 'Opti_Gov_Name' and 'Opti_Gov_Add' set for that row
            (both are pd.NA when there is no result or the result has fewer than two items)
    '''
    new_opt_cols=['Opti_Gov_Name', 'Opti_Gov_Add']
    #Check for presence of the new columns; create them (all pd.NA) on first use
    for col_name in new_opt_cols:
        if col_name not in DF.columns:
            DF[col_name]=pd.NA

    # Identity test instead of `!= None`: an equality test is evaluated element-wise for
    # array-like results and then cannot be used as a condition.
    has_result = Opti_List is not None and len(Opti_List) >= 2
    values = (Opti_List[0], Opti_List[1]) if has_result else (pd.NA, pd.NA)
    for col_name, value in zip(new_opt_cols, values):
        DF.at[index, col_name] = value

    return DF
#========================================


In [6]:
#Biz Search Function for PA

# Browser User-Agent strings; one is picked at random for every search request.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
]

def search_biz(bna, max_retries=10, retry_wait=8, timeout=30):
    '''
    This function searches the PA Department of State business search for one business name
    Input: bna - the business name to search for
           max_retries - how many times to retry after an HTTP 403 before giving up
                         (None = keep retrying forever, which was the previous behaviour)
           retry_wait - seconds to sleep after each 403 before trying again
           timeout - seconds to wait on the server before the request is abandoned
    Output: The [Name, Address] list chosen by Choose_Result, or None when there is nothing
            usable (no results, a status other than 200, an unreadable body, a failed request,
            or the 403 retries ran out)
    '''
    url = r'https://file.dos.pa.gov/api/Records/businesssearch'

    headers = {
        'Accept': '*/*',
        # NOTE(review): this advertises br/zstd support; presumably the installed requests stack
        # can decode them -- confirm, otherwise a compressed body would fail to parse.
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        # NOTE(review): hard-coded session cookie; presumably copied from a browser session and
        # likely to expire -- confirm whether the API needs it at all.
        'Cookie': 'ASP.NET_SessionId=ck0bgrlxslkvidjnejw22iur',
        'Host': 'file.dos.pa.gov',
        'Origin': 'https://file.dos.pa.gov',
        'Referer': 'https://file.dos.pa.gov/search/business',
        'User-Agent': random.choice(user_agents)
    }

    payload = {
        "SEARCH_VALUE": bna,
        "SEARCH_FILTER_TYPE_ID": "1",
        "FILING_TYPE_ID": "",
        "STATUS_ID": "",
        "FILING_DATE": {"start": None, "end": None}
    }

    payload_json = json.dumps(payload)
    forbidden_count = 0
    while True:
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.post(url, headers=headers, data=payload_json, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            return None

        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError:
                # Every JSON decode error raised by requests (json / simplejson based)
                # is a ValueError subclass.
                print("Failed to parse JSON response.")
                return None
            return Choose_Result(data)

        if response.status_code != 403:
            return None

        # HTTP 403: wait and try again, but only a bounded number of times so that a
        # permanent block cannot turn into an endless loop.
        forbidden_count += 1
        if max_retries is not None and forbidden_count > max_retries:
            print(f"Giving up on {bna!r} after {max_retries} retries (HTTP 403).")
            return None
        time.sleep(retry_wait)
        


In [7]:
def ME(DF):
    '''
    This function walks through a (sub)DataFrame, searches every row's business name and records the result on that row.
    Input: A dataframe with a 'Trimed PRIMARYINSUREDNAME' column
    Output: The same dataframe, with the match columns added and filled by create_cols
    '''
    progress = tqdm(DF.iterrows(), total=len(DF), desc="Matchmaking.... Please be Patient")
    for row_label, record in progress:
        match = search_biz(record['Trimed PRIMARYINSUREDNAME'])
        # create_cols writes into DF in place, so its return value is not needed here.
        create_cols(DF, row_label, match)
    return DF

In [8]:

def main(DF, chunk_size=120, pause=15):
    '''
    This function runs the matching over a dataframe in chunks, pausing between chunks
    Input: DF - a dataframe with a 'Trimed PRIMARYINSUREDNAME' column (it is not modified)
           chunk_size - number of rows handed to ME at a time
           pause - seconds to sleep between two chunks (no sleep after the last chunk)
    Output: A new dataframe (index reset to 0..n-1) with the 'Opti_Gov_Name' and
            'Opti_Gov_Add' columns filled in. The success rate is printed.
    '''
    total = len(DF)
    finished_chunks=[]
    for start in range(0, total, chunk_size):
        end=start+chunk_size
        if end < total:
            print(f"Rows {start} to {end}/ {total}")
        else:
            print(f"Rows {start} to {total}")
        # Positional slice plus an explicit copy: ME writes into the chunk, and writing
        # into a slice of the caller's frame is ambiguous in pandas.
        chunk=DF.iloc[start:end].copy()
        finished_chunks.append(ME(chunk))
        if end < total:
            time.sleep(pause)

    if not finished_chunks:
        # Empty input: pd.concat([]) would raise, so return an empty frame that
        # still carries the two result columns.
        Final_DF=DF.copy().reset_index(drop=True)
        for col_name in ('Opti_Gov_Name', 'Opti_Gov_Add'):
            if col_name not in Final_DF.columns:
                Final_DF[col_name]=pd.NA
        print("Success Rate: n/a (no rows to process)")
        return(Final_DF)

    Final_DF=pd.concat(finished_chunks, ignore_index=True)
    # notna() treats pd.NA, None and NaN all as "no match"; the identity test
    # `x is not pd.NA` counted None/NaN as matches.
    SR=Final_DF['Opti_Gov_Name'].notna().mean()*100
    print(f"Success Rate: {SR}%")
    return(Final_DF)


In [9]:
#Getting the mystery sample: a small hand-picked list of names used to spot-check the search logic
#Open question: maybe 'Domestic Business Corporation' should be a preferred category, or maybe any entity type containing 'Domestic'?
Prob=['ADVANCE MEDICAL DESIGNS INC',
 'ARS INC',
 'CEDAR VILLAGE TOWNHOMES LP',
 'CMG INC',
 'EAGLE CHIROPRACTIC PC',
 'FARM FRESH MARKET',
 'GC BUILDERS LLC',
'KIDS2 INC'
]
#CMG INC is an interesting example. Use it as a test for how the result selection iterates
#KIDS2 INC is an example of the rare former name parameter from the Penn Gov Site

#I think my filtering practices need to be altered. They can be made more efficient
#It is still possible I am getting throttled
#First plan of attack: (note left unfinished)
A=['Trimed PRIMARYINSUREDNAME']

test=pd.DataFrame(Prob, columns=A)

main(test)

Rows 0 to 8


Matchmaking.... Please be Patient: 100%|██████████| 8/8 [00:01<00:00,  6.05it/s]

Success Rate: 100.0%





Unnamed: 0,Trimed PRIMARYINSUREDNAME,Opti_Gov_Name,Opti_Gov_Add
0,ADVANCE MEDICAL DESIGNS INC,"Advance Medical Designs, Inc.",CT Corporation System
1,ARS INC,"ARS, INC.","204 CALDER WAY STE 206 B&D\r\nPO BOX 1026, STA..."
2,CEDAR VILLAGE TOWNHOMES LP,"Cedar Village Townhomes, LP","813 MANOR DRIVE\r\nWHISTLEWOOD COMMONS, DUBLIN..."
3,CMG INC,"CMG, Inc.","2399 OLD LINCOLN HIGHWAY, TREVOSE, PA 19053"
4,EAGLE CHIROPRACTIC PC,"EAGLE CHIROPRACTIC, P.C.",NO 3 THE TOOMEY BUILDING \r\nRO...
5,FARM FRESH MARKET,Farm Fresh Market of Pittsburgh Inc,"226 N. NEGLEY STREET, PITTSBURGH, PA 15206"
6,GC BUILDERS LLC,"G.C. Builders, LLC","42 MACK ROAD, BANGOR, PA 18013"
7,KIDS2 INC,"Kids2, LLC",Corporation Service Company


In [10]:
#THE PLAN: Run this thing in chunks of 500 rows
#Make the partitions OUTSIDE of the function so that they can be saved in the event of an interruption

Partitions=[]
for start in range(0, len(MDF), 500):
        end=start+500
        parti=MDF[start:end]
        Partitions.append(parti)
#Number of partitions, followed by a preview of the first one
print(len(Partitions))
print(Partitions[0])

#Collects the processed partitions; the cells below append to it one partition at a time
Finshed=[]

14
              Trimed PRIMARYINSUREDNAME Exist in Matching Result 2  \
0                               3418576                  Not Found   
1                              0776 LLC                  Not Found   
2                       1 STOP TRAINING                  Not Found   
3                              100X LLC                  Not Found   
4                   1022403 ONTARIO LTD                  Not Found   
..                                  ...                        ...   
495                    ATKINSON DAVID R                  Not Found   
496         ATLANTIC BIOMEDICAL COMPANY                  Not Found   
497  ATLANTIC CONSTRUCTION SERVICES INC                  Not Found   
498          ATLANTIC TRAVEL CENTER LLC                  Not Found   
499                  ATLAS ADVISORS LLC                  Not Found   

    Exist in Matching Result 3  
0                    Not Found  
1                    Not Found  
2                    Not Found  
3                    Not

In [None]:
#First half
# p1=main(Partitions[0])
# Finshed.append(p1)
# p2=main(Partitions[1])
# Finshed.append(p2)
# p3=main(Partitions[2])
# Finshed.append(p3)
# p4=main(Partitions[3])
# Finshed.append(p4)
# p5=main(Partitions[4])
# Finshed.append(p5)
# p6=main(Partitions[5])
# Finshed.append(p6)
# p7=main(Partitions[6])
# Finshed.append(p7)

Rows 0 to 120/ 500


Matchmaking.... Please be Patient: 100%|██████████| 120/120 [00:24<00:00,  4.98it/s]


Rows 120 to 240/ 500


Matchmaking.... Please be Patient: 100%|██████████| 120/120 [03:35<00:00,  1.80s/it]


Rows 240 to 360/ 500


Matchmaking.... Please be Patient: 100%|██████████| 120/120 [03:07<00:00,  1.56s/it]


Rows 360 to 480/ 500


Matchmaking.... Please be Patient: 100%|██████████| 120/120 [03:06<00:00,  1.55s/it]


Rows 480 to 500


Matchmaking.... Please be Patient: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s]

Success Rate: 40.2%





In [None]:
# Combine every partition processed so far into one dataframe.
# NOTE(review): the cells that fill Finshed are commented out above; with an empty list
# pd.concat raises, so those cells must have been run first.
df1 = pd.concat(Finshed)
# saving the dataframe
df1.to_csv('Govmatched_1.csv')
#First half saved For posterity

In [None]:
# p8=main(Partitions[7])
# Finshed.append(p8)
# p9=main(Partitions[8])
# Finshed.append(p9)
# p10=main(Partitions[9])
# Finshed.append(p10)
# p11=main(Partitions[10])
# Finshed.append(p11)
# p12=main(Partitions[11])
# Finshed.append(p12)
# p13=main(Partitions[12])
# Finshed.append(p13)
# p14=main(Partitions[13])
# Finshed.append(p14)

Rows 0 to 120/ 345


Matchmaking.... Please be Patient: 100%|██████████| 120/120 [00:21<00:00,  5.71it/s]


Rows 120 to 240/ 345


Matchmaking.... Please be Patient: 100%|██████████| 120/120 [04:41<00:00,  2.34s/it]


Rows 240 to 345


Matchmaking.... Please be Patient: 100%|██████████| 105/105 [02:29<00:00,  1.43s/it]

Success Rate: 28.405797101449277%





In [None]:
# Combine all processed partitions collected in Finshed into the final dataframe.
df_finished=pd.concat(Finshed)


6845
6845


In [None]:
#df_finished.to_csv('Govmatched.csv')

In [37]:
# Overall success rate: percentage of rows whose 'Opti_Gov_Name' is not the pd.NA object.
# NOTE(review): the identity test only recognises pd.NA itself; None or NaN values
# (e.g. after reloading the data from a CSV) would be counted as matches.
SR=df_finished['Opti_Gov_Name'].apply(lambda x: x is not pd.NA).mean()*100
print(f"Success Rate: {SR}%")

Success Rate: 38.55368882395909%
