In [23]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup


driver = webdriver.Chrome()

# Function to scrape the front page (basic data)
def scrape_front_page():
    # Open the provided URL
    url = "https://www.mywsba.org/personifyebusiness/LegalDirectory.aspx?ShowSearchResults=TRUE&Status=Active"
    driver.get(url)
    time.sleep(5)  # Allow the page to load

    # Prepare CSV to save the front page data
    front_csv_file = 'front_page_data.csv'
    front_csv_columns = ['License Number', 'First Name', 'Last Name', 'City', 'Status', 'Phone', 'Profile_Link']

    with open(front_csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=front_csv_columns)
        writer.writeheader()

        # Parse the page source using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Debugging: Print the raw HTML of the page
        print(soup.prettify())  # This is just for debugging; you can remove it once everything is fixed

        # Extract the person data from the page
        rows = soup.find_all('tr', class_='grid-row')  # Each row is in a <tr class="grid-row">
        print(f"Found {len(rows)} rows on this page.")  # Debugging: Print how many rows were found

        for row in rows:
            # Extract data for each row
            columns = row.find_all('td')
            if len(columns) >= 6:
                license_number = columns[0].text.strip() if columns[0].text.strip() else 'N/A'
                first_name = columns[1].text.strip().split()[0] if len(columns[1].text.strip().split()) > 0 else 'N/A'
                last_name = columns[2].text.strip() if len(columns[2].text.strip()) > 0 else 'N/A'
                city = columns[3].text.strip() if len(columns[3].text.strip()) > 0 else 'N/A'
                status = columns[4].text.strip() if len(columns[4].text.strip()) > 0 else 'N/A'
                phone = columns[5].text.strip() if len(columns[5].text.strip()) > 0 else 'N/A'

                # The link to the person's detailed profile page (from the onclick attribute)
                profile_link = row['onclick'].split("'")[1]  # Extract the profile URL from the onclick attribute
                profile_url = "https://www.mywsba.org/personifyebusiness/" + profile_link  # Full profile URL

                # Write the scraped data to the CSV file
                writer.writerow({
                    'License Number': license_number,
                    'First Name': first_name,
                    'Last Name': last_name,
                    'City': city,
                    'Status': status,
                    'Phone': phone,
                    'Profile_Link': profile_url
                })



In [24]:
scrape_front_page()

<html class="t-chrome t-chrome132 win chrome chrome1 webkit webkit5" lang="en-US">
 <head id="Head">
  <title>
   Legal Directory
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="text/javascript" http-equiv="Content-Script-Type"/>
  <meta content="text/css" http-equiv="Content-Style-Type"/>
  <meta content="Personify eBusiness" id="MetaDescription" name="DESCRIPTION"/>
  <meta content="Personify eBusiness" id="MetaKeywords" name="KEYWORDS"/>
  <meta content="Powered by Personify eBusiness" id="MetaCopyright" name="COPYRIGHT"/>
  <meta content="Personify eBusiness" id="MetaAuthor" name="AUTHOR"/>
  <meta content="DOCUMENT" name="RESOURCE-TYPE"/>
  <meta content="GLOBAL" name="DISTRIBUTION"/>
  <meta content="INDEX, FOLLOW" id="MetaRobots" name="ROBOTS"/>
  <meta content="1 DAYS" name="REVISIT-AFTER"/>
  <meta content="GENERAL" name="RATING"/>
  <meta content="RevealTrans(Duration=0,Transition=1)" http-equiv="PAGE-ENTER"/>
  <style id="St

In [25]:
# Function to scrape additional details from each profile page
def scrape_profile_details():
    profile_csv_file = 'profile_details.csv'
    profile_csv_columns = ['First Name', 'Last Name', 'License Number', 'License Type', 'License Status', 
                           'Firm Name', 'Address', 'City', 'State', 'Zip', 'Email', 'Phone', 'Fax', 'Website', 'Eligible To Practice', 'WSBA Admit Date']

    # Open the CSV file for writing the scraped data
    with open(profile_csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=profile_csv_columns)
        writer.writeheader()

        # Read the front page data to iterate over the profiles
        with open('front_page_data.csv', mode='r', newline='', encoding='utf-8') as front_file:
            reader = csv.DictReader(front_file)

            for row in reader:
                # Click on the profile link
                profile_url = row['Profile_Link']
                driver.get(profile_url)
                time.sleep(2)  # Allow profile page to load

                # Parse the profile page with BeautifulSoup
                profile_soup = BeautifulSoup(driver.page_source, 'html.parser')

                # License Information Section Extraction
                license_number = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblMemberNo').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblMemberNo') else 'N/A'
                license_type = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblLicenseType').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblLicenseType') else 'N/A'
                license_status = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblStatus').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblStatus') else 'N/A'
                eligible_to_practice = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblEligibleToPractice').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblEligibleToPractice') else 'N/A'
                wsba_admit_date = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblWaAdmitDate').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblWaAdmitDate') else 'N/A'

                # Contact Information Section Extraction
                firm_name = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblAddCompanyName').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblAddCompanyName') else 'N/A'
                address = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblAddress').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblAddress') else 'N/A'
                city_state_zip = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblAddress').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblAddress') else 'N/A'
                phone = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblPhone').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblPhone') else 'N/A'
                fax = profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblFax').text.strip() if profile_soup.find('span', id='dnn_ctr2977_DNNWebControlContainer_ctl00_lblFax') else 'N/A'

                # Email extraction fix: Find the <a> tag with the 'mailto:' prefix
                email_tag = profile_soup.find('a', {'href': lambda x: x and x.startswith('mailto:')})
                email = email_tag['href'].replace('mailto:', '') if email_tag else 'N/A'

                # Safe extraction of the website URL
                website_tag = profile_soup.find('a', id='dnn_ctr2977_DNNWebControlContainer_ctl00_hlWebsite')
                website = website_tag['href'] if website_tag and 'href' in website_tag.attrs else 'N/A'

                # Extract city, state, and zip code from the address
                city_state_zip_split = city_state_zip.split(',') if city_state_zip != 'N/A' else ['N/A', 'N/A', 'N/A']
                city = city_state_zip_split[0].strip() if len(city_state_zip_split) > 0 else 'N/A'
                state_zip = city_state_zip_split[1] if len(city_state_zip_split) > 1 else 'N/A'
                state = state_zip.split(' ')[0].strip() if state_zip != 'N/A' else 'N/A'
                zip_code = state_zip.split(' ')[1] if len(state_zip.split(' ')) > 1 else 'N/A'

                # Write the extracted profile data into CSV
                writer.writerow({
                    'First Name': row['First Name'],
                    'Last Name': row['Last Name'],
                    'License Number': license_number,
                    'License Type': license_type,
                    'License Status': license_status,
                    'Firm Name': firm_name,
                    'Address': address,
                    'City': city,
                    'State': state,
                    'Zip': zip_code,
                    'Email': email,
                    'Phone': phone,
                    'Fax': fax,
                    'Website': website,
                    'Eligible To Practice': eligible_to_practice,
                    'WSBA Admit Date': wsba_admit_date
                })

    print("Profile details have been successfully scraped and saved to profile_details.csv.")



In [26]:
scrape_profile_details()

Profile details have been successfully scraped and saved to profile_details.csv.


In [19]:
def combine_csv_files():
    # Open the front page data (the first CSV)
    with open('front_page_data.csv', mode='r', newline='', encoding='utf-8') as front_file:
        front_reader = csv.DictReader(front_file)
        front_data = list(front_reader)  # Read all rows into a list
        
    # Open the profile details data (the second CSV)
    with open('profile_details.csv', mode='r', newline='', encoding='utf-8') as profile_file:
        profile_reader = csv.DictReader(profile_file)
        profile_data = list(profile_reader)  # Read all rows into a list

    # Define the final CSV columns
    combined_columns = [
        'First Name', 'Last Name', 'License Number', 'License Type', 'License Status', 
        'Firm Name', 'Address', 'City', 'State', 'Zip', 'Email', 'Phone', 'Fax', 'Website', 
        'Eligible To Practice', 'WSBA Admit Date'
    ]
    
    # Create a new CSV file to store the combined data
    with open('combined_profile_data.csv', mode='w', newline='', encoding='utf-8') as combined_file:
        writer = csv.DictWriter(combined_file, fieldnames=combined_columns)
        writer.writeheader()
        
        # Combine data from both CSVs (we assume they have common columns like First Name, Last Name)
        for front_row, profile_row in zip(front_data, profile_data):
            # Prepare a dictionary for each row of the combined data
            combined_row = {
                'First Name': front_row['First Name'],
                'Last Name': front_row['Last Name'],
                'License Number': profile_row.get('License Number', 'N/A'),
                'License Type': profile_row.get('License Type', 'N/A'),
                'License Status': profile_row.get('License Status', 'N/A'),
                'Firm Name': profile_row.get('Firm Name', 'N/A'),
                'Address': profile_row.get('Address', 'N/A'),
                'City': profile_row.get('City', 'N/A'),
                'State': profile_row.get('State', 'N/A'),
                'Zip': profile_row.get('Zip', 'N/A'),
                'Email': profile_row.get('Email', 'N/A'),
                'Phone': profile_row.get('Phone', 'N/A'),
                'Fax': profile_row.get('Fax', 'N/A'),
                'Website': profile_row.get('Website', 'N/A'),
                'Eligible To Practice': profile_row.get('Eligible To Practice', 'N/A'),
                'WSBA Admit Date': profile_row.get('WSBA Admit Date', 'N/A')
            }
            # Write the combined row to the CSV file
            writer.writerow(combined_row)

    print("CSV files have been successfully combined and saved as 'combined_profile_data.csv'.")


CSV files have been successfully combined and saved as 'combined_profile_data.csv'.
