# Problem Statement

# Objective:
Find, standardize, and continuously update data regarding construction and infrastructure
projects and tenders in the state of California.

# Part 1: 
Research and Data Sourcing
# Task: 
Research and identify 5-10 reliable data sources about construction and infrastructure
projects and tenders in California.
# Methodology: 
Use a combination of online research and language models (e.g., OpenAI's GPT
models) to identify these sources. Explicitly state how and why you used GPT or similar models
in your research process.

# Part 2: 
Data Extraction and Standardization
# Task: 
From the provided Table 1 and your own list, suggest methods to scrape data using
language model-based tools like OpenAI API, Mistral 7B, Llama2, or other open-source models.
# Requirements:
Demonstrate how you can build data products (DPs) to scrape data from multiple sources.
Standardize the scraped data according to the guidelines provided in Table 2.

# Part 3: 
Automation and Continuous Updating.
# Task: 
Propose a system for automating the data scraping and standardization processes.
# Details:
*  Explain how the data sources will be continuously updated.
*  Describe the use of cron jobs or similar scheduling tools for ongoing data updates.
*  Ensure your methodology adheres to a production environment's standards.

# Importing Libraries 

In [120]:
pip install schedule

Note: you may need to restart the kernel to use updated packages.


In [121]:
import re
import uuid
import time
import requests
import schedule
import pandas as pd
from bs4 import BeautifulSoup

# List Of Data Sources

* ConstructConnect - https://www.constructconnect.com/construction-near-me/california-construction-projects
* ElkGrove - https://www.elkgrovecity.org/southeast-policy-area/development-projects
* CityOfSanrafael - https://www.cityofsanrafael.org/major-planning-projects-2/
* SantaMariaGroup - https://www.santamariagroup.com/projects
* Highways.dot.gov - https://highways.dot.gov/federal-lands/projects/ca
* Publicworks.com - https://ocip.ocpublicworks.com/service-areas/oc-infrastructure-programs/projects-and-studies

In [122]:
# Existing data sources
datasource = [
    {"data_source": "Construct Connect", "url": "https://www.constructconnect.com/construction-near-me/california-construction-projects"},
    {"data_source": "ElkGrove", "url": "https://www.elkgrovecity.org/southeast-policy-area/development-projects"},
    {"data_source": "CityOfSanrafael", "url": "https://www.cityofsanrafael.org/major-planning-projects-2/"},
    {"data_source": "SantaMariaGroup", "url": "https://www.santamariagroup.com/projects"},
    {"data_source": "Highways.dot.gov", "url": "https://highways.dot.gov/federal-lands/projects/ca"},
    {"data_source": "Publicworks.com", "url": "https://ocip.ocpublicworks.com/service-areas/oc-infrastructure-programs/projects-and-studies"}
]

# Main DataFrame

In [123]:
# Example usage
main_df = pd.DataFrame(columns=["original_id", "aug_id", "country_name", "country_code", "map_coordinates", "url","data_source",
                                "region_name", "region_code", "title", "description", "status", "stages", "date",
                                "procurementMethod", "budget", "currency", "buyer", "sector", "subsector"])

In [125]:
main_df.describe

<bound method NDFrame.describe of Empty DataFrame
Columns: [original_id, aug_id, country_name, country_code, map_coordinates, url, data_source, region_name, region_code, title, description, status, stages, date, procurementMethod, budget, currency, buyer, sector, subsector]
Index: []>

# Web Scraping the data

In [126]:
#Helper Functions
#This data was collected from chatGPT, Bard and google
#Extra data needed required for below web scraped projects

ExtrenalProjectDataConstructConnect = [
    {
        "title": "Water / Sewer Construction in Sunol, California",
        "location": "Sunol, California",
        "sector": "Construction (COFOG 1)",
        "subsector": "Water & Sewerage (COFOG 11)",
        "procurementMethod": "Invitation to Bid/Request for Proposals",
        "map_coordinates": "Latitude 37.59438, Longitude -121.88857",
        "region_code": "94586",
        "buyer": "Sewer Contractors",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
    {
        "title": "Roads / Highways Construction in Acton, California",
        "location": "Acton, California",
        "sector": "Construction (COFOG 1)",
        "subsector": "Roads & Bridges (COFOG 12)",
        "procurementMethod": "Design-Build/Construction Manager/General Contractor",
        "map_coordinates": "Latitude 37.59438, Longitude -121.88857",
        "region_code": "94586",
        "buyer": "Leonida Builders",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
    {
        "title": "Bridges / Tunnels Construction in Hughes Mill, California",
        "location": "Hughes Mill, California",
        "sector": "Construction (COFOG 1)",
        "subsector": "Roads & Bridges (COFOG 12)",
        "procurementMethod": "Design-Build/Construction Manager/General Contractor",
        "map_coordinates": "Latitude 37.59438, Longitude -121.88857",
        "region_code": "94586",
        "buyer": "Construction Pros",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
    {
        "title": "Roads / Highways Construction in Colfax, California",
        "location": "Colfax, California",
        "sector": "Construction (COFOG 1)",
        "subsector": "Roads & Bridges (COFOG 12)",
        "procurementMethod": "Invitation to Bid/Request for Proposals",
        "map_coordinates": "Latitude 37.59438, Longitude -121.88857",
        "region_code": "94586",
        "buyer": "Construction Pros",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
    {
        "title": "Educational Construction in East Los Angeles, California",
        "location": "East Los Angeles, California",
        "sector": "Education (COFOG 9)",
        "subsector": "Educational Buildings (COFOG 91)",
        "procurementMethod": "Design-Build/Construction Manager/General Contractor",
        "map_coordinates": "Latitude 37.59438, Longitude -121.88857",
        "region_code": "94586",
        "buyer": "Alpha Structural",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
    {
        "title": "Roads / Highways Construction in Stockton, California",
        "location": "Stockton, California",
        "sector": "Construction (COFOG 1)",
        "subsector": "Roads & Bridges (COFOG 12)",
        "procurementMethod": "Invitation to Bid/Request for Proposals",
        "map_coordinates": "Latitude 37.59438, Longitude -121.88857",
        "region_code": "94586",
        "buyer": "Sewer Contractors",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
    {
        "title": "Educational Construction in Los Angeles, California",
        "location": "Los Angeles, California",
        "sector": "Education (COFOG 9)",
        "subsector": "Educational Buildings (COFOG 91)",
        "procurementMethod": "Design-Build/Construction Manager/General Contractor",
        "map_coordinates": "Latitude: 34.052235 Longitude: -118.243683",
        "region_code": "94586",
        "buyer": "Angeles Contractors",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
    {
        "title": "Bridges / Tunnels Construction in Long Beach, California",
        "location": "Long Beach, California",
        "sector": "Construction (COFOG 1)",
        "subsector": "Roads & Bridges (COFOG 12)",
        "procurementMethod": "Design-Build/Construction Manager/General Contractor",
        "map_coordinates": "Latitude: 33.7670 Longitude: -118.1892",
        "region_code": "94586",
        "buyer": "Bethlehem Steel",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
    {
        "title": "Playgrounds / Parks / Athletic Fields Construction in Oakland, California",
        "location": "Oakland, California",
        "sector": "Other (COFOG 99)",
        "subsector": "Recreational Facilities (COFOG 992)",
        "procurementMethod": "Invitation to Bid/Request for Proposals",
        "map_coordinates": "Latitude: 37.804363 Longitude: -122.271111",
        "region_code": "94586",
        "buyer": "Alpha Structural",
        "stages": [
            "Pre-construction",
            "Sitework and foundation",
            "Rough framing",
            "Exterior construction",
            "MEP (Mechanical, Electrical, Plumbing)",
            "Finishes and fixtures"
        ]
    },
]


In [127]:
#Web Scraping data from constructconnect

def ScrapDataConstructConnect(construct_connect_url):
    response = requests.get(construct_connect_url)
    try:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            project_list = soup.find_all(class_='project-list')
            for project in project_list:
                table = project.find('table', class_='display compact')
            rows = table.find_all('tr')
            row_data = []
            for row in rows:
                project_details = row.find_all('td')
                row_data.append(project_details)
            row_data1 = []
            for i in range(1, len(row_data), 6):
                subset = row_data[i]
                sublist = []
                for j in range(0, 9):
                    td_value = str(subset[j])  # Convert to string
                    pattern = re.compile(r'<td>(.*?)</td>')
                    matches = re.findall(pattern, td_value)
                    cleaned_matches = [match.strip() for match in matches]
                    sublist.append(cleaned_matches[0])
                row_data1.append(sublist)
            n = len(main_df);
            for i in range(len(row_data1)):
                title = row_data1[i][0]
                if not any(main_df['title'] == title):
                    j = n + i
                    index = next((k for k, project in enumerate(ExtrenalProjectDataConstructConnect) if project['title'] == row_data1[i][0]), None)
                    main_df.loc[j, 'original_id'] = j
                    main_df.loc[j, 'aug_id'] = uuid.uuid4()
                    main_df.loc[j, 'country_name'] = "California"
                    main_df.loc[j, 'country_code'] = "US-CA"
                    main_df.loc[j, 'map_coordinates'] = ExtrenalProjectDataConstructConnect[index]['map_coordinates']
                    main_df.loc[j, 'data_source'] = "ConstructConnect"
                    main_df.loc[j, 'url'] = construct_connect_url
                    main_df.loc[j, 'region_name'] = row_data1[i][1]
                    main_df.loc[j, 'region_code'] = ExtrenalProjectDataConstructConnect[index]['region_code']
                    main_df.loc[j, 'title'] = row_data1[i][0]
                    main_df.loc[j, 'description'] = row_data1[i][5]
                    main_df.loc[j, 'status'] = row_data1[i][2]
                    main_df.loc[j, 'stages'] = ExtrenalProjectDataConstructConnect[index]['stages']
                    main_df.loc[j, 'date'] = row_data1[i][3]
                    main_df.loc[j, 'procurementMethod'] = ExtrenalProjectDataConstructConnect[index]['procurementMethod']
                    main_df.loc[j, 'budget'] = row_data1[i][8]
                    main_df.loc[j, 'currency'] = row_data1[i][8]
                    main_df.loc[j, 'buyer'] = ExtrenalProjectDataConstructConnect[index]['buyer']
                    main_df.loc[j, 'sector'] = ExtrenalProjectDataConstructConnect[index]['sector']
                    main_df.loc[j, 'subsector'] = ExtrenalProjectDataConstructConnect[index]['subsector']

        else:
            print("Failed to fetch data from the URL:", construct_connect_url)
    except Exception as e:
        print("An error occurred:", e)


In [128]:
#Helper Functions
#This data was collected from chatGPT, Bard and google
#Extra data needed required for below web scraped projects

ExtrenalProjectDataElkGrove = [
    {
        "title": "Souza Dairy",
        "location": "Elk Grove, California",
        "sector": "Construction (COFOG 1)",
        "subsector": "Single-Family Homes, Site Development",
        "procurementMethod": "Bids by Invitation",
        "map_coordinates": "Latitude 37.59438, Longitude -121.88857",
        "region_code": "94586",
        "date": "01/06/2023",
        "budget": "$28,511.00"
    },
    {
        "title": "Bruceville Meadows",
        "location": "Elk Grove, California",
        "sector": "Construction (COFOG 1)",
        "subsector": "Single-Family Homes, Site Development",
        "procurementMethod": "Bids by Invitation",
        "map_coordinates": "Latitude 37.59438, Longitude -121.88857",
        "region_code": "94586",
        "date": "26/07/2023",
        "budget": "$50,589.00"
    }]

In [129]:
#Web Scraping data from ElkGrove

def ScrapDataElkGrove(ElkGrove_url):
    response = requests.get(ElkGrove_url)
    try:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', class_='content-table')
            rows = table.find_all('tr')

            row_data = []
            for row in rows[1:]:  
                td_data = [td.get_text(strip=True) for td in row.find_all('td')]
                td_data = td_data[:-1] 
                ul_tag = row.find('ul')
                list_data = [li.get_text(strip=True) for li in ul_tag.find_all('li')]
                td_data.append(list_data)
                row_data.append(td_data)
            n = len(main_df);
            for i in range(len(row_data)):
                title = row_data[i][1]
                if not any(main_df['title'] == title):
                    j = n + i
                    index = next((k for k, project in enumerate(ExtrenalProjectDataElkGrove) if project['title'] == title), None)
                    main_df.loc[j, 'original_id'] = j
                    main_df.loc[j, 'aug_id'] = uuid.uuid4()
                    main_df.loc[j, 'country_name'] = "California"
                    main_df.loc[j, 'country_code'] = "US-CA"
                    main_df.loc[j, 'map_coordinates'] = ExtrenalProjectDataElkGrove[index]['map_coordinates']
                    main_df.loc[j, 'data_source'] = "ElkGrove"
                    main_df.loc[j, 'url'] = ElkGrove_url
                    main_df.loc[j, 'region_name'] = ExtrenalProjectDataElkGrove[index]['location']
                    main_df.loc[j, 'region_code'] = ExtrenalProjectDataElkGrove[index]['region_code']
                    main_df.loc[j, 'title'] = row_data[i][1]
                    main_df.loc[j, 'description'] = row_data[i][2]
                    main_df.loc[j, 'status'] = row_data[i][4]
                    main_df.loc[j, 'stages'] = row_data[i][5]
                    main_df.loc[j, 'date'] = ExtrenalProjectDataElkGrove[index]['date']
                    main_df.loc[j, 'procurementMethod'] = ExtrenalProjectDataElkGrove[index]['procurementMethod']
                    main_df.loc[j, 'budget'] = ExtrenalProjectDataElkGrove[index]['budget']
                    main_df.loc[j, 'currency'] = ExtrenalProjectDataElkGrove[index]['budget']
                    main_df.loc[j, 'buyer'] = row_data[i][3]
                    main_df.loc[j, 'sector'] = ExtrenalProjectDataElkGrove[index]['sector']
                    main_df.loc[j, 'subsector'] = ExtrenalProjectDataElkGrove[index]['subsector']

        else:
            print("Failed to fetch data from the URL:", ElkGrove_url)
    except Exception as e:
        print("An error occurred:", e)

In [130]:
#Web Scraping data from CityOfSanrafael

def ScrapDataCityOfSanrafael(CityOfSanrafael_url):
    response = requests.get(CityOfSanrafael_url)
    try:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', class_='table')
            rows = table.find_all('tr')
            row_data = []
            for row in rows[1:]:  
                td_data = [td.get_text(strip=True) for td in row.find_all('td')]
                td_data = td_data[1:] 
                td_with_a = row.find('td')
                if td_with_a:
                    a_tag = td_with_a.find('a')
                    if a_tag:
                        text = a_tag.get_text(strip=True)
                        td_data.append(text)
                row_data.append(td_data)
            
            n = len(main_df);
            for i in range(len(row_data)):
                title = row_data[i][7]
                if not any(main_df['title'] == title):
                    j = n + i
                    main_df.loc[j, 'original_id'] = j
                    main_df.loc[j, 'aug_id'] = uuid.uuid4()
                    main_df.loc[j, 'country_name'] = "California"
                    main_df.loc[j, 'country_code'] = "US-CA"
                    main_df.loc[j, 'url'] = CityOfSanrafael_url
                    main_df.loc[j, 'data_source'] = "CityOfSanrafael"
                    main_df.loc[j, 'title'] = title
                    main_df.loc[j, 'description'] = row_data[i][0]
                    main_df.loc[j, 'status'] = row_data[i][6]
                    main_df.loc[j, 'buyer'] = row_data[i][4]

        else:
            print("Failed to fetch data from the URL:", CityOfSanrafael_url)
    except Exception as e:
        print("An error occurred:", e)

In [131]:
#Web Scraping data from SantaMariaGroup

def ScrapDataSantaMariaGroup(SantaMariaGroup_url):
    response = requests.get(SantaMariaGroup_url)
    try:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            div = soup.find_all('div', class_="fluid-engine")
            div = div[1:-1]
            row_data = []
            for i in div:
                rowi = []
                rowi.append(i.find('h3').text.strip())
                rowdiv = i.find_all('p', class_='sqsrte-small')
                dt = [p.text.strip() for p in rowdiv]
                rowi.append(dt[0])
                fields_to_extract = ["Location","Status", "Client", "SMG Role"]
                for field in fields_to_extract:
                    pattern = re.compile(fr"{field}:\s*(.*?)$", re.IGNORECASE)
                    field_value = None
                    for text in dt:
                        match = pattern.search(text)
                        if match:
                            field_value = match.group(1)
                            break
                    if field_value:
                        rowi.append(field_value)
                    else:
                        rowi.append(" ")
                row_data.append(rowi)

            n = len(main_df);
            for i in range(len(row_data)):
                title = row_data[i][0]
                if not any(main_df['title'] == title):
                    j = n + i
                    main_df.loc[j, 'original_id'] = j
                    main_df.loc[j, 'aug_id'] = uuid.uuid4()
                    main_df.loc[j, 'country_name'] = "California"
                    main_df.loc[j, 'country_code'] = "US-CA"
                    main_df.loc[j, 'data_source'] = "SantaMariaGroup"
                    main_df.loc[j, 'url'] = SantaMariaGroup_url
                    main_df.loc[j, 'title'] = title
                    main_df.loc[j, 'description'] = row_data[i][1]
                    main_df.loc[j, 'region_name'] = row_data[i][2]
                    main_df.loc[j, 'status'] = row_data[i][3]
                    main_df.loc[j, 'buyer'] = row_data[i][4]
        else:
            print("Failed to fetch data from the URL:", SantaMariaGroup_url)
    except Exception as e:
        print("An error occurred:", e)

In [132]:
#Web Scraping data from HighwaysDotGov

def ScrapDataHighwaysDotGov(HighwaysDotGov_url):
    response = requests.get(HighwaysDotGov_url)
    try:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            div = soup.find_all('div', class_='views-element-container')
            div = div[3:]
            project_lst = []
            for i in div:
                table = i.find('tbody')
                status = i.find('h2')
                for tr in table.find_all('tr'):
                    td_title = tr.find('td', class_='views-field views-field-title')
                    project_name = td_title.text.strip()
                    project_link = "https://highways.dot.gov" + td_title.find('a')['href']
                    loc = tr.find('td', class_='views-field views-field-field-flh-location')
                    loc = loc.text.strip() + ', California'
                    project_lst.append([project_name, project_link,loc,status.text.strip()])
            for project in project_lst:
                response1 = requests.get(project[1])
                try:
                    if response1.status_code == 200:
                        soup = BeautifulSoup(response1.content, 'html.parser')
                        section = soup.find('section', class_='section')
                        body_text = section.find('div', class_='col-md-7 clearfix')
                        body_text = body_text.text.strip()
                        description_pattern = r'^(.*?)(?:\s*Anticipated\s*Timeline|\s*Project\s*Documents:)'
                        advertise_pattern = r'Anticipated Timeline\n(.*?)\n'
                        description = re.search(description_pattern, body_text, re.DOTALL).group(1).strip()
                        advertise_match = re.search(advertise_pattern, body_text)
                        if advertise_match:
                            advertise = advertise_match.group(1).strip()
                        else:
                            advertise = ""
                        contact = section.find('section', class_='contact-details')
                        contact_div = contact.find('span', class_='field-content')
                        buyer = contact_div.text.strip() + ", Federal Highway Administration, USDOT"
                        project.append(description)
                        project.append(advertise)
                        project.append(buyer)
                    else:
                        print("Failed to fetch data from the URL:", HighwaysDotGov_url)
                except Exception as e:
                    print("An error occurred:", e)
                    
            n = len(main_df)
            for i in range(len(project_lst)):
                title = project_lst[i][0]
                if not any(main_df['title'] == title):
                    j = n + i
                    main_df.loc[j, 'original_id'] = j
                    main_df.loc[j, 'aug_id'] = uuid.uuid4()
                    main_df.loc[j, 'country_name'] = "California"
                    main_df.loc[j, 'country_code'] = "US-CA"
                    main_df.loc[j, 'url'] = project_lst[i][1]
                    main_df.loc[j, 'data_source'] = "HighwaysDotGov"
                    main_df.loc[j, 'title'] = title
                    main_df.loc[j, 'region_name'] = project_lst[i][2]
                    main_df.loc[j, 'status'] = project_lst[i][3]
                    main_df.loc[j, 'description'] = project_lst[i][4]
                    main_df.loc[j, 'date'] = project_lst[i][5]
                    main_df.loc[j, 'buyer'] = project_lst[i][6]
        else:
            print("Failed to fetch data from the URL:", HighwaysDotGov_url)
    except Exception as e:
        print("An error occurred:", e)

In [134]:
#Web Scraping data from Publicworks.com

def ScrapDataPublicworks(Publicworks_url):
    response = requests.get(Publicworks_url)
    try:
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            main_div = soup.find('div', class_='block field-blocknodecounty-pagebody')
            div = main_div.find_all('tbody')
            project_lst = []
            for tbody in div:
                tr_tags = tbody.find_all('tr')
                for tr in tr_tags:
                    a_tag = tr.find('a')
                    project_name = a_tag.text.strip()
                    project_href = a_tag['href']
                    td_tags = tr.find_all('td')
                    project_data = [project_href] + [td.text.strip() for td in td_tags if td.text.strip()]
                    project_lst.append(project_data)
            for project in project_lst:
                if not project[0].startswith(("http://", "https://")):
                    project[0] = "https://ocip.ocpublicworks.com/" + project[0]

            for project in project_lst:
                response1 = requests.get(project[0])
                try:
                    if response1.status_code == 200:
                        soup = BeautifulSoup(response1.content, 'html.parser')
                        main = soup.find('main', class_='main-content main-content--with-sidebar')
                        text = main.find('p')
                        if text:
                            project.append(text.text.strip())
                    else:
                        print("Failed to fetch data from the URL:", Publicworks_url)
                except Exception as e:
                    print("An error occurred:", e)

            project_lst[8].insert(2, '')
            project_lst[5].insert(2, '')

            n = len(main_df)
            for i in range(len(project_lst)):
                title = project_lst[i][1]
                if not any(main_df['title'] == title):
                    j = n + i
                    main_df.loc[j, 'original_id'] = j
                    main_df.loc[j, 'aug_id'] = uuid.uuid4()
                    main_df.loc[j, 'country_name'] = "California"
                    main_df.loc[j, 'country_code'] = "US-CA"
                    main_df.loc[j, 'url'] = project_lst[i][0]
                    main_df.loc[j, 'data_source'] = "Publicworks.com"
                    main_df.loc[j, 'title'] = title
                    main_df.loc[j, 'status'] = project_lst[i][len(project_lst[i])-2]
                    main_df.loc[j, 'description'] = project_lst[i][len(project_lst[i])-1]
                    main_df.loc[j, 'date'] = project_lst[i][2]

        else:
            print("Failed to fetch data from the URL:", Publicworks_url)
    except Exception as e:
        print("An error occurred:", e)

In [139]:
#Call for DataSource-1
construct_connect_url = next((source["url"] for source in datasource if source["data_source"] == "Construct Connect"), None)
ScrapDataConstructConnect(construct_connect_url)

# #Call for DataSource-2
ElkGrove_url = next((source["url"] for source in datasource if source["data_source"] == "ElkGrove"), None)
ScrapDataElkGrove(ElkGrove_url)

#Call for DataSource-3
CityOfSanrafael_url = next((source["url"] for source in datasource if source["data_source"] == "CityOfSanrafael"), None)
ScrapDataCityOfSanrafael(CityOfSanrafael_url)

#Call for DataSource-4
SantaMariaGroup_url = next((source["url"] for source in datasource if source["data_source"] == "SantaMariaGroup"), None)
ScrapDataSantaMariaGroup(SantaMariaGroup_url)

#Call for DataSource-5
HighwaysDotGov_url = next((source["url"] for source in datasource if source["data_source"] == "Highways.dot.gov"), None)
ScrapDataHighwaysDotGov(HighwaysDotGov_url)

#Call for DataSource-6
Publicworks_url = next((source["url"] for source in datasource if source["data_source"] == "Publicworks.com"), None)
ScrapDataPublicworks(Publicworks_url)

In [140]:
main_df

Unnamed: 0,original_id,aug_id,country_name,country_code,map_coordinates,url,data_source,region_name,region_code,title,description,status,stages,date,procurementMethod,budget,currency,buyer,sector,subsector
0,0,01075b82-8add-484a-a9c0-e2cdb47f03e1,California,US-CA,"Latitude 37.59438, Longitude -121.88857",https://www.constructconnect.com/construction-...,ConstructConnect,"Sunol, California",94586,"Water / Sewer Construction in Sunol, California","Renovation of a civil project in Sunol, Califo...",GC Bidding,"[Pre-construction, Sitework and foundation, Ro...",12/28/2023,Invitation to Bid/Request for Proposals,"$183,000,000.00","$183,000,000.00",Sewer Contractors,Construction (COFOG 1),Water & Sewerage (COFOG 11)
1,1,30c6039c-09a6-44c0-972b-4359cc2bd87e,California,US-CA,"Latitude 37.59438, Longitude -121.88857",https://www.constructconnect.com/construction-...,ConstructConnect,"Acton, California",94586,"Roads / Highways Construction in Acton, Califo...",Site work and paving for a road / highway in A...,GC Bidding,"[Pre-construction, Sitework and foundation, Ro...",11/30/2023,Design-Build/Construction Manager/General Cont...,"$160,000,000.00","$160,000,000.00",Leonida Builders,Construction (COFOG 1),Roads & Bridges (COFOG 12)
2,2,fb988173-cbfa-4eed-84bd-8db74d84ce62,California,US-CA,"Latitude 37.59438, Longitude -121.88857",https://www.constructconnect.com/construction-...,ConstructConnect,"Hughes Mill, California",94586,"Bridges / Tunnels Construction in Hughes Mill,...",Site work and paving for a civil project in Hu...,GC Bidding,"[Pre-construction, Sitework and foundation, Ro...",12/14/2023,Design-Build/Construction Manager/General Cont...,"$120,000,000.00","$120,000,000.00",Construction Pros,Construction (COFOG 1),Roads & Bridges (COFOG 12)
3,3,0529b6b9-0422-4df1-aea2-db519167020c,California,US-CA,"Latitude 37.59438, Longitude -121.88857",https://www.constructconnect.com/construction-...,ConstructConnect,"Colfax, California",94586,"Roads / Highways Construction in Colfax, Calif...","Site work, paving and outdoor lighting for a c...",GC Bidding,"[Pre-construction, Sitework and foundation, Ro...",12/14/2023,Invitation to Bid/Request for Proposals,"$120,000,000.00","$120,000,000.00",Construction Pros,Construction (COFOG 1),Roads & Bridges (COFOG 12)
4,4,947dd5da-8e0f-48d2-a08b-f350769ee86b,California,US-CA,"Latitude 37.59438, Longitude -121.88857",https://www.constructconnect.com/construction-...,ConstructConnect,"East Los Angeles, California",94586,"Educational Construction in East Los Angeles, ...","Demolition, site work and new construction of ...",GC Bidding,"[Pre-construction, Sitework and foundation, Ro...",06/3/2024,Design-Build/Construction Manager/General Cont...,"$105,000,000.00","$105,000,000.00",Alpha Structural,Education (COFOG 9),Educational Buildings (COFOG 91)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,75,0b5c8a2a-0df6-4d55-b6a8-b1f37e90b0fb,California,US-CA,,https://ocip.ocpublicworks.com/service-areas/o...,Publicworks.com,,,Santa Ana-Delhi Channel Improvement Project,Thank you for your interest in the upcoming Sa...,Planned improvements along the flood control c...,,,,,,,,
76,76,544f049f-35e4-4a37-a37b-74cc1af2734b,California,US-CA,,https://ocip.ocpublicworks.com//page/east-gard...,Publicworks.com,,,East Garden Grove-Wintersburg Channel Project,The East Garden Grove-Wintersburg Channel Impr...,"Warner Avenue to Goldenwest Street, Huntington...",,20/21,,,,,,
77,77,3e9face3-7e1d-491e-abaf-8ad304cfab55,California,US-CA,,https://ocip.ocpublicworks.com//gov/pw/flood/n...,Publicworks.com,,,Yorba Street Drainage & Roadway Improvements,OC Public Works is currently planning improvem...,Planned improvements along a portion of Yorba ...,,21/22,,,,,,
78,78,0b303e48-d512-48f7-9905-16fbc337cdc2,California,US-CA,,https://ocip.ocpublicworks.com//gov/pw/flood/n...,Publicworks.com,,,San Juan Creek Flood Risk Management Feasibili...,The San Juan Creek Flood Risk Management Feasi...,Project Study,,,,,,,,


In [137]:
# main_df.drop(main_df.index, inplace=True)

In [142]:
#Save the main_df to CSV File (dataset)
main_df.to_csv('Example_Dataset.csv', index=False)

# Automation and Continuous Updating

In [143]:
# A system for automating the data scraping and standardization processes
# Function to execute the scraping and updating process

def run_process():
    
    #Call for DataSource-1
    construct_connect_url = next((source["url"] for source in datasource if source["data_source"] == "Construct Connect"), None)
    ScrapDataConstructConnect(construct_connect_url)

    # #Call for DataSource-2
    ElkGrove_url = next((source["url"] for source in datasource if source["data_source"] == "ElkGrove"), None)
    ScrapDataElkGrove(ElkGrove_url)

    #Call for DataSource-3
    CityOfSanrafael_url = next((source["url"] for source in datasource if source["data_source"] == "CityOfSanrafael"), None)
    ScrapDataCityOfSanrafael(CityOfSanrafael_url)

    #Call for DataSource-4
    SantaMariaGroup_url = next((source["url"] for source in datasource if source["data_source"] == "SantaMariaGroup"), None)
    ScrapDataSantaMariaGroup(SantaMariaGroup_url)

    #Call for DataSource-5
    HighwaysDotGov_url = next((source["url"] for source in datasource if source["data_source"] == "Highways.dot.gov"), None)
    ScrapDataHighwaysDotGov(HighwaysDotGov_url)

    #Call for DataSource-6
    Publicworks_url = next((source["url"] for source in datasource if source["data_source"] == "Publicworks.com"), None)
    ScrapDataPublicworks(Publicworks_url)

# Schedule the execution to run every day at a specific time
schedule.every().day.at("00:00").do(run_process)

# Infinite loop to continuously check and run scheduled tasks
while True:
    # Run pending scheduled tasks
    schedule.run_pending()
    # Sleep for 1 minute to avoid high CPU usage
    time.sleep(60)