This file contains all the backend work related to scraping websites and building the database (simulated by creating CSV files).

## *All data is scraped legally.*

In [32]:
import pandas as pd
import os
import time
import re

This cell simulates the workflow of collecting company data from LinkedIn, starting from a prebuilt dataset of company names.

In [None]:
# Simulates collecting company data from LinkedIn: the HTML pages stored in
# the local data folder are listed and recorded instead of being fetched live.
data_folder = "data"

websites = []
links = []  # Links could be generated from https://linkedin.com/company/about/
for file in os.listdir(data_folder):
    if not file.endswith('.html'):
        continue  # Only the saved HTML pages are of interest
    websites.append(file[:-len('.html')])  # Page name without the extension
    links.append(f"{data_folder}/{file}")
    print(file)

# print(links)

# Store the page names together with their local paths in a CSV file.
df = pd.DataFrame(data={"websites": websites, "paths": links})
df.to_csv("websites.csv", index=False)

(13) Deloitte_ About _ LinkedIn.html
(13) Sony Research India_ About _ LinkedIn.html
(14) Accenture_ About _ LinkedIn.html
(14) Flipkart_ About _ LinkedIn.html
(14) JPMorganChase_ About _ LinkedIn.html
(14) NVIDIA_ About _ LinkedIn.html
(14) Tata Consultancy Services_ About _ LinkedIn.html
['data/(13) Deloitte_ About _ LinkedIn.html', 'data/(13) Sony Research India_ About _ LinkedIn.html', 'data/(14) Accenture_ About _ LinkedIn.html', 'data/(14) Flipkart_ About _ LinkedIn.html', 'data/(14) JPMorganChase_ About _ LinkedIn.html', 'data/(14) NVIDIA_ About _ LinkedIn.html', 'data/(14) Tata Consultancy Services_ About _ LinkedIn.html']


The cell below performs simulated web scraping and data enrichment for all companies.  
It extracts key information such as company name, website, industry, specialties, headquarters, quick links, contact details, and overview from local HTML files or cached data.  
The enriched dataset is then saved to a CSV file for further analysis or use.

In [None]:
import re
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
    
# This function converts a file path to a file URL.
def get_url_from_file(path):
    """Return a ``file://`` URL pointing at the local file *path*.

    The path is made absolute with ``os.path.abspath`` and then converted
    with ``Path.as_uri()``, which percent-encodes every character that is
    not safe in a URL (spaces, ``#``, ``%``, ``?``, ...) and emits the
    correct number of slashes for the current platform.

    Args:
        path: Relative or absolute path of a local file.

    Returns:
        The ``file:///...`` URL as a string.
    """
    # Imported locally so this function does not rely on a file-level import.
    from pathlib import Path

    return Path(os.path.abspath(path)).as_uri()

# This function retrieves quick links from the main URL of a company.
def get_quick_links(main_url):
    """Collect one link per site section from the page at *main_url*.

    Every ``<a>`` element on the page is checked against a list of keywords
    per section (about, contact, services, blog, investors). A link belongs
    to a section when one of the keywords occurs in its ``href`` or in its
    visible text. For each section the shortest matching ``href`` is kept.

    Args:
        main_url: URL of the company website. Values that are not URLs
            (``None``, an empty string, or a placeholder without a scheme
            such as ``"Not Found"``) are accepted and produce no links.

    Returns:
        ``{"Quick_links": {section: href_or_None, ...}}`` with one entry per
        section; ``None`` means no matching link was found.
    """
    section_keywords = {
        "about": ["about", "company", "who-we-are"],
        "contact": ["contact", "get-in-touch", "reach-us"],
        "services": ["service", "what-we-do", "solutions"],
        "blog": ["blog", "news", "insights", "press"],
        "investors": ["investor", "investors", "investor-relations"]
    }

    # A value without a scheme cannot be opened by the browser; report
    # "no links" instead of starting a browser only to fail on navigation.
    if not main_url or "://" not in main_url:
        return {"Quick_links": {section: None for section in section_keywords}}

    section_links = {section: [] for section in section_keywords}

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(main_url)
        time.sleep(2)  # Let the page load

        for a in driver.find_elements(By.TAG_NAME, "a"):
            href = a.get_attribute("href")
            text = a.text.lower()
            if not href:
                continue
            href_lower = href.lower()
            for section, keywords in section_keywords.items():
                # any() stops at the first matching keyword, so a link is
                # added at most once per section.
                if any(kw in href_lower or kw in text for kw in keywords):
                    section_links[section].append(href)
    finally:
        # Always close the browser, even if loading or scanning the page fails.
        driver.quit()

    # Keep the shortest candidate per section (presumably the one closest to
    # the section's landing page).
    shortest_links = {
        section: (min(hrefs, key=len) if hrefs else None)
        for section, hrefs in section_links.items()
    }

    return {"Quick_links": shortest_links}

# This function extracts contact information from a given URL.
def contact_info(file_url):
    """Extract an email address, a phone number and a location from a page.

    The page source is searched with regular expressions for the first email
    address and the first phone-like number; the location is the text of the
    first non-empty ``<address>`` element.

    Args:
        file_url: URL of the page to inspect, or ``None`` when no contact
            page is known.

    Returns:
        ``{"Contact_info": {"Email": ..., "Phone": ..., "Location": ...}}``.
        Every value that could not be found is ``"not available"``; with
        ``file_url=None`` all three are.
    """
    details = {
        "Email": "not available",
        "Phone": "not available",
        "Location": "not available"
    }

    if file_url is None:
        return {"Contact_info": details}

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(file_url)

        html = driver.page_source

        emails = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", html)
        if emails:
            details["Email"] = emails[0]

        # Extract phone numbers (basic pattern)
        phones = re.findall(r"\+?\d[\d\s().-]{7,}\d", html)
        if phones:
            details["Phone"] = phones[0]

        # Extract locations from <address> tags. This stays best-effort: a
        # failure is reported but must not discard the email/phone found above.
        try:
            for addr in driver.find_elements(By.TAG_NAME, "address"):
                address_text = addr.text.strip()
                if address_text:  # Skip empty <address> elements
                    details["Location"] = address_text
                    break
        except Exception as e:
            print("Error extracting location:", e)
    finally:
        # Always close the browser, even if loading or parsing the page fails.
        driver.quit()

    return {"Contact_info": details}


# This function fetches company information from a given URL.
def fetch_info_from_url(file_url):
    """Scrape one company "About" page and enrich it with website details.

    The page at *file_url* is opened in Chrome and the company name, the
    overview text, the labelled fields Website / Industry / Specialties /
    Headquarters and the company size are read from it. The company website
    is then scanned once for quick links, and the contact link found there
    (if any) is scanned for contact details.

    Args:
        file_url: URL of the company page (for the local copies, a
            ``file://`` URL).

    Returns:
        A dict with the keys "Company Name", "Website", "Industry",
        "Company Size", "Specialties", "Headquarters", "Quick Links",
        "Contact Info" and "Description". Fields that could not be read hold
        a "... not found" / "Not Found" placeholder string.
    """
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(file_url)

        time.sleep(2)  # Let the page load

        try:
            # Try the most common LinkedIn company name selector
            company_name = driver.find_element(
                By.CSS_SELECTOR, "h1.org-top-card-summary__title"
            ).text.strip()
        except Exception:
            # Fallback: try a more generic h1 if the above fails
            try:
                company_name = driver.find_element(By.TAG_NAME, "h1").text.strip()
            except Exception as e:
                company_name = f"Company name not found: {e}"

        try:
            overview = driver.find_element(
                By.CSS_SELECTOR,
                "section.org-about-module__margin-bottom p.break-words.white-space-pre-wrap.t-black--light.text-body-medium"
            ).text
        except Exception:
            overview = "Overview not found"

        params = {
            "Website": "Not Found",
            "Industry": "Not Found",
            "Specialties": "Not Found",
            "Headquarters": "Not Found"
        }

        try:
            dl = driver.find_element(By.CSS_SELECTOR, "dl.overflow-hidden")
            dt_elements = dl.find_elements(By.CSS_SELECTOR, "dt.mb1")
            dd_elements = dl.find_elements(By.CSS_SELECTOR, "dd.mb4.t-black--light.text-body-medium")

            # Labels and values are paired purely by position.
            for dt, dd in zip(dt_elements, dd_elements):
                label = dt.text.strip()
                value = dd.text.strip()
                # Only update if label is in our params
                if label in params:
                    params[label] = value
        except Exception as e:
            print("Error extracting parameters:", e)

        company_size = "Company size not found"
        try:
            dl = driver.find_element(By.CSS_SELECTOR, "dl.overflow-hidden")
            dt_elements = dl.find_elements(By.CSS_SELECTOR, "dt.mb1")
            # Unlike above, every <dd> is considered here.
            dd_elements = dl.find_elements(By.CSS_SELECTOR, "dd")

            # Loop through all dt/dd pairs
            for dt, dd in zip(dt_elements, dd_elements):
                label = dt.text.strip()
                value = dd.text.strip()
                if "Company size" in label:
                    company_size = value
                    break
        except Exception as e:
            company_size = f"Company size not found: {e}"
    finally:
        # Always close the browser, even if loading the page fails.
        driver.quit()

    website = params["Website"]
    if website == "Not Found":
        # No website was scraped, so there is nothing to scan for links.
        quick_links = {"Quick_links": {}}
    else:
        # Scan the website once and reuse the result below; every call
        # launches its own browser session.
        quick_links = get_quick_links(website)

    # None when no contact link is known; contact_info() accepts that.
    contact_link = quick_links["Quick_links"].get("contact")

    company_info = {
        "Company Name": company_name,
        "Website": website,
        "Industry": params["Industry"],
        "Company Size": company_size,
        "Specialties": params["Specialties"],
        "Headquarters": params["Headquarters"],
        "Quick Links": quick_links,
        "Contact Info": contact_info(contact_link),
        "Description": overview   # A Summary of the company could be made using NLP here.
    }

    return company_info


# Scrape every locally stored page listed in the websites DataFrame.
records = []
for path in df['paths']:
    file_url = get_url_from_file(path)
    info = fetch_info_from_url(file_url)
    records.append(info)

# Create a DataFrame from the records
df = pd.DataFrame(records)

# Save the DataFrame to a CSV file. An existing file is appended to without
# repeating the header row; otherwise a new file with a header is written.
already_exists = os.path.exists("data.csv")
df.to_csv(
    "data.csv",
    mode='a' if already_exists else 'w',
    header=not already_exists,
    index=False,
)