In [22]:
!pip install requests beautifulsoup4



In [23]:
!pip install selenium



In [24]:
#https://www.screener.in/company/RELIANCE/consolidated/

import pandas as pd            #to work with the data frames
import requests                #to request webpages for scrapping 
from bs4 import BeautifulSoup  #to parse html
from selenium import webdriver #to scrappages that require javascript
from selenium.webdriver.common.by import By #to select html elements by their attributes
from selenium.webdriver.chrome.service import Service   #to use the chrome driver
from selenium.webdriver.chrome.options import Options #to set options for the chrome driver
from webdriver_manager.chrome import ChromeDriverManager #to manage the chrome driver

## Getting user input and make screener link for the same

In [25]:
#symbol  = 'RELIANCE'
symbol = input("Enter symbol of the company : ").strip().upper()

def get_screener_link(sym):
    sym_url = f"https://www.screener.in/company/{sym}/consolidated/"
    return sym_url
    
url = get_screener_link(symbol)

## Web scrapping for the screener link

In [26]:
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Ensure GUI is off
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Set up the WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Navigate to the URL
driver.get(url)

page_source = driver.page_source

#print(page_source)

# Close the WebDriver
driver.quit()




#### Extracting Company website

In [27]:
from bs4 import BeautifulSoup

def extract_company_website(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the first <a> tag within the div class 'company-links'
    company_links_div = soup.find('div', class_='company-links')
    if company_links_div:
        website_link = company_links_div.find('a', href=True)  # Look for the first <a> tag with href
        if website_link:
            return website_link['href']
    return None

# Call the function
company_website = extract_company_website(page_source)
print(f"Company Website: {company_website}")


Company Website: http://www.ril.com


### Getting annual reports links

In [28]:

# Parse the HTML
soup = BeautifulSoup(page_source, 'html.parser')

# Find the first `ul` with class `list-links` and extract the links
ul = soup.find_all('ul', class_='list-links')[1]  # Adjust index to select the desired `ul`

# Extract all <a> tags within the selected `ul`
links = ul.find_all('a', href=True)

# Print the links and their text
for link in links:
    href = link['href']
    text = link.get_text(strip=True)
    print(f"Link: {href}, Text: {text}")


Link: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=d473280d-1c2b-4037-8ff9-dfbb82aa2c7e.pdf, Text: Financial Year 2024from bse
Link: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=\b55b5dfc-a3bf-4f24-9d7f-ca09774a1dd9.pdf, Text: Financial Year 2023from bse
Link: https://www.bseindia.com/bseplus/AnnualReport/500325/74185500325.pdf, Text: Financial Year 2022from bse
Link: https://www.bseindia.com/bseplus/AnnualReport/500325/68509500325.pdf, Text: Financial Year 2021from bse
Link: https://www.bseindia.com/bseplus/AnnualReport/500325/5003250320.pdf, Text: Financial Year 2020from bse
Link: https://www.bseindia.com/bseplus/AnnualReport/500325/5003250319.pdf, Text: Financial Year 2019from bse
Link: https://www.bseindia.com/bseplus/AnnualReport/500325/5003250318.pdf, Text: Financial Year 2018from bse
Link: https://www.bseindia.com/bseplus/AnnualReport/500325/5003250317.pdf, Text: Financial Year 2017from bse
Link: https://www.bseindia.com/bseplus/AnnualReport/500325/500325

### Fetching from company website

In [29]:
import requests
from bs4 import BeautifulSoup

# Fetch the sitemap XML
sitemap_url =  f"{company_website}/sitemap.xml"
response = requests.get(sitemap_url)
sitemap_xml = response.content

# Parse the XML
soup = BeautifulSoup(sitemap_xml, 'xml')

# Extract all <loc> tags which contain the URLs
urls = [loc.get_text() for loc in soup.find_all('loc')]

investor_keywords = ['corporate-governance']
filtered_links = [url for url in urls if any(keyword in url for keyword in investor_keywords)]

# Print the URLs
for url in filtered_links:
    print(url)

https://www.ril.com/investors/shareholders-information/corporate-governance


In [30]:
def extract_governance_links(url):
    """
    Extracts all governance-related links from the specified URL, including nested sections.

    Args:
    - url (str): The URL of the webpage to scrape.

    Returns:
    - list: A list of dictionaries with 'text' and 'link' keys for governance-related links.
    """
    try:
        # Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()  # Raise error for bad status codes
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all 'a' tags with 'href' attribute
        links = soup.find_all('a', href=True)
        
        # Keywords related to governance
        governance_keywords = [
            "governance", "policy", "compliance", "audit", "code of conduct",
            "board of directors", "committee", "ethics", "regulations"
        ]
        
        # Filter links based on governance keywords
        governance_links = []
        for link in links:
            link_text = link.get_text(strip=True).lower()
            link_href = link['href']
            
            # Check if the link text contains any governance-related keyword
            if any(keyword in link_text for keyword in governance_keywords):
                governance_links.append({'text': link_text, 'link': link_href})
        
        return governance_links

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return []

def fetch_governance_links_from_multiple_pages(filtered_links):
    """
    Fetches governance-related links from a list of URLs (filtered_links).

    Args:
    - filtered_links (list): A list of URLs to scrape.

    Returns:
    - list: A list of dictionaries with 'text' and 'link' keys for governance-related links.
    """
    all_governance_links = []
    
    for url in filtered_links:
        print(f"Fetching governance links from: {url}")
        governance_links = extract_governance_links(url)
        all_governance_links.extend(governance_links)
    
    return all_governance_links



governance_links_from_all_pages = fetch_governance_links_from_multiple_pages(filtered_links)

for item in governance_links_from_all_pages:
    print(f"Text: {item['text']}, Link: {item['link']}")


Fetching governance links from: https://www.ril.com/investors/shareholders-information/corporate-governance
Text: ril business partner code of conduct (bpcoc), Link: /eb2b/bpcoc
Text: board of directors, Link: /about/board-of-directors
Text: board committees, Link: /about/board-committees
Text: corporate governance report, Link: /investors/shareholders-information/corporate-governance
Text: disclosures under regulation 46 and 62 of sebi (lodr) regulations, 2015, Link: /investors/shareholders-information/disclosures-under-regulation-46-and-62-of-sebi-lord-regulations-2015
Text: tax strategy & policy, Link: /investors/resource-center/tax-strategy-policy
Text: ril business partner code of conduct (bpcoc), Link: /eb2b/bpcoc
Text: corporate governance report for the quarter and half-year ended september 30, 2024, Link: https://rilstaticasset.akamaized.net/sites/default/files/2024-10/CGSept2024.pdf
Text: corporate governance report for the quarter ended june 30, 2024, Link: https://rilstatic