# 1. Extract data from earnings calls

## Adding required packages

1. *Selenium* : https://www.selenium.dev/
2. *Beautiful Soup* : https://beautiful-soup-4.readthedocs.io/en/latest/
3. *WebDriver Manager* : https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/

In [1]:
# !pip install selenium
# !pip install beautifulsoup4
# !pip3 install webdriver-manager

## Importing Selenium and Drivers

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

## Importing Data Processing Libs

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [4]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.google.com")

In [5]:
def extract_urls(driver_data,params_bs4_filter : dict) -> list:
    
    page_content_str = None
    bs4_soup_data_list = None
    parsed_links_list = []
    
    page_content_str = driver_data.page_source
    bs4_soup_data_list = BeautifulSoup(page_content_str)
    
    for links in bs4_soup_data_list.findAll(params_bs4_filter['name'],
                                            href=params_bs4_filter['href'], 
                                            attrs=params_bs4_filter['attrs'],
                                            recursive=params_bs4_filter['recursive']):
        link = links['href']
        header = links.contents[0]
        
        parsed_links_list.append([header,link])
    
    return parsed_links_list

In [6]:
def url_validator(url:str):
    regex = re.compile(
            r'^(?:http|ftp)s?://' # http:// or https://
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
            r'localhost|' #localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
            r'(?::\d+)?' # optional port
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    
    if re.match(regex, url):
        return True
    
    return False

In [7]:
def parse_page_data(url:str,params_iter : dict,params_bs4_filter : dict):
    
    embedded_url_list = []
    
    SCROLL_PAUSE_TIME = params_iter['scroll_wait_time']
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    
    if not url_validator(url):
        raise ValueError("The URL "&url&" is not a valid URL format")
        pass
    
    driver.get(url)
    
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    iteration = 0
    while True:
        
        iteration += 1
        
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
        time.sleep(SCROLL_PAUSE_TIME)
        
        new_height = driver.execute_script("return document.body.scrollHeight")
        
        if (new_height == last_height) or (iteration == params_iter['iter_threshold']):
            break
        
    new_results = extract_urls(driver,params_bs4_filter)
    
    extracted_url_df = pd.DataFrame(new_results,columns=['header','link'])
    
    extracted_url_df = extracted_url_df[~extracted_url_df['header'].str.contains("\[",na=True)]
    
    extracted_url_df.reset_index()
    
    return extracted_url_df

        

In [8]:
url = "https://alphastreet.com/india/earnings-call-transcripts/"

params_bs4_filter = {}
params_bs4_filter['name'] = 'a'
params_bs4_filter['href'] = True
params_bs4_filter['attrs'] = {'rel':'bookmark'}
params_bs4_filter['recursive'] = True

params_iter = {}
params_iter['scroll_wait_time'] = 5.0
params_iter['iter_threshold'] = 2

extracted_url_df = parse_page_data(url , params_iter , params_bs4_filter)

In [9]:
extracted_url_df

Unnamed: 0,header,link
0,SBI Cards and Payment Services Ltd (SBICARD) Q...,https://alphastreet.com/india/sbi-cards-and-pa...
3,SBI Cards and Payment Services Ltd (SBICARD) Q...,https://alphastreet.com/india/sbi-cards-and-pa...
5,Star Health and Allied Insurance Co Ltd (STARH...,https://alphastreet.com/india/star-health-and-...
7,CSB Bank Limited (CSBBANK) Q4 FY23 Earnings Co...,https://alphastreet.com/india/csb-bank-limited...
9,Mahindra & Mahindra Financial Services Limited...,https://alphastreet.com/india/mahindra-mahindr...
...,...,...
125,AU Small Finance Bank Ltd (AUBANK) Q4 FY23 Ear...,https://alphastreet.com/india/au-small-finance...
127,Bajaj Auto Limited (BAJAJAUTO) Q4 FY23 Earning...,https://alphastreet.com/india/bajaj-auto-limit...
129,HDFC Asset Management Company Ltd (HDFCAMC) Q4...,https://alphastreet.com/india/hdfc-asset-manag...
131,Meghmani Finechem Limited (MFL) Q4 FY23 Earnin...,https://alphastreet.com/india/meghmani-fineche...
