# 1. Extract data from earnings calls

## Adding required packages

1. *Selenium* : https://www.selenium.dev/
2. *Beautiful Soup* : https://beautiful-soup-4.readthedocs.io/en/latest/
3. *WebDriver Manager* : https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/

In [1]:
# !pip install selenium
# !pip install beautifulsoup4
# !pip3 install webdriver-manager

## Importing Selenium and Drivers

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

## Importing Data Processing Libs

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [4]:
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# driver.get("https://www.google.com")

In [5]:
def url_validator(url:str) -> bool:
    """
    
    Validates if the url have correct format
    ------------------------------------
    
    Input:
    url (str) : url string to be checked 
    
    ------------------------------------
    Output:
    Bool 
    
    """
    regex = re.compile(
            r'^(?:http|ftp)s?://' # http:// or https://
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
            r'localhost|' #localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
            r'(?::\d+)?' # optional port
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    
    if re.match(regex, url):
        return True
    
    return False

In [6]:
def get_driver_data(url:str,params_iter : dict):
    """
    
    Get the html data for the page opened by Selenium driver
    ------------------------------------
    
    Input:
    url (str) : url from which data is to be parsed 
    params_iter (dict) : Dictionary of parameters that are required to iterate over the web page
    
    ------------------------------------
    Output:
    Selenium.webdriver.Chrome object : Contains data about the page that needs to be parsed 
    
    """    
    
    SCROLL_PAUSE_TIME = params_iter['scroll_wait_time']
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    
    ## Check if the URL has correct format
    if not url_validator(url):
        raise ValueError("The URL "&url&" is not a valid URL format")
        pass
    
    driver.get(url)
    
    ## Check what was the last height of the page
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    ## Get the whole page data by loading all data from lazy loading page
    iteration = 0
    while True:
        
        iteration += 1
        
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
        time.sleep(SCROLL_PAUSE_TIME)
        
        new_height = driver.execute_script("return document.body.scrollHeight")
        
        if (new_height == last_height) or (iteration == params_iter['iter_threshold']):
            break
            
    return driver

In [7]:
def extract_urls(driver_data,params_bs4_filter : dict) -> list:
    """
    
    Extract embedded urls from the main page in form of [header,link] pairs
    ------------------------------------
    
    Input:
    driver_data (Selenium.webdriver.Chrome object) : Contains data about the page that needs to be parsed 
    params_bs4_filter (dict) : Dictionary of parameters that are required for effective parsing using BeatifulSoup
    
    ------------------------------------
    Output:
    parsed_links_list (list) : List of [header,link] for embedded urls in the main page
    
    """      
    
    page_content_str = None
    bs4_soup_data_list = None
    parsed_links_list = []
    
    page_content_str = driver_data.page_source
    bs4_soup_data_list = BeautifulSoup(page_content_str)
    
    for links in bs4_soup_data_list.findAll(params_bs4_filter['name'],
                                            href=params_bs4_filter['href'], 
                                            attrs=params_bs4_filter['attrs'],
                                            recursive=params_bs4_filter['recursive']):
        link = links['href']
        header = links.contents[0]
        
        parsed_links_list.append([header,link])
    
    return parsed_links_list

In [8]:
def parse_page_data_for_url(driver ,params_bs4_filter : dict):
    """
    
    Extract embedded urls from the main page in form of Pnadas Dataframe
    ------------------------------------
    
    Input:
    url (str) : url from which data is to be parsed 
    params_iter (dict) : Dictionary of parameters that are required to iterate over the web page
    params_bs4_filter (dict) : Dictionary of parameters that are required for effective parsing using BeatifulSoup
    
    ------------------------------------
    Output:
    extracted_url_df (Pandas Dataframe) : Pandas Dataframe with header and corresponding urls
    
    """
    
    
    ## Get [header,links] pairs for embedded urls 
    new_results = extract_urls(driver,params_bs4_filter)
    
    
    ## Convert [header,links] pairs to pandas dataframe
    extracted_url_df = pd.DataFrame(new_results,columns=['header','link'])
    
    
    ## Data cleaning:
    ## 1. Remove unwanted rows
    extracted_url_df = extracted_url_df[~extracted_url_df['header'].str.contains("\[",na=True)]
    extracted_url_df.reset_index(inplace = True)
    extracted_url_df.drop('index',axis=1,inplace=True)
    
    
    return extracted_url_df

In [9]:
def extract_entity_participants(driver_data , transcript_header : str):
    """
    
    Extract participants from the transcripts
    ------------------------------------
    
    Input:
    driver_data (Selenium.webdriver.Chrome object) : Contains data about the page that needs to be parsed 
    org_name (str) : Company name to which the transcripts belongs to
    params_bs4_filter_ppts (dict) : Dictionary of parameters that are required for effective parsing of participants using BeatifulSoup
    
    ------------------------------------
    Output:
    parsed_links_list (list) : List of ppts data [[],...]
    
    """     
    parsed_corp_ppts_list = []
    parsed_ppts_list = []
    page_content_str = None
    bs4_soup_data_list = None
    parsed_links_list = []
    
    page_content_str = driver_data.page_source
    bs4_soup_data_list = BeautifulSoup(page_content_str)
    
    
    ## For Corporate PPTs
    
    params_corp_ppts = {
        'tag_val' : 'h2',
        'text_val' : 'Corporate Participants:'
    }
    
    target = bs4_soup_data_list.find(params_corp_ppts['tag_val'],
                                     text=params_corp_ppts['text_val'])

    for sib in target.find_next_siblings():
        if sib.name==params_corp_ppts['tag_val']:
            break
        else:
            ppt_corp = transcript_header.split('(', 1)[0]
            try:
                ppt_name , ppt_desig = sib.text.split("\xa0—\xa0")
            except:
                ppt_name , ppt_desig = sib.text.split("\xa0")

            parsed_corp_ppts_list.append([transcript_header , ppt_name , ppt_desig , ppt_corp])
    
          
    ## For Analysts
    
    params_analyst_ppts = {
        'tag_val' : 'h2',
        'text_val' : 'Analysts:'
    }
    
    target = bs4_soup_data_list.find(params_analyst_ppts['tag_val'],
                                     text=params_analyst_ppts['text_val'])
    

    for sib in target.find_next_siblings():
        if sib.name==params_analyst_ppts['tag_val']:
            break
        else:
            ppt_name , _ , ppt_corp_x_desig = sib.text.split("\xa0")
            
            ppt_corp , ppt_desig =  ppt_corp_x_desig.split("—")

            parsed_ppts_list.append([transcript_header , ppt_name , ppt_desig , ppt_corp])
            
    
    
    return parsed_corp_ppts_list,parsed_ppts_list

In [10]:
def get_transcripts_data_wrapper(url:str,params_iter : dict,params_bs4_filter : dict):
    
    corp_ppts_df = pd.DataFrame()
    analyst_ppts_df = pd.DataFrame()
    
    
    sel_driver = get_driver_data(url , params_iter)
    
    extracted_url_df = parse_page_data_for_url(sel_driver , params_bs4_filter)
    
    for index,row in extracted_url_df.iterrows():
        
        sel_driver.get(row['link'])
        
        corp_ppts_list , analyst_ppts_list = extract_entity_participants(sel_driver,row['header'])
        
        inter_corp_ppts_df = pd.DataFrame(corp_ppts_list,columns=['Transcript Header','Name','Designation','Corp Name'])
        
        inter_analyst_ppts_df = pd.DataFrame(analyst_ppts_list,columns=['Transcript Header','Name','Designation','Corp Name'])
        
        if corp_ppts_df.shape[1] > 0:
            corp_ppts_df = corp_ppts_df.append(inter_corp_ppts_df)
            analyst_ppts_df = analyst_ppts_df.append(inter_analyst_ppts_df)
        else:
            corp_ppts_df = inter_corp_ppts_df
            analyst_ppts_df = inter_analyst_ppts_df
        
        if index > 2:
            break
        
    return corp_ppts_df , analyst_ppts_df
    

In [11]:
url = "https://alphastreet.com/india/earnings-call-transcripts/"

params_bs4_filter = {}
params_bs4_filter['name'] = 'a'
params_bs4_filter['href'] = True
params_bs4_filter['attrs'] = {'rel':'bookmark'}
params_bs4_filter['recursive'] = True

params_iter = {}
params_iter['scroll_wait_time'] = 5.0
params_iter['iter_threshold'] = 1

In [12]:
corp_ppts_df , analyst_ppts_df = get_transcripts_data_wrapper(url,params_iter,params_bs4_filter)

  corp_ppts_df = corp_ppts_df.append(inter_corp_ppts_df)
  analyst_ppts_df = analyst_ppts_df.append(inter_analyst_ppts_df)
  corp_ppts_df = corp_ppts_df.append(inter_corp_ppts_df)
  analyst_ppts_df = analyst_ppts_df.append(inter_analyst_ppts_df)
  corp_ppts_df = corp_ppts_df.append(inter_corp_ppts_df)
  analyst_ppts_df = analyst_ppts_df.append(inter_analyst_ppts_df)


In [13]:
corp_ppts_df

Unnamed: 0,Transcript Header,Name,Designation,Corp Name
0,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,K. N. Radhakrishnan,Director and Chief Executive Officer,TVS Motor Company Ltd
1,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,K. Gopala Desikan,Group Chief Financial Officer,TVS Motor Company Ltd
0,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,K. N. Radhakrishnan,Director and Chief Executive Officer,TVS Motor Company Ltd
1,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,K. Gopala Desikan,Group Chief Financial Officer,TVS Motor Company Ltd
0,Tata Power Company Limited (TATAPOWER) Q4 FY23...,Dr. Praveer Sinha,Chief Executive Officer and Managing Director,Tata Power Company Limited
1,Tata Power Company Limited (TATAPOWER) Q4 FY23...,Sanjeev Churiwala,Chief Financial Officer,Tata Power Company Limited
0,Dabur India Limited (DABUR) Q4 FY23 Earnings C...,Gagan Ahluwalia,Vice President Corporate Affairs,Dabur India Limited
1,Dabur India Limited (DABUR) Q4 FY23 Earnings C...,Mohit Malhotra,Chief Executive Officer,Dabur India Limited
2,Dabur India Limited (DABUR) Q4 FY23 Earnings C...,Ankush Jain,Chief Financial Officer,Dabur India Limited


In [14]:
analyst_ppts_df

Unnamed: 0,Transcript Header,Name,Designation,Corp Name
0,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Annamalai Jayaraj,Analyst,Batlivala & Karani Securities India Pvt Ltd
1,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Chandramouli Muthiah,Analyst,Goldman Sachs
2,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Pramod Kumar,Analyst,UBS Securities
3,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Kapil Singh,Analyst,Nomura
4,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Gunjan Prithyani,Analyst,BOA
5,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Amyn Pirani,Analyst,JPMorgan
6,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Pramod Amthe,Analyst,Incred Capital
7,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Hitesh,Analyst,CLSA
8,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Aditya Makharia,Analyst,HDFC Securities
0,TVS Motor Company Ltd (TVSMOTOR) Q4 FY23 Earni...,Annamalai Jayaraj,Analyst,Batlivala & Karani Securities India Pvt Ltd
