# 1. Extract data from earnings calls

## Adding required packages

1. *Selenium* : https://www.selenium.dev/
2. *Beautiful Soup* : https://beautiful-soup-4.readthedocs.io/en/latest/
3. *WebDriver Manager* : https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/

In [2]:
# !pip install selenium
# !pip install beautifulsoup4
# !pip3 install webdriver-manager
# !pip3 install yfinance
# !pip3 install nsepython

In [3]:
import warnings
warnings.filterwarnings("ignore")

## Importing Selenium and Drivers

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [5]:
from selenium.webdriver.chrome.options import Options

## Importing Data Processing Libs

In [6]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
import requests
import time
import json
import datetime

## 0. Common Utility

In [7]:
def get_driver_data(url:str,params_iter : dict):
    """
    
    Open a URL in headless Chrome and scroll down until lazy-loaded content stops arriving
    ------------------------------------
    
    Input:
    url (str) : url from which data is to be parsed 
    params_iter (dict) : Dictionary of parameters that are required to iterate over the web page
        'scroll_wait_time' : seconds to sleep after every scroll
        'iter_threshold'   : number of scroll iterations after which scrolling stops
    
    ------------------------------------
    Output:
    Selenium.webdriver.Chrome object : Live driver positioned on the loaded page.
        The caller owns the driver and is responsible for quitting it.
    
    ------------------------------------
    Raises:
    ValueError : if the url does not pass url_validator
    KeyError : if params_iter lacks one of the keys listed above
    
    """    
    
    ## Read the parameters and validate the URL before a browser is launched,
    ## so that bad input never leaves a Chrome process behind
    scroll_pause_time = params_iter['scroll_wait_time']
    iter_threshold = params_iter['iter_threshold']
    
    if not url_validator(url):
        raise ValueError("The URL {} is not a valid URL format".format(url))
    
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
    
    try:
        driver.get(url)
        
        ## Height of the page before any scrolling
        last_height = driver.execute_script("return document.body.scrollHeight")
        
        ## Get the whole page data by loading all data from lazy loading page
        iteration = 0
        while True:
            
            iteration += 1
            
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            
            time.sleep(scroll_pause_time)
            
            new_height = driver.execute_script("return document.body.scrollHeight")
            
            if (new_height == last_height) or (iteration == iter_threshold):
                break
            
            ## Compare the next scroll against the height just reached, not the initial one
            last_height = new_height
            
    except Exception:
        ## The driver is only handed over on success; release it on any failure
        driver.quit()
        raise
            
    return driver

In [8]:
def url_validator(url:str) -> bool:
    """
    
    Tell whether a string looks like an http(s)/ftp(s) URL
    ------------------------------------
    
    Input:
    url (str) : url string to be checked 
    
    ------------------------------------
    Output:
    Bool : True when the pattern matches from the start of the string, False otherwise
    
    """
    ## The fragments are concatenated into one pattern; the host alternatives
    ## (domain | localhost | ip) share a single non-capturing group
    pattern_parts = (
        r'^(?:http|ftp)s?://',                                                                      # scheme
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',      # domain...
        r'localhost|',                                                                              # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',                                                     # ...or ip
        r'(?::\d+)?',                                                                               # optional port
        r'(?:/?|[/?]\S+)$',                                                                         # optional path / query
    )
    
    url_pattern = re.compile(''.join(pattern_parts), re.IGNORECASE)
    
    return url_pattern.match(url) is not None

In [9]:
def get_industry_info(ticker,trial=0):
    """
    
    Get the NSE industry classification of a ticker, scraping nseindia.com when it is not cached
    ------------------------------------
    
    Input:
    ticker (str) : NSE symbol to look up
    trial (int) : retry counter used by the recursive retries; callers normally leave it at 0
    
    ------------------------------------
    Output:
    list : [Macro-Economic Sector, Sector, Industry, Basic Industry];
           four empty strings when the ticker is blank, the page has no
           'industryInfo' table, or more than 3 retries were used
    
    ------------------------------------
    Side effects:
    Reads 'industry.csv' from the working directory and appends a row to it
    whenever a ticker had to be scraped.
    
    """
    
    blank_result = ('','','','')
    
    if trial > 3:
        print('Failed for ticker => ',ticker)
        return list(blank_result)
    
    ## Non-string tickers (None, NaN) cannot be looked up
    if (not isinstance(ticker,str)) or (ticker.strip() == ''):
        return list(blank_result)
    
    industry_data_df = pd.read_csv('industry.csv')
    
    cached_rows_df = industry_data_df[industry_data_df['ticker']==ticker]
    
    if cached_rows_df.shape[0]>0:
        return [
            cached_rows_df['Macro-Economic Sector'].iat[0],
            cached_rows_df['Sector'].iat[0],
            cached_rows_df['Industry'].iat[0],
            cached_rows_df['Basic Industry'].iat[0]
        ]
    
    ## Cache miss : scrape the quote page
    print(ticker)
    
    url = 'https://www.nseindia.com/get-quotes/equity?symbol={}'.format(ticker)

    params_iter = {}
    params_iter['scroll_wait_time'] = 5.0
    params_iter['iter_threshold'] = 1

    driver_data = get_driver_data(url , params_iter)
    
    try:
        page_content_str = driver_data.page_source
    finally:
        ## quit() ends the whole browser session; close() only closes the window
        driver_data.quit()

    bs4_soup_data = BeautifulSoup(page_content_str)
    
    industry_table = bs4_soup_data.find('table',id='industryInfo')
    
    if industry_table is None:
        return list(blank_result)
    
    ## The first four strings of the table are treated as column captions,
    ## the following four as the classification values
    cells = list(industry_table.stripped_strings)[4:]
    
    if len(cells) < 4:
        ## Values not rendered (yet) : scrape again, bounded by the trial counter
        return get_industry_info(ticker,trial + 1)
    
    values = cells[:4]
    
    try:
        industry_data_df.loc[len(industry_data_df)] = [ticker,'NSE'] + values

        industry_data_df.to_csv('industry.csv',index=False)
    except (ValueError, OSError):
        ## Row could not be appended or written : retry as before
        return get_industry_info(ticker,trial + 1)
    
    return values

In [10]:
def get_stock_historical_data(symbol,start_date,end_date):
    """
    
    Download historical equity data for a symbol from the NSE API in windows of at most 40 days
    ------------------------------------
    
    Input:
    symbol (str) : NSE symbol
    start_date (str) : first day of the range, formatted as dd-mm-YYYY
    end_date (str) : last day of the range, formatted as dd-mm-YYYY
    
    ------------------------------------
    Output:
    payload_df (Pandas Dataframe) : rows of the 'data' field of every response, concatenated
        in request order; an empty dataframe when start_date is after end_date or
        no rows were returned
    
    ------------------------------------
    Raises:
    ValueError : if a date does not match dd-mm-YYYY
    requests.RequestException : on network failures or HTTP error responses
    
    """
    ## Function-scope import : the notebook's import cell does not provide urllib
    from urllib.parse import quote
    
    baseurl = "https://www.nseindia.com/"
    series = "EQ"
    date_format = "%d-%m-%Y"
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
        'Sec-Fetch-User': '?1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,hi;q=0.8',
    }
    
    dt_start_date = datetime.datetime.strptime(start_date,date_format)
    dt_end_date = datetime.datetime.strptime(end_date,date_format)
    
    if dt_start_date > dt_end_date:
        return pd.DataFrame()
    
    window_span = datetime.timedelta(days=40)
    one_day = datetime.timedelta(days=1)
    
    window_frames = []
    
    ## One session for all windows : the landing page is requested once so the
    ## session holds the cookies that are sent with the API calls
    with requests.Session() as session:
        
        session.get(baseurl, headers=headers, timeout=5)
        
        dt_window_start = dt_start_date
        
        while dt_window_start <= dt_end_date:
            
            dt_window_end = min(dt_window_start + window_span , dt_end_date)
            
            print("\r",dt_window_start," to " , dt_window_end , " with days " ,(dt_window_end - dt_window_start).days)
            
            ## Each request covers its own window; requesting from the overall
            ## start date every time would return overlapping (duplicated) rows
            url = ("https://www.nseindia.com/api/historical/cm/equity?symbol="+quote(str(symbol),safe='')
                   +"&series=[%22"+series+"%22]"
                   +"&from="+dt_window_start.strftime(date_format)
                   +"&to="+dt_window_end.strftime(date_format))
            
            response = session.get(url, headers=headers, timeout=5)
            response.raise_for_status()
            payload = response.json()

            window_df = pd.DataFrame(payload['data'])
            
            if window_df.shape[0]:
                window_frames.append(window_df)
            
            ## Next window starts the day after this one ended
            dt_window_start = dt_window_end + one_day
    
    if not window_frames:
        return pd.DataFrame()
        
    return pd.concat(window_frames)

## 1. Parsing Transcripts

In [11]:
def get_ticker(transcript_header:str) -> str:
    """
    
    Parses out the ticker symbol from the transcript's header 
    ------------------------------------
    
    Input:
    transcript_header (str) : header of the transcripts 
    
    ------------------------------------
    Output:
    ticker_cd (str) : text between the last '(' and the last ')' of the header;
                      '' when the header is not a string or has no such pair
    
    """    
    if not isinstance(transcript_header, str):
        return ''
    
    open_idx = transcript_header.rfind('(')
    close_idx = transcript_header.rfind(')')
    
    ## rfind returns -1 for a missing bracket; slicing with -1 would silently
    ## return a chunk of the header instead of "no ticker"
    if open_idx == -1 or close_idx == -1 or close_idx < open_idx:
        return ''
    
    return transcript_header[open_idx + 1 : close_idx]

In [12]:
def get_metadata(company_name:str):
    """
    
    Get exchange, sector and industry of a company from the Yahoo Finance search API
    ------------------------------------
    
    Input:
    company_name (str) : Company Name
    
    ------------------------------------
    Output:
    list : [exchange, sector, industry] of the first search hit; a field the API
           does not return is '' and all three are '' when the lookup fails
           or yields no hit
    
    """    
    
    yfinance = "https://query2.finance.yahoo.com/v1/finance/search"
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    params = {"q": company_name, "quotes_count": 1, "country": "United States"}

    try:
        ## timeout : a stalled connection must not block the whole scrape
        res = requests.get(url=yfinance, params=params, headers={'User-Agent': user_agent}, timeout=10)
        res.raise_for_status()
        data = res.json()
    except (requests.RequestException, ValueError) as lookup_error:
        ## Best effort : this function is mapped over a whole column, so one
        ## failed lookup is reported and yields blanks instead of aborting
        print('Metadata lookup failed for => ',company_name,' : ',lookup_error)
        return ['', '', '']

    quotes = data.get('quotes') if isinstance(data, dict) else None
    
    if not quotes or not isinstance(quotes[0], dict):
        return ['', '', '']
    
    meta_data = quotes[0]
    
    ## Fields are read independently so one missing key does not blank the others
    exchange = meta_data.get('exchange', '')
    sector = meta_data.get('sector', '')
    industry = meta_data.get('industry', '')
    
    return [ exchange , sector , industry]

In [13]:
def extract_urls(driver_data,params_bs4_filter : dict) -> list:
    """
    
    Collect [header,link] pairs for the anchors of the loaded page that match the filter
    ------------------------------------
    
    Input:
    driver_data (Selenium.webdriver.Chrome object) : driver whose page_source is parsed
    params_bs4_filter (dict) : BeautifulSoup search arguments, read from the keys
                               'name', 'href', 'attrs' and 'recursive'
    
    ------------------------------------
    Output:
    parsed_links_list (list) : one [header,link] entry per matching tag, where header is
                               the tag's first child and link its href attribute
    
    """      
    
    soup = BeautifulSoup(driver_data.page_source)
    
    matching_tags = soup.findAll(
        params_bs4_filter['name'],
        href=params_bs4_filter['href'], 
        attrs=params_bs4_filter['attrs'],
        recursive=params_bs4_filter['recursive'],
    )
    
    header_link_pairs = []
    
    for tag in matching_tags:
        target = tag['href']
        header_link_pairs.append([tag.contents[0], target])
    
    return header_link_pairs

In [14]:
def parse_page_data_for_url(driver ,params_bs4_filter : dict):
    """
    
    Extract embedded urls from the main page in form of Pandas Dataframe
    ------------------------------------
    
    Input:
    driver (Selenium.webdriver.Chrome object) : driver holding the loaded main page
    params_bs4_filter (dict) : Dictionary of parameters that are required for effective parsing using BeautifulSoup
    
    ------------------------------------
    Output:
    extracted_url_df (Pandas Dataframe) : one row per unique transcript with the columns
        header, link, Org Name, ticker_cd,
        stock_exchange, sector, industry (from get_metadata) and
        nse_mes, nse_sector, nse_industry, nse_basic_industry (from get_industry_info)
    
    """
    
    
    ## Get [header,links] pairs for embedded urls 
    new_results = extract_urls(driver,params_bs4_filter)
    
    
    ## Convert [header,links] pairs to pandas dataframe
    extracted_url_df = pd.DataFrame(new_results,columns=['header','link'])
    
    
    ## Data cleaning happens before the lookups below, so that no web request
    ## is spent on rows that are thrown away afterwards
    ## 1. Remove unwanted rows (headers containing '[' and non-text headers)
    unwanted_rows = extracted_url_df['header'].str.contains("[",regex=False,na=True)
    extracted_url_df = extracted_url_df[~unwanted_rows].reset_index(drop=True)
    
    ## 2. Remove duplicate rows
    extracted_url_df = extracted_url_df.drop_duplicates().copy()
    
    
    ## Organisation name is everything before the first '('.
    ## n is passed by keyword : pandas 2.x no longer accepts it positionally
    extracted_url_df['Org Name'] = extracted_url_df['header'].str.split('(', n=1).str[0]
    
    extracted_url_df['ticker_cd'] = extracted_url_df['header'].map(get_ticker)
    
    ## Lookups return one list per row; they are turned into frames sharing the
    ## row index so that an empty page still produces all the columns
    metadata_df = pd.DataFrame(
        extracted_url_df['Org Name'].map(get_metadata).to_list(),
        columns=['stock_exchange','sector','industry'],
        index=extracted_url_df.index,
    )
    
    nse_info_df = pd.DataFrame(
        extracted_url_df['ticker_cd'].map(get_industry_info).to_list(),
        columns=['nse_mes','nse_sector','nse_industry','nse_basic_industry'],
        index=extracted_url_df.index,
    )
    
    extracted_url_df = pd.concat([extracted_url_df , metadata_df , nse_info_df],axis=1)
    
    return extracted_url_df

In [15]:
def _participant_section_tags(bs4_soup_data, heading_tag : str, heading_text : str):
    """Yield the tags following the given heading, up to the next heading of the same tag."""
    heading = bs4_soup_data.find(heading_tag, text=heading_text)
    
    ## A transcript without this section simply has no participants of that kind
    if heading is None:
        return
    
    for sib in heading.find_next_siblings():
        if sib.name == heading_tag:
            break
        yield sib


def _split_corporate_participant(entry_text : str):
    """Split 'Name<nbsp>—<nbsp>Designation' (or 'Name<nbsp>Designation') into (name, designation)."""
    ppt_name , separator , ppt_desig = entry_text.partition("\xa0—\xa0")
    
    if not separator:
        ppt_name , _ , ppt_desig = entry_text.partition("\xa0")
    
    return ppt_name , ppt_desig


def _split_analyst_participant(entry_text : str):
    """Split 'Name<nbsp>x<nbsp>Corp—Designation' into (name, designation, corp)."""
    pieces = entry_text.split("\xa0", 2)
    
    ## Entry without the expected separators : keep the text as the name
    if len(pieces) < 3:
        return entry_text , '' , ''
    
    ppt_name , _ , ppt_corp_x_desig = pieces
    ppt_corp , _ , ppt_desig = ppt_corp_x_desig.partition("—")
    
    return ppt_name , ppt_desig , ppt_corp


def extract_entity_participants(driver_data , transcript_header : str):
    """
    
    Extract participants from the transcripts
    ------------------------------------
    
    Input:
    driver_data (Selenium.webdriver.Chrome object) : Contains data about the page that needs to be parsed 
    transcript_header (str) : header of the transcript; the text before its first '(' is
                              used as the company of the corporate participants
    
    ------------------------------------
    Output:
    parsed_corp_ppts_list (list) : [transcript_header, name, designation, corp] for every
                                   entry under the 'Corporate Participants:' heading
    parsed_ppts_list (list) : [transcript_header, name, designation, corp] for every
                              entry under the 'Analysts:' heading
    A missing heading yields an empty list; entries without text are skipped.
    
    """     
    bs4_soup_data = BeautifulSoup(driver_data.page_source)
    
    section_tag = 'h2'
    
    
    ## For Corporate PPTs
    
    parsed_corp_ppts_list = []
    ppt_corp = transcript_header.split('(', 1)[0]
    
    for sib in _participant_section_tags(bs4_soup_data, section_tag, 'Corporate Participants:'):
        if not sib.text.strip():
            continue
        
        ppt_name , ppt_desig = _split_corporate_participant(sib.text)
        
        parsed_corp_ppts_list.append([transcript_header , ppt_name , ppt_desig , ppt_corp])
    
          
    ## For Analysts
    
    parsed_ppts_list = []
    
    for sib in _participant_section_tags(bs4_soup_data, section_tag, 'Analysts:'):
        if not sib.text.strip():
            continue
        
        ppt_name , ppt_desig , analyst_corp = _split_analyst_participant(sib.text)
        
        parsed_ppts_list.append([transcript_header , ppt_name , ppt_desig , analyst_corp])
    
    
    return parsed_corp_ppts_list,parsed_ppts_list

In [16]:
def get_transcripts_urls_from_url(url:str,params_iter : dict,params_bs4_filter : dict):
    """
    
    Extract url of transcripts from main url
    ------------------------------------
    
    Input:
    url (str) : url from which data is to be parsed 
    params_iter (dict) : Dictionary of parameters that are required to iterate over the web page
    params_bs4_filter (dict) : Dictionary of parameters that are required for effective parsing using BeautifulSoup
    
    ------------------------------------
    Output:
    extracted_url_df (pandas dataframe) : Dataframe of extracted urls
    sel_driver (Selenium.webdriver.Chrome object) : driver that loaded the page; it stays
        open so the caller can reuse it and must be quit by the caller
    
    """   
    
    sel_driver = get_driver_data(url , params_iter)
    
    try:
        extracted_url_df = parse_page_data_for_url(sel_driver , params_bs4_filter)
    except Exception:
        ## Nobody receives the driver when parsing fails, so release it here
        sel_driver.quit()
        raise
    
    return extracted_url_df , sel_driver

In [17]:
def get_disclosures_from_transcripts(driver_data,pading_cols:dict):
    """
    
    Extract the statements made in the 'Presentation:' section of a transcript
    ------------------------------------
    
    Input:
    driver_data (Selenium.webdriver.Chrome object) : driver whose current page is the transcript
    pading_cols (dict) : column name -> constant value, added to every row of the result
    
    ------------------------------------
    Output:
    data_df (pandas dataframe) : columns said_by, info plus the padding columns;
                                 no rows when the page has no 'Presentation:' heading
    
    """
    
    bs4_soup_data = BeautifulSoup(driver_data.page_source)
    
    section_heading = bs4_soup_data.find('h2',text='Presentation:')
    
    speaker = None

    value_list = []

    section_tags = section_heading.find_next_siblings() if section_heading is not None else []

    for tag in section_tags:

        ## The section ends at the next heading; what follows belongs to other
        ## sections (same rule as in extract_entity_participants)
        if tag.name == 'h2':
            break

        if tag.find('span') is not None:
            continue

        ## A tag containing <strong> names the speaker of the following paragraphs
        if tag.find('strong') is not None:
            speaker = tag.text

        else:
            value_list.append([speaker,tag.text])

    data_df = pd.DataFrame(value_list,columns=['said_by','info'])
    
    for col_name , col_value in pading_cols.items():
        data_df[col_name] = col_value
    
    return data_df

In [18]:
def get_question_answers_from_transcripts(driver_data,pading_cols:dict):
    """
    
    Extract question/answer pairs from the 'Questions and Answers:' section of a transcript
    ------------------------------------
    
    Input:
    driver_data (Selenium.webdriver.Chrome object) : driver whose current page is the transcript
    pading_cols (dict) : column name -> constant value, added to every row of the result
    
    ------------------------------------
    Output:
    data_df (pandas dataframe) : columns question_by, question, answer_by, answer plus the
                                 padding columns; no rows when the page has no
                                 'Questions and Answers:' heading
    
    """
    
    bs4_soup_data = BeautifulSoup(driver_data.page_source)
    
    section_heading = bs4_soup_data.find('h2',text='Questions and Answers:')

    speaker = None

    ## [asked_by, question text] of the most recent analyst paragraph
    ## NOTE(review): a question spread over several paragraphs keeps only the last one
    current_question = None

    value_list = []

    section_tags = section_heading.find_next_siblings() if section_heading is not None else []

    for tag in section_tags:

        ## The section ends at the next heading (same rule as in extract_entity_participants)
        if tag.name == 'h2':
            break

        if tag.find('span') is not None:
            continue

        ## A tag containing <strong> names the speaker of the following paragraphs
        if tag.find('strong') is not None:
            speaker = tag.text
            continue

        ## Text before the first speaker line and operator remarks are ignored
        if speaker is None or speaker == 'Operator':
            continue
        
        if 'Analyst' in speaker:
            current_question = [speaker,tag.text]
        
        ## An answer can only be recorded once a question has been seen
        elif current_question is not None:
            value_list.append([current_question[0],current_question[1],speaker,tag.text])

    data_df = pd.DataFrame(value_list,columns=['question_by','question','answer_by','answer'])
    
    for col_name , col_value in pading_cols.items():
        data_df[col_name] = col_value
    
    return data_df

In [19]:
def get_data_from_transcripts(extracted_url_df : pd.DataFrame , sel_driver , max_index=2):
    """
    
    Extract data from transcripts from url dataframe
    1. Disclosures (presentation section)
    2. Questions and Answers
    3. Participants Data
        a. Corporate Participants
        b. Analysts Participants
    ------------------------------------
    
    Input:
    extracted_url_df (pandas dataframe) : one row per transcript; the columns
        'link', 'header', 'Org Name' and 'ticker_cd' are read
    sel_driver (Selenium.webdriver.Chrome object) : driver used to open every transcript link
    max_index (int) : processing stops after the first successfully parsed row whose
        index label is greater than this value; None processes every row
    
    ------------------------------------
    Output:
    corp_ppts_df (pandas dataframe) : Dataframe of Corporate Participants
    analyst_ppts_df (pandas dataframe) : Dataframe of Analysts Participants
    disclosures_df (pandas dataframe) : Dataframe of presentation statements
    qa_df (pandas dataframe) : Dataframe of question/answer pairs
    
    A transcript that cannot be parsed is reported and skipped as a whole.
    
    """ 
    
    ppts_columns = ['Transcript Header','Name','Designation','Corp Name']
    
    corp_ppts_frames = []
    analyst_ppts_frames = []
    disclosures_frames = []
    qa_frames = []
    
    for index,row in extracted_url_df.iterrows():
        
        try:
            
            print('Information Retrieval Started -> ',row['Org Name'],'(',row['ticker_cd'],')')
            sel_driver.get(row['link'])
            
            pading_cols = {
                "ticker_cd" : row['ticker_cd'],
                "Org Name" : row['Org Name']
            }
            
            print('\n')
            
            print('\tStep 1 : Get Disclosures data','\n')
            inter_disclosures_df = get_disclosures_from_transcripts(sel_driver,pading_cols)
            
            
            print('\tStep 2 : Get Q/A data','\n')
            inter_qa_df = get_question_answers_from_transcripts(sel_driver,pading_cols)
        
        
            print('\tStep 3 : Get participants data','\n')
            corp_ppts_list , analyst_ppts_list = extract_entity_participants(sel_driver,row['header'])
            
            inter_corp_ppts_df = pd.DataFrame(corp_ppts_list,columns=ppts_columns)

            inter_analyst_ppts_df = pd.DataFrame(analyst_ppts_list,columns=ppts_columns)
            
            
            ## Frames are only kept once every step succeeded, so a transcript
            ## is either fully present in all four results or in none
            print('\tStep 4 : Combine Data','\n')
            corp_ppts_frames.append(inter_corp_ppts_df)
            analyst_ppts_frames.append(inter_analyst_ppts_df)
            disclosures_frames.append(inter_disclosures_df)
            qa_frames.append(inter_qa_df)
                
        except Exception as retrieval_error:
            print('\r','Information Retrieval Failed -> ',row['Org Name'],'(',row['ticker_cd'],')\n')
            print('\tReason : ',repr(retrieval_error),'\n')
            continue
        
        if max_index is not None and index > max_index:
            break
    
    ## pd.concat takes the frames as ONE list argument
    combined = [
        pd.concat(frames) if frames else pd.DataFrame()
        for frames in (corp_ppts_frames , analyst_ppts_frames , disclosures_frames , qa_frames)
    ]
        
    return combined[0] , combined[1] , combined[2] , combined[3]

In [20]:
def get_transcripts_data_wrapper(url:str,params_iter : dict,params_bs4_filter : dict):
    """
    
    Extract data from transcripts from url 
    ------------------------------------
    
    Input:
    url (str) : url from which data is to be parsed 
    params_iter (dict) : Dictionary of parameters that are required to iterate over the web page
    params_bs4_filter (dict) : Dictionary of parameters that are required for effective parsing using BeautifulSoup
    
    ------------------------------------
    Output:
    extracted_url_df (pandas dataframe) : Dataframe of transcript headers and urls
    corp_ppts_df (pandas dataframe) : Dataframe of Corporate Participants
    analyst_ppts_df (pandas dataframe) : Dataframe of Analysts Participants
    disclosures_df (pandas dataframe) : Dataframe of presentation statements
    qa_df (pandas dataframe) : Dataframe of question/answer pairs
    
    """
    
    extracted_url_df , sel_driver = get_transcripts_urls_from_url(url , params_iter , params_bs4_filter)
    
    try:
        corp_ppts_df , analyst_ppts_df , disclosures_df , qa_df = get_data_from_transcripts(extracted_url_df , sel_driver)
    finally:
        ## The driver is not returned to the caller, so the browser session ends here
        sel_driver.quit()
        
    return extracted_url_df , corp_ppts_df , analyst_ppts_df , disclosures_df , qa_df
    

## 2. Fundamentals Data

In [21]:
def clean_fundamentals_table_data(my_list:list) -> list:
    """
    
    Clean one parsed table row of the fundamentals data
    ------------------------------------
    
    Input:
    my_list (list) : Table row (list of strings) to be cleaned
    
    ------------------------------------
    Output:
    list : the cells with surrounding whitespace stripped, cells that are empty after
           stripping removed, and every ',' and '+' deleted from the remaining cells
    
    """    
    
    trimmed_cells = [str.strip(cell) for cell in my_list]
    
    ## Whitespace-only cells are already '' after stripping, so one filter covers them all
    kept_cells = [cell for cell in trimmed_cells if cell != '']
    
    return [cell.replace(",","").replace("+","") for cell in kept_cells]

In [22]:
def get_fundamentals_data_tables(bs4_soup_data_list , params_fndmntls_data:dict , pading_cols:dict):
    """
    
    Parse every table of every data section of a page into one long-format dataframe
    ------------------------------------
    
    Input:
    bs4_soup_data_list (BeautifulSoup object) : parsed page
    params_fndmntls_data (dict) : optional overrides, read from the keys
        'data_section_tag' (default 'section'), 'data_section_tag_id' (default 'id')
        and 'table_section_tag' (default 'table')
    pading_cols (dict) : column name -> constant value, added to every row of the result
    
    ------------------------------------
    Output:
    parsed_table_df (pandas dataframe) : columns Metric, variable, value,
        fundamental_data_type plus the padding columns; the most recently parsed
        table comes first. A table that cannot be parsed is reported and skipped.
    
    """
    
    data_section_tag = params_fndmntls_data.get('data_section_tag','section')
    data_section_tag_id = params_fndmntls_data.get('data_section_tag_id','id')
    
    table_section_tag = params_fndmntls_data.get('table_section_tag','table')

    parsed_table_df = pd.DataFrame()
    
    for section in bs4_soup_data_list.find_all(data_section_tag):
        
        section_id = section.get(data_section_tag_id,'')
        
        for table in section.find_all(table_section_tag):
            
            print('\t',section_id," -> ",'Data Parsing - Started')

            try:
                values = [
                    clean_fundamentals_table_data(row.text.split("\n"))
                    for row in table.find_all("tr")
                ]
                
                if not values:
                    raise ValueError("table has no rows")
                
                ## The cleaned first row lacks a caption for the first column
                header = ['Metric'] + values[0]
                
                parsed_table_inter_df = pd.DataFrame(values[1:],columns=header)
                
                ## Wide -> long : one row per (Metric, column caption)
                parsed_table_inter_df = pd.melt(
                        parsed_table_inter_df, 
                        id_vars =header[0], 
                        value_vars =header[1:]
                       )

                parsed_table_inter_df['fundamental_data_type'] = section_id
                
                
                if parsed_table_df.shape[0]:
                    ## pd.concat takes the frames as ONE list argument
                    parsed_table_df = pd.concat([parsed_table_inter_df,parsed_table_df])
                else:
                    parsed_table_df = parsed_table_inter_df
                
                print('\r\t',section_id," -> ",'Data Parsing - Success\n')
                
            except Exception as parsing_error:
                print('\r\t',section_id," -> ",'Data Parsing - Failed\n')
                print('\t','Reason : ',repr(parsing_error),'\n')
    
    for col_name , col_value in pading_cols.items():
        parsed_table_df[col_name] = col_value
    
    return parsed_table_df

In [None]:
def get_fundamentals_data_wrapper(extracted_url_df, params_iter=None):
    """
    
    Get Fundamentals data for the identified tickers 
    ------------------------------------
    
    Input:
    extracted_url_df (pandas dataframe) : one row per company; the columns
        'ticker_cd' and 'Org Name' are read
    params_iter (dict) : Dictionary of parameters that are required to iterate over the web page.
        When omitted, a module-level 'params_iter' is used if one exists,
        otherwise {'scroll_wait_time': 5.0, 'iter_threshold': 1}
    
    ------------------------------------
    Output:
    fundamentals_data_df (pandas dataframe) : fundamentals of all companies in long format,
        rows containing missing values dropped, index renumbered from 0;
        the most recently scraped company comes first
    
    """    
    
    if params_iter is None:
        ## Backward compatibility : this function used to read the notebook-level
        ## variable 'params_iter' directly
        params_iter = globals().get('params_iter') or {'scroll_wait_time': 5.0, 'iter_threshold': 1}
    
    params_fndmntls_data = {
        'data_section_tag':'section' ,
        'data_section_tag_id':'id' ,
        'table_section_tag':'table' ,
        'table_section_subtag':'class' ,
    }
    
    company_frames = []
    
    for index,stocks in extracted_url_df.iterrows():
        
        print('Fundamantals Data -> ',stocks['Org Name'],'(',stocks['ticker_cd'],')\n')
    
        url = "https://www.screener.in/company/{}/consolidated/#profit-loss".format(stocks['ticker_cd'])

        driver_data = get_driver_data(url , params_iter)

        try:
            page_content_str = driver_data.page_source
        finally:
            ## quit() ends the whole browser session; close() only closes the window
            driver_data.quit()
        
        bs4_soup_data_list = BeautifulSoup(page_content_str)

        pading_cols = {
            "ticker_cd" : stocks['ticker_cd'],
            "Org Name" : stocks['Org Name']
        }

        inter_fundamentals_data_df = get_fundamentals_data_tables(bs4_soup_data_list,params_fndmntls_data,pading_cols)
        
        if inter_fundamentals_data_df.shape[0]:
            company_frames.append(inter_fundamentals_data_df)
    
    if not company_frames:
        return pd.DataFrame()
    
    ## Newest company first, as in the step-by-step concatenation this replaces
    fundamentals_data_df = pd.concat(company_frames[::-1])
            
    fundamentals_data_df = fundamentals_data_df.dropna().reset_index(drop=True)
    
    return fundamentals_data_df

## Sample Run

In [None]:
## Page that lists the earnings-call transcripts to be scraped
url = "https://alphastreet.com/india/earnings-call-transcripts/"

## Search arguments handed to BeautifulSoup : <a> tags that have an href
## and rel="bookmark", searched recursively
params_bs4_filter = {
    'name' : 'a',
    'href' : True,
    'attrs' : {'rel':'bookmark'},
    'recursive' : True,
}

## Scrolling behaviour : wait 5 seconds after a scroll, stop after 1 iteration
params_iter = {
    'scroll_wait_time' : 5.0,
    'iter_threshold' : 1,
}

In [None]:
## Scrape the transcript listing and every transcript it links to
extracted_url_df , corp_ppts_df , analyst_ppts_df , disclosures_df , qa_df = get_transcripts_data_wrapper(url,params_iter,params_bs4_filter)

In [None]:
## Scrape the fundamentals of every company found above
fundamentals_data_df = get_fundamentals_data_wrapper(extracted_url_df)

In [None]:
## Transcript headers, links and company metadata
extracted_url_df

In [None]:
## Corporate participants per transcript
corp_ppts_df

In [None]:
## Analyst participants per transcript
analyst_ppts_df

In [None]:
## Statements from the presentation section
disclosures_df

In [None]:
## Question / answer pairs
qa_df

In [None]:
## Fundamentals in long format
fundamentals_data_df

In [None]:
## NOTE(review): module-level copy of the body of extract_entity_participants.
## It reads `driver_data` and `transcript_header`, which no visible top-level cell
## assigns - presumably left over from an interactive debugging session; confirm
## before running or removing.
## Unlike the function, the corporate branch below strips "\xa0" and splits on "—".
parsed_corp_ppts_list = []
parsed_ppts_list = []
page_content_str = None
bs4_soup_data_list = None
parsed_links_list = []

## Parse whatever page the driver currently shows
page_content_str = driver_data.page_source
bs4_soup_data_list = BeautifulSoup(page_content_str)


## For Corporate PPTs

params_corp_ppts = {
    'tag_val' : 'h2',
    'text_val' : 'Corporate Participants:'
}

## Heading that opens the section; None when the page has no such heading,
## in which case the loop below fails with AttributeError
target = bs4_soup_data_list.find(params_corp_ppts['tag_val'],
                                 text=params_corp_ppts['text_val'])

## Walk the tags after the heading until the next h2 starts another section
for sib in target.find_next_siblings():
    if sib.name==params_corp_ppts['tag_val']:
        break
    else:
        ## Company name = header text before the first '('
        ppt_corp = transcript_header.split('(', 1)[0]
        try:
            ppt_name , ppt_desig = sib.text.replace("\xa0","").split("—")
        except:
            ## Fallback for entries that do not split into exactly two parts on "—"
            ppt_name , ppt_desig = sib.text.split("\xa0")

        parsed_corp_ppts_list.append([transcript_header , ppt_name , ppt_desig , ppt_corp])


## For Analysts

params_analyst_ppts = {
    'tag_val' : 'h2',
    'text_val' : 'Analysts:'
}

target = bs4_soup_data_list.find(params_analyst_ppts['tag_val'],
                                 text=params_analyst_ppts['text_val'])


for sib in target.find_next_siblings():
    if sib.name==params_analyst_ppts['tag_val']:
        break
    else:
        ## Requires exactly three "\xa0"-separated parts; the middle one is discarded
        ppt_name , _ , ppt_corp_x_desig = sib.text.split("\xa0")

        ## Requires exactly one "—" between company and designation
        ppt_corp , ppt_desig =  ppt_corp_x_desig.split("—")

        parsed_ppts_list.append([transcript_header , ppt_name , ppt_desig , ppt_corp])