In [1]:
import os.path
import re

import requests
from bs4 import BeautifulSoup
from dateutil.parser import ParserError
from dateutil.parser import parse

In [33]:
url = "https://www.fool.com/earnings/call-transcripts/2020/10/15/accd-q2-2021-earnings-call-transcript/"

In [34]:
url

'https://www.fool.com/earnings/call-transcripts/2020/10/15/accd-q2-2021-earnings-call-transcript/'

In [38]:
# Download and parse
transcript_html = requests.get(url).text    
soup = BeautifulSoup(transcript_html, 'lxml')
transcript_body = soup.find(class_='tailwind-article-body')

In [37]:
transcript_body

<div class="tailwind-article-body">
<div class="image imgR"><img alt="Logo of jester cap with thought bubble." src="https://g.foolcdn.com/misc-assets/fool-transcripts-logo.png"/>
<p class="caption">Image source: The Motley Fool.</p>
</div>
<p><strong>Accolade Inc</strong> <span class="font-bold whitespace-nowrap" data-id="343206">(<a class="ticker-symbol" href="/quote/nasdaq/accd/">ACCD</a><span class="text-red-900 ticker-change"> -1.35%</span>)</span><br/>Q2 2021 Earnings Call<br/><span id="date">Oct 14, 2020</span>, <em id="time">5:00 p.m. ET</em></p>
<h2>Contents:</h2>
<ul>
<li>Prepared Remarks</li>
<li>Questions and Answers</li>
<li>Call Participants</li>
</ul>
<h2>Prepared Remarks:</h2>
<p></p>
<p><strong>Operator</strong></p>
<p>Ladies and gentlemen, thank you for standing by, and welcome to the Accolade second-quarter 2021 earnings results conference call. [Operator instructions] Please be advised that today's conference is being recorded. [Operator instructions] I would now lik

In [42]:
transcript_body.find('a')

<a class="ticker-symbol" href="/quote/nasdaq/accd/">ACCD</a>

'ACCD'

In [27]:
# === Preprocess cleanup ===

# Removes Watermark
transcript_body.find(class_='image imgR').decompose()

# Remove disclaimer at the end of the article
terms = transcript_body.find('a', string='Terms and Conditions')
if terms is not None:
    disclaimer = terms.parent
    disclaimer.decompose()

# Removes useless tags for resulting transcript and simplify parsing
# The following will also simplify mentions of participants
for tag in transcript_body.find_all(['strong', 'em', 'span']):
    tag.unwrap()

# Remove added links at the bottom of the page
for link in transcript_body.find_all('a'):
    parent = link.parent
    link.decompose()
    if not parent.contents:
        parent.decompose()

# Remove ads from Motley Fool
for ad in transcript_body.find_all(class_='article-pitch-container'):
    ad.decompose()

transcript_body.smooth()


# Extraction

In [5]:
def parse_date(text):
    corrected = re.sub(r'(\D)([0-9])(\d\d)(\D)', r'\1\2:\3\4', text)
    return parse(corrected, tzinfos={'ET': 'EST'})

In [28]:
# === Extracting information ===
title = soup.title.text
quarter_reg = re.search(r'Q[1-4] 20[0-9]{2}', title)
company_ticker_reg = re.search(r'\((?:(?:NYSE|NASDAQ):\s?)?([A-Z\d]+[-.: ]?[A-Z\d]*)\)', title)

# First paragraph is company's information
meta = transcript_body.find('p', recursive=False)

# If the company is not found in the title, try extracting it from the information at the top of the page
if company_ticker_reg is None:
    company_ticker_reg = re.search(r'\((?:(?:NYSE|NASDAQ):\s?)?([A-Z\d]+[-.: ]?[A-Z\d]*)\)', meta.get_text())

In [30]:
meta

<p>Accolade Inc ( -1.35%)<br/>Q2 2021 Earnings Call<br/>Oct 14, 2020, 5:00 p.m. ET</p>

In [22]:
re.match(r'\s', '\xa0')

<re.Match object; span=(0, 1), match='\xa0'>

In [24]:
company_ticker_reg.group(1)

'PROF'

In [8]:

# Date is inlined date thanks to previous cleanup
try:
    event_dt = parse_date(list(meta.strings)[-1])
except ParserError:
    # Sometimes the date is in the second paragraph, the 1st being the name of the company
    meta, name = meta.findNextSibling('p', recursive=False), meta
    name.decompose()
    event_dt = parse_date(list(meta.strings)[-1])

# If the quarter cannot be extracted from the title, it may not contain the year.
# We try to capture the quarter form the first paragraph, even though it is more random
if quarter_reg is None:
    quarter_reg = re.search(r'Q[1-4] 20[0-9]{2}', meta.get_text())

if quarter_reg is not None:
    quarter = quarter_reg.group()
# If we still have no quarter, we'll try to deduct it from the date
else:
    q = re.search(r'Q([1-4]) ', title).group(1)  # Just fail if not present
    # call may be for the same year if consistent with current month, otherwise take previous
    y = event_dt.year if event_dt.month > int(q) * 3 else event_dt.year - 1
    quarter = f'Q{q} {y}'

# Meta data is no longer required
meta.decompose()

# Other metadata
company_ticker = re.search(r'\((?:NYSE: ?)?([A-Z\d]+[-.: ]?[A-Z\d]*)\)', title).group(1)
company_name = re.search(r'^(.*?) (?:\(|Q[1-4])', title).group(1)

{
    'url': url,
    'title': title,
    'company_name': company_name,
    'company_ticker': company_ticker,
    'quarter': quarter,
    'date': event_dt.isoformat(),
    'content': transcript_body.get_text(separator="\n").strip(),
}

IndexError: list index out of range

In [235]:
print(transcript_body.get_text(separator="\n").strip())

Contents:




Prepared Remarks


Questions and Answers


Call Participants




Prepared Remarks:


Operator


Ladies and gentlemen, thank you for standing by. Welcome to the American Electric Power Q3 2017 earnings conference call. At this time, all participants are in a listen-only mode. Later we'll conduct a question and answer session. Instructions will be given at that time. If you should require assistance during the call, please press *, then 0. As a reminder, this conference is being recorded. At this time, I would now like to turn the conference over to our host, Ms. Bette Jo Rozsa. Please go ahead.


Bette Jo Rozsa -- Managing Director, Investor Relations


Thank you, Rich. Good morning, everyone, and welcome to the Q3 2017 earnings call for American Electric Power. Thank you for taking the time to join us today. Our earnings release, presentation slides, and related financial information are available on our website at aep.com.


Today, we will be making forward-looking state

In [5]:
from pathlib import Path
import json

with open(Path.cwd().parent / 'transcript_data' / '_bundle.json') as transcripts:
    transcripts = json.load(transcripts)

In [16]:
import pandas as pd

df = pd.DataFrame(mapped_transcript.values())

In [17]:
df

Unnamed: 0,company_name,company_ticker,quarter,date,content
0,Barracuda Networks,CUDA,Q2,2017-10-10T16:30:00+00:00,2017-10-11-barracuda-networks-q2-2018-earnings...
1,Delta Air Lines,DAL,Q3,2017-10-11T10:00:00+00:00,2017-10-12-delta-air-lines-q3-2017-earnings-co...
2,JP Morgan Chase Co,JPM,Q3,2017-10-12T08:30:00+00:00,2017-10-12-jp-morgan-chase-co-q3-2017-earnings...
3,Citigroup,C,Q3,2017-10-12T10:00:00+00:00,2017-10-13-citigroup-q3-2017-earnings-conferen...
4,Bank of America Corporation,BAC,Q3,2017-10-13T08:30:00+00:00,2017-10-16-bank-of-america-corporation-q3-2017...
...,...,...,...,...,...
39386,DocuSign,DOCU,Q3,2023-12-07T17:00:00+00:00,2023-12-08-docusign-docu-q3-2024-earnings-call...
39387,Lululemon Athletica,LULU,Q3,2023-12-07T16:30:00+00:00,2023-12-08-lululemon-athletica-lulu-q3-2023-ea...
39388,RH,RH,Q3,2023-12-07T17:00:00+00:00,2023-12-08-rh-rh-q3-2023-earnings-call-transcr...
39389,"Innovative Industrial Properties, Inc. Class A",IIPR,Q4,2019-03-14T13:00:00+00:00,innovative-industrial-properties-inc-class-a-i...
