This notebook aims to identify defects in the dataset and correct them.

In [35]:
import traceback
from pathlib import Path
import json

# Root folder of the scraped transcript data (sibling of the notebook's folder).
folder = Path.cwd().parent / 'transcript_data'

# Keep the file handle and the parsed records under separate names, and decode
# the bundle explicitly as UTF-8 (the encoding used for the transcript files)
# instead of relying on the platform default.
with open(folder / '_bundle.json', encoding='utf8') as bundle_file:
    transcripts = json.load(bundle_file)

len(transcripts)

40002

In [2]:
# Read the sitemap cache, keeping one entry per line (line endings included).
sitemap_path = folder / '_sitemap_cache.txt'
with open(sitemap_path) as sitemap_file:
    lines = list(sitemap_file)

In [3]:
# URLs that have a record in the bundle metadata.
parsed_urls = set(entry['url'] for entry in transcripts)
# URLs listed in the sitemap file, with surrounding whitespace removed.
all_urls = set(map(str.strip, lines))
# Listed URLs for which no record exists.
missed_urls = all_urls.difference(parsed_urls)

In [4]:
# Count the distinct (whitespace-stripped) entries read from the sitemap cache file.
len(all_urls)

40017

In [45]:
from tqdm.auto import tqdm

# Try to read every transcript file referenced by the bundle and report,
# by exception type and path, the ones that cannot be read as UTF-8 text.
for transcript in tqdm(transcripts):
    content_path = folder / transcript['content']

    try:
        content = content_path.read_text(encoding='utf8')
    except Exception as error:
        print(f'{type(error).__name__}: {content_path}')

  0%|          | 0/40002 [00:00<?, ?it/s]

In [46]:
from datetime import datetime
from pathlib import Path
import zipfile


class MotleyFoolDataset:
    """Lazily loaded view over a transcript dataset stored in a folder or a ``.zip`` archive.

    The dataset root must contain a ``_bundle.json`` file holding a list of
    metadata records. Each record has a ``'content'`` entry with the path
    (relative to the root) of the transcript text file and a ``'date'`` entry
    parseable by ``datetime.fromisoformat``. Only the metadata is read when the
    dataset is created; transcript text is read each time an element is accessed.
    """

    def __init__(self, dataset_path):
        """Read the metadata bundle located under ``dataset_path``.

        Args:
            dataset_path: Directory, or path with a ``.zip`` suffix, that
                contains ``_bundle.json`` and the transcript files.
        """
        self.dataset_path = Path(dataset_path)

        # A zip archive is navigated in place through zipfile.Path, which
        # supports the same ``/`` and ``open`` operations used below.
        if self.dataset_path.suffix == '.zip':
            self.dataset_path = zipfile.Path(self.dataset_path)

        bundle_path = self.dataset_path / '_bundle.json'
        # Decode explicitly as UTF-8, matching how the transcript files are
        # read, rather than depending on the platform's default encoding.
        with bundle_path.open('r', encoding='utf8') as bundle:
            self.metadata = json.load(bundle)

    def __len__(self):
        """Return the number of metadata records."""
        return len(self.metadata)

    def __getitem__(self, item):
        """Return the loaded element at ``item``.

        Args:
            item: An integer position, or a slice of positions.

        Returns:
            One loaded element (dict) for an integer, or a list of loaded
            elements for a slice.
        """
        if isinstance(item, slice):
            # Slicing the metadata yields a list of records; load each of them
            # instead of passing the list itself to _load_data.
            return [self._load_data(instance) for instance in self.metadata[item]]
        return self._load_data(self.metadata[item])

    def __iter__(self):
        """Yield every element in metadata order, loading each one on demand."""
        for instance in self.metadata:
            yield self._load_data(instance)

    def _load_data(self, instance):
        """Build a full element from one metadata record.

        Args:
            instance: A metadata record with at least ``'content'`` (relative
                path of the transcript file) and ``'date'`` (ISO format string).

        Returns:
            A new dict with all fields of ``instance``, where ``'date'`` is
            replaced by a ``datetime`` and ``'content'`` by the file's text.
        """
        content_path = self.dataset_path / instance['content']
        with content_path.open('r', encoding='utf8') as content_file:
            return {
                **instance,
                'date': datetime.fromisoformat(instance['date']),
                'content': content_file.read()
            }

In [50]:
# Open the annotated dataset directly from its zip archive in the parent directory.
annotated_archive = Path.cwd().parent / 'TMF_dataset_annotated.zip'
dataset = MotleyFoolDataset(annotated_archive)

In [51]:
# Look up a single record by position; its transcript text is read on access.
dataset[8657]

{'company_name': 'Glu Mobile Inc',
 'company_ticker': 'GLUU',
 'quarter': 'Q2 2019',
 'date': datetime.datetime(2019, 8, 1, 17, 0, tzinfo=datetime.timezone.utc),
 'content': "Contents:\n\n\n\n\nPrepared Remarks\n\n\nQuestions and Answers\n\n\nCall Participants\n\n\n\n\nPrepared Remarks:\n\n\nOperator\n\n\nGood afternoon, ladies and gentlemen, and welcome to the Q2 2019 Glu Mobile earnings conference call. [Operator Instructions].\n\n\nI would now like to turn the conference over to your host, Harman Singh, Vice President of Finance and Investor Relations. You may begin, sir.\n\n\nHarman Singh -- Vise President, finance and Investor Relation\n\n\nGood afternoon, everyone, and thank you for joining us on Glu Mobile's Second Quarter 2019 Earnings Conference Call. On the call today are Nick Earl, President and Chief Executive Officer; and Eric Ludwig, COO and Chief Financial Officer. During this call, we will be making forward-looking statements regarding future events and the future finan

In [48]:
# Usage example for the folder-based dataset.
dataset = MotleyFoolDataset(folder)
len(dataset)  # -> 40002
dataset[3256]  # -> A lazily loaded element
for element in dataset:
    ...  # Iterate over the elements, lazily loading each one on demand

In [49]:
import pandas as pd
# Build a DataFrame with one row per record. This consumes the whole dataset,
# so every transcript's text ends up in memory in the 'content' column.
pd.DataFrame(dataset)

Unnamed: 0,url,title,company_name,company_ticker,quarter,date,content
0,https://www.fool.com/earnings/call-transcripts...,Barracuda Networks Q2 2018 Earnings Conference...,Barracuda Networks,CUDA,Q2 2018,2017-10-10 16:30:00+00:00,Contents:\n\n\n\n\nPrepared Remarks\n\n\nQuest...
1,https://www.fool.com/earnings/call-transcripts...,Delta Air Lines Q3 2017 Earnings Conference Ca...,Delta Air Lines,DAL,Q3 2017,2017-10-11 10:00:00+00:00,Contents:\n\n\n\n\nPrepared Remarks\n\n\nQuest...
2,https://www.fool.com/earnings/call-transcripts...,JP Morgan Chase Co Q3 2017 Earnings Conference...,JP Morgan Chase Co,JPM,Q3 2017,2017-10-12 08:30:00+00:00,Contents:\n\n\n\n\nPrepared Remarks\n\n\nQuest...
3,https://www.fool.com/earnings/call-transcripts...,Citigroup Q3 2017 Earnings Conference Call Tra...,Citigroup,C,Q3 2017,2017-10-12 10:00:00+00:00,Contents:\n\n\n\n\nPrepared Remarks\n\n\nQuest...
4,https://www.fool.com/earnings/call-transcripts...,Bank of America Corporation Q3 2017 Earnings C...,Bank of America Corporation,BAC,Q3 2017,2017-10-13 08:30:00+00:00,Contents:\n\n\n\n\nPrepared Remarks\n\n\nQuest...
...,...,...,...,...,...,...,...
39997,https://www.fool.com/earnings/call-transcripts...,RH (RH) Q3 2023 Earnings Call Transcript | The...,RH,RH,Q3 2023,2023-12-07 17:00:00+00:00,Contents:\n \n \nPrepared Remarks\n \nQuestion...
39998,https://www.fool.com/earnings/call-transcripts...,Johnson Controls International Plc (JCI) Q4 20...,Johnson Controls International Plc,JCI,Q4 2023,2023-12-12 08:30:00+00:00,Contents:\n \n \nPrepared Remarks\n \nQuestion...
39999,https://www.fool.com/earnings/call-transcripts...,Oracle (ORCL) Q2 2024 Earnings Call Transcript...,Oracle,ORCL,Q2 2024,2023-12-11 17:00:00+00:00,Contents:\n \n \nPrepared Remarks\n \nQuestion...
40000,https://www.fool.com/earnings/call-transcripts...,"Innovative Industrial Properties, Inc. Class A...","Innovative Industrial Properties, Inc. Class A",IIPR,Q4 2018,2019-03-14 13:00:00+00:00,Contents:\n\n\n\n\nPrepared Remarks\n\n\nQuest...
