In [37]:
import json
import pathlib
import zipfile
from zipfile import ZipFile

# The project root is the parent of the current working directory.
project_dir = pathlib.Path.cwd().parent

# Root of the raw transcript archive.
raw_zip = zipfile.Path(project_dir.joinpath('TMF_dataset.zip'))

# Folder inside the augmented archive that holds the per-transcript JSON records.
augmented_zip = zipfile.Path(
    project_dir.joinpath('augmented_transcript_data.zip')
).joinpath('augmented_transcript_data')

In [38]:
# Load the `_bundle.json` member stored at the root of the raw archive.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so that decoding
# does not silently depend on the platform's locale default.
with raw_zip.joinpath('_bundle.json').open('r', encoding='utf-8') as file:
    transcript_data = json.load(file)

# Display how many transcript records were loaded.
len(transcript_data)

40002

In [39]:
augmented_transcript_data = []

# Collect every JSON record found directly inside the augmented data folder;
# entries with any other suffix are ignored.
for path in augmented_zip.iterdir():
    if path.suffix == '.json':
        # Decode explicitly as UTF-8 (the standard JSON encoding) instead of
        # relying on the platform's locale default.
        with path.open('r', encoding='utf-8') as file:
            augmented_transcript_data.append(json.load(file))

# Display how many augmented records were loaded.
len(augmented_transcript_data)

39030

In [40]:
# Peek at the first augmented record to see which fields it carries.
augmented_transcript_data[0]

{'company_name': 'Clorox Co',
 'company_ticker': 'CLX',
 'quarter': 'Q1',
 'date': '2018-10-31T13:30:00Z',
 'content': '2018-10-31-clorox-co-clx-q1-2019-earnings-conference-call-tra.txt',
 'daily_volatility': 0.01554540002379147,
 'closing_price_day_before': ['2018-10-30', 133.3592],
 'closing_price_day_of': ['2018-10-31', 129.5457],
 'closing_price_day_after': ['2018-11-01', 134.1534]}

In [41]:
# Index the augmented records by transcript file name so each raw transcript
# can be matched with a single dictionary lookup.
mapped_augmented_transcript_data = {
    data['content']: data
    for data in augmented_transcript_data
}

# Metadata fields copied unchanged from the raw transcript record.
copied_fields = ('company_name', 'company_ticker', 'quarter', 'date', 'content')

result = []

for transcript in transcript_data:
    augmented_transcript = mapped_augmented_transcript_data.get(transcript['content'])

    # Skip transcripts that have no augmented record, or whose record lacks
    # the closing price for the day before or the day after.
    if augmented_transcript is None:
        continue
    if augmented_transcript['closing_price_day_before'] is None:
        continue
    if augmented_transcript['closing_price_day_after'] is None:
        continue

    # Only the second element of each closing-price entry (the price) is used.
    before_price = augmented_transcript['closing_price_day_before'][1]
    after_price = augmented_transcript['closing_price_day_after'][1]

    annotated = {field: transcript[field] for field in copied_fields}

    # A strictly positive change is labelled UP; anything else, including
    # an unchanged price, is labelled DOWN.
    if after_price - before_price > 0:
        annotated['direction'] = 'UP'
    else:
        annotated['direction'] = 'DOWN'

    result.append(annotated)


In [42]:
# Number of transcripts that received a direction label.
len(result)

38212

In [43]:
# Tally each label and report its share of the annotated transcripts.
total_count = len(result)
up_count = sum(1 for transcript in result if transcript['direction'] == 'UP')
down_count = sum(1 for transcript in result if transcript['direction'] == 'DOWN')

# The percent sign sits inside the parentheses, next to the number it qualifies.
print(f'UP instances:   {up_count} ({up_count / total_count * 100:.2f}%)')
print(f'DOWN instances: {down_count} ({down_count / total_count * 100:.2f}%)')

UP instances:   19243 (50.36)%
DOWN instances: 18969 (49.64)%


In [44]:
# Order the annotated transcripts in place by their 'date' value (ascending).
result.sort(key=lambda entry: entry['date'])

In [45]:
from zipfile import ZIP_DEFLATED, ZipFile
from tqdm.auto import tqdm

# Write the annotated dataset: the label bundle plus a copy of every
# transcript file that received a label.
annotated_zip_path = project_dir / 'TMF_dataset_annotated.zip'

with ZipFile(annotated_zip_path, 'w', compression=ZIP_DEFLATED) as annotated_zip:
    # The bundle is stored as pretty-printed JSON at the archive root.
    with annotated_zip.open('_bundle.json', 'w') as file:
        file.write(json.dumps(result, indent=2).encode())

    # Copy each labelled transcript from the raw archive under the same name.
    for transcript in tqdm(result):
        with raw_zip.joinpath(transcript['content']).open('rb') as in_file:
            with annotated_zip.open(transcript['content'], 'w') as out_file:
                out_file.write(in_file.read())

  0%|          | 0/38212 [00:00<?, ?it/s]