In [4]:
import pandas as pd
import numpy as np
import json
from edgar import *
from tqdm import tqdm
from joblib import Parallel, delayed
import pickle
import re
import multiprocessing
# Contact identity handed to edgar's set_identity (presumably sent along with
# SEC EDGAR requests — confirm against the edgar package documentation).
set_identity('8049qq.com@gmail.com')

# Period tag interpolated into the pickle/CSV file names used in the cells below.
year = '10-20'

In [5]:
# Load the pickled freeze collection for the period selected by `year`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at pickles produced by a trusted run of this project.
collection_path = f'data/freeze_collection_{year}.pkl'
with open(collection_path, 'rb') as pkl_file:
    freeze_collection = pickle.load(pkl_file)

In [6]:
# Map accession number -> filing date for every filing whose first search
# result (value[0]) is truthy.
non_empty = {
    filing.accession_no: filing.filing_date
    for filing, searches in freeze_collection.items()
    if searches[0]
}

In [7]:
# Display the accession_no -> filing_date mapping of filings whose first search result is non-empty.
non_empty

{'0001477932-20-007581': datetime.date(2020, 12, 31),
 '0001493152-20-024669': datetime.date(2020, 12, 31),
 '0001387131-20-012222': datetime.date(2020, 12, 31),
 '0001493152-20-024624': datetime.date(2020, 12, 31),
 '0001213900-20-045750': datetime.date(2020, 12, 31),
 '0001213900-20-045754': datetime.date(2020, 12, 31),
 '0001213900-20-045758': datetime.date(2020, 12, 31),
 '0001640334-20-003182': datetime.date(2020, 12, 31),
 '0000096536-20-000026': datetime.date(2020, 12, 31),
 '0001628280-20-017878': datetime.date(2020, 12, 31),
 '0001434389-20-000009': datetime.date(2020, 12, 31),
 '0001434389-20-000010': datetime.date(2020, 12, 31),
 '0001376474-20-000333': datetime.date(2020, 12, 31),
 '0001654954-20-014038': datetime.date(2020, 12, 31),
 '0001640334-20-003188': datetime.date(2020, 12, 31),
 '0001731122-20-001347': datetime.date(2020, 12, 30),
 '0000072633-20-000027': datetime.date(2020, 12, 30),
 '0001434389-20-000008': datetime.date(2020, 12, 30),
 '0001213900-20-045308': dat

In [84]:
# Proximity search patterns: a match needs at least one verb and one noun, in
# either order, with at most 20 words between them.


def _build_proximity_pattern(verbs, nouns, max_gap=20):
    """Return a regex source string matching a verb near a noun, in either order.

    Args:
        verbs: iterable of literal verb phrases (escaped before use).
        nouns: iterable of literal noun phrases (escaped before use).
        max_gap: maximum number of words allowed between the two terms.

    Returns:
        A pattern string (not compiled) so callers can keep passing flags such
        as ``re.IGNORECASE`` to ``re.findall``.
    """
    verb_alt = '|'.join(re.escape(term) for term in verbs)
    noun_alt = '|'.join(re.escape(term) for term in nouns)

    # Lookarounds instead of \b: they behave exactly like \b for terms that
    # start and end with a word character, but also work for terms ending in
    # punctuation. With \b, '401(k)' could never match before a space because
    # there is no word boundary between ')' and ' '.
    verb_re = r'(?<!\w)(?:' + verb_alt + r')(?!\w)'
    noun_re = r'(?<!\w)(?:' + noun_alt + r')(?!\w)'

    # Lazily skip up to `max_gap` intervening words, then the separator that
    # precedes the second term.
    gap_re = r'(?:\W+\w+){0,' + str(max_gap) + r'}?\W+'

    return verb_re + gap_re + noun_re + '|' + noun_re + gap_re + verb_re


# Pattern 1: freeze/close/terminate-style verbs near pension-related nouns.
verbs_1 = ['freeze', 'froze', 'frozen', 'freezing', 'close', 'closed', 'discontinue', 'discontinued', 'terminate', 'terminated', 'renegotiate', 'renegotiated']
nouns_1 = ['defined benefit', 'pension plan', 'retirement', 'postretirement', 'postemployment', 'pension', 'benefit']

pattern_1 = _build_proximity_pattern(verbs_1, nouns_1)

# Pattern 2: move/transfer/change-style verbs near plan-type nouns.
# The misspellings 'transfering' / 'transfered' are kept on purpose (listed once each).
verbs_2 = ['move', 'moved', 'transfer', 'transferred', 'transfering', 'transfered', 'transit', 'transition', 'transited', 'change', 'changed', 'turn', 'turned', 'new employee']
nouns_2 = ['defined benefit', 'pension plan', 'retirement', 'postretirement', 'defined contribution', 'contribution', '401(k)']

pattern_2 = _build_proximity_pattern(verbs_2, nouns_2)

In [85]:
# for key, value in freeze_collection.items():
#     if value[1]['sections']:
#         print(value[1]['sections'][0]['doc'])

#         # find which keywords in the document satisfy the pattern
#         matches_1 = re.findall(pattern_2, value[1]['sections'][0]['doc'], re.IGNORECASE)

#         print(matches_1)


#         break

In [86]:
def _scan_sections(search_result, pattern):
    """Collect regex matches for every section of one search result.

    Returns a dict keyed ``section_<n>`` whose values hold the section text
    (``'doc'``) and the case-insensitive ``re.findall`` matches
    (``'keywords'``). A falsy ``search_result`` yields an empty dict.
    """
    if not search_result:
        return {}
    return {
        f'section_{idx}': {
            'doc': sec['doc'],
            'keywords': re.findall(pattern, sec['doc'], re.IGNORECASE),
        }
        for idx, sec in enumerate(search_result['sections'])
    }


freeze_all = {}

for filing, searches in freeze_collection.items():
    file_info = {
        'form': filing.form,
        'date': filing.filing_date,
        'cik': filing.cik,
        'company': filing.company,
    }

    # searches[0] is scanned with pattern_1, searches[1] with pattern_2.
    result_1 = _scan_sections(searches[0], pattern_1)
    result_2 = _scan_sections(searches[1], pattern_2)

    # Only keep filings for which at least one search yielded sections.
    if result_1 or result_2:
        freeze_all[filing.accession_no] = {
            'file_info': file_info,
            'result_1': result_1,
            'result_2': result_2,
        }

# Persist the per-filing keyword summary.
with open(f'summary_keywords_{year}.pkl', 'wb') as out_file:
    pickle.dump(freeze_all, out_file)

In [87]:
def _build_search_table(results, result_key):
    """Flatten one search's results into a table with one row per section.

    Args:
        results: mapping of accession number -> record holding ``'file_info'``
            and the per-search section dicts (as built in ``freeze_all``).
        result_key: which search to export, ``'result_1'`` or ``'result_2'``.

    Returns:
        A DataFrame indexed by accession number. Each row carries the filing's
        ``file_info`` fields plus ``'section'`` (the section text) and
        ``'keyword'`` (the list of matches found in that same section).
        Filings with no sections for ``result_key`` contribute no rows; the
        frame is empty when nothing matched at all.
    """
    rows = []
    accession_numbers = []
    for accession_no, record in results.items():
        for content in record[result_key].values():
            # Build a fresh dict per row: the original aliased
            # record['file_info'] and wrote 'section'/'keyword' into it,
            # silently mutating the entries stored in `results`.
            rows.append({
                **record['file_info'],
                'section': content['doc'],
                'keyword': content['keywords'],
            })
            accession_numbers.append(accession_no)

    # Emitting each section next to its own matches avoids the cross product
    # that two independent .explode() calls on parallel list columns created
    # (every section was paired with every other section's keyword list).
    return pd.DataFrame(rows, index=accession_numbers)


# search 1 table
_build_search_table(freeze_all, 'result_1').to_csv(f'search1_result_{year}.csv')

# search 2 table
_build_search_table(freeze_all, 'result_2').to_csv(f'search2_result_{year}.csv')
