# TAMU Datathon 2021 :: Bloomberg INDG Challenge

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; BASE_DIR points
# to a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the challenge data on the mounted Drive. Paths are built by plain
# string concatenation (e.g. BASE_DIR + "docs/..."), so the trailing slash is required.
BASE_DIR = '/content/drive/MyDrive/TAMU Datathon/Fed Topic Model/indg-fed-register/'

## Imports

In [None]:
import numpy as np
import glob
import os
from tqdm import tqdm

from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

## How to efficiently extract SUBJECT, SUM (summary), and SUPLINF text from the XML files for each year?

In [None]:
# Count the regular files sitting directly in the docs folder
# (3618 when this notebook was run).
docs_dir = BASE_DIR + r"docs"
n_doc_files = sum(
    1
    for entry in os.listdir(docs_dir)
    if os.path.isfile(os.path.join(docs_dir, entry))
)
print(n_doc_files)

3618


In [None]:
# Count the XML files available for each year, to decide how to group years into panels.
# Two file-naming schemes are matched:
#   * '01'..'09'  : "<yy>-*.xml" plus "E<y>*.xml"
#   * 2010..2021  : "<yyyy>-*.xml" plus the "C1-", "C2-" and "R1-" prefixed variants
count = 0  # running total over all years

for s in ['01', '02', '03', '04', '05', '06', '07', '08', '09']:
    patterns = [f"docs/{s}-*.xml", f"docs/E{s[1]}*.xml"]
    # Per-year figure: only this year's files, including the E-prefixed ones.
    # (Previously the running total was added here, so the printed numbers were
    # cumulative and left out the current year's E-prefixed files.)
    local = sum(len(glob.glob(BASE_DIR + pattern)) for pattern in patterns)
    count += local
    print(s, local)

for i in range(2010, 2022):
    s = str(i)
    patterns = [
        f"docs/{s}-*.xml",
        f"docs/C1-{s}-*.xml",
        f"docs/C2-{s}-*.xml",
        f"docs/R1-{s}-*.xml",
    ]
    local = sum(len(glob.glob(BASE_DIR + pattern)) for pattern in patterns)
    count += local
    print(s, local)

filelist = glob.glob(BASE_DIR + "docs/R3-*.xml")  # 2003, only 1 file
count += len(filelist)
print(count)  # 3617 files and 1 file is a very short amendment file

01 170
02 305
03 461
04 628
05 743
06 817
07 896
08 1014
09 1177
2010 201
2011 253
2012 203
2013 170
2014 181
2015 156
2016 189
2017 128
2018 188
2019 239
2020 286
2021 81
3617


## Panel for years 2001-2006

In [None]:
# Build the 2001-2006 panel. Each SUBJECT text and each SUM (summary) text found
# in a document becomes its own entry in the list. SUPLINF is not collected.
panel_01_06 = []

for s in ['01', '02', '03', '04', '05', '06']:
    filelist = glob.glob(BASE_DIR+f"docs/{s}-*.xml")
    filelist_E = glob.glob(BASE_DIR+f"docs/E{s[1]}*.xml")
    print(f"year: {s}, # of files: {len(filelist) + len(filelist_E)}")
    for filename in tqdm(filelist + filelist_E):
        with open(filename, encoding="utf-8") as open_file:
            content = open_file.read()

        soup = BeautifulSoup(content, "xml")
        # Look each tag up once and skip it when absent. SUBJECT is now guarded
        # the same way SUM already was, so a document without a SUBJECT no
        # longer aborts the whole loop with an AttributeError.
        for tag_name in ('SUBJECT', 'SUM'):
            tag = soup.find(tag_name)
            if tag is not None:
                panel_01_06.append(tag.get_text())

year: 01, # of files: 170


100%|██████████| 170/170 [00:04<00:00, 39.95it/s]


year: 02, # of files: 135


100%|██████████| 135/135 [00:04<00:00, 31.37it/s]


year: 03, # of files: 156


100%|██████████| 156/156 [00:05<00:00, 30.16it/s]


year: 04, # of files: 167


100%|██████████| 167/167 [00:06<00:00, 26.12it/s]


year: 05, # of files: 120


100%|██████████| 120/120 [00:04<00:00, 28.17it/s]


year: 06, # of files: 125


100%|██████████| 125/125 [00:03<00:00, 32.90it/s]


## Panel for years 2007-2012

In [None]:
# Build the 2007-2012 panel. Each SUBJECT text and each SUM (summary) text found
# in a document becomes its own entry in the list. SUPLINF is not collected.
panel_07_12 = []

# (year label, glob patterns relative to BASE_DIR). The file-naming scheme
# differs between the two-digit years and the four-digit years.
year_patterns = [
    (yy, [f"docs/{yy}-*.xml", f"docs/E{yy[1]}*.xml"])
    for yy in ['07', '08', '09']
]
year_patterns += [
    (str(yyyy), [
        f"docs/{yyyy}-*.xml",
        f"docs/C1-{yyyy}-*.xml",
        f"docs/C2-{yyyy}-*.xml",
        f"docs/R1-{yyyy}-*.xml",
    ])
    for yyyy in range(2010, 2013)
]

for s, patterns in year_patterns:
    # Concatenate the matches in pattern order, as the separate loops did before.
    filelist = [path for pattern in patterns for path in glob.glob(BASE_DIR + pattern)]

    print(f"year: {s}, # of files: {len(filelist)}")
    for filename in tqdm(filelist):
        with open(filename, encoding="utf-8") as open_file:
            content = open_file.read()

        soup = BeautifulSoup(content, "xml")
        # Look each tag up once and skip it when absent; a document without a
        # SUBJECT no longer raises AttributeError.
        for tag_name in ('SUBJECT', 'SUM'):
            tag = soup.find(tag_name)
            if tag is not None:
                panel_07_12.append(tag.get_text())

year: 07, # of files: 141


100%|██████████| 141/141 [00:05<00:00, 26.64it/s]


year: 08, # of files: 163


100%|██████████| 163/163 [00:06<00:00, 26.45it/s]


year: 09, # of files: 164


100%|██████████| 164/164 [00:06<00:00, 23.88it/s]


year: 2010, # of files: 201


100%|██████████| 201/201 [00:11<00:00, 17.40it/s]


year: 2011, # of files: 253


100%|██████████| 253/253 [00:15<00:00, 16.42it/s]


year: 2012, # of files: 203


100%|██████████| 203/203 [00:12<00:00, 16.70it/s]


## Topic Model using Latent Dirichlet Allocation

In [None]:
# Settings shared by the topic-model cells
N_TOPICS = 20         # number of topics fitted per panel
MAX_FEATURES = 10000  # vocabulary cap for the vectorizer
RANDOM_STATE = 42     # seed passed to the models
N_TOP_WORDS = 10      # terms printed for each topic

# Term-count vectorizer used by process_panel()
A_vec = CountVectorizer(
    stop_words='english',
    lowercase=True,             # fold case before counting
    ngram_range=(1, 2),         # unigrams and bigrams
    min_df=10,                  # absolute number of documents
    max_df=0.95,                # fraction of documents
    max_features=MAX_FEATURES,  # keep only the most frequent terms
)

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``; each row is iterated as one topic
        and must support ``argsort()``.
    feature_names : sequence mapping a column index to its term.
    no_top_words : number of terms to print per topic.

    One line is printed per topic, with terms ordered from highest to lowest weight.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries
        ranked = weights.argsort()[::-1][:no_top_words]
        term_list = [feature_names[position] for position in ranked]
        print("topic %d:" % (topic_idx), term_list)

def process_panel(panel):
    """Fit an LDA topic model on ``panel`` and print the top terms of each topic.

    Parameters
    ----------
    panel : iterable of str
        Documents to vectorize.

    Notes
    -----
    Uses the module-level ``A_vec`` vectorizer (re-fitted on every call) together
    with ``N_TOPICS``, ``RANDOM_STATE`` and ``N_TOP_WORDS``. The NMF cell rebinds
    ``A_vec`` to a TfidfVectorizer, so calling this function after that cell has
    run fits LDA on TF-IDF weights instead of raw counts.
    """
    A_vec_tf = A_vec.fit_transform(panel) # A_vec_tf is a sparse matrix

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back only on versions that predate it.
    try:
        tf_feature_names_A = list(A_vec.get_feature_names_out())
    except AttributeError:
        tf_feature_names_A = A_vec.get_feature_names()

    # This will take a couple of minutes to run...
    lda = LatentDirichletAllocation(n_components=N_TOPICS, random_state=RANDOM_STATE)
    lda.fit(A_vec_tf)
    display_topics(lda, tf_feature_names_A, N_TOP_WORDS)

## 20 Topics and Top 10 words from panel for years 2001-2006

In [None]:
process_panel(panel_01_06)

topic 0: ['form', 'amendments', 'rule', 'disclosure', 'securities', 'registration', 'summary', 'technical', 'insurance', 'fees']
topic 1: ['capital', 'risk', 'banks', 'based', 'risk based', 'banking', 'based capital', 'bank', 'foreign', 'guidelines']
topic 2: ['edgar', 'manual', 'filer', 'filer manual', 'filing', 'electronic', 'commission', 'securities', 'electronic filing', 'revisions']
topic 3: ['annual', 'reports', 'rules', 'published', 'amendments', 'fr', 'annual reports', 'accelerated', 'report', 'summary']
topic 4: ['act', 'disclosure', 'directors', 'rules', '2002', 'sarbanes oxley', 'oxley act', 'sarbanes', 'oxley', 'executive']
topic 5: ['audit', 'board', 'rules', 'rule', 'lending', 'act', 'truth', 'certain', 'services', 'disclosures']
topic 6: ['commission', 'financial', 'rule', 'securities', 'exchange', 'section', 'rules', 'amendments', 'accounting', 'summary']
topic 7: ['fdic', 'insurance', 'deposit', 'deposit insurance', 'federal', 'comment', 'act', 'federal deposit', 'rule

## 20 Topics and Top 10 words from panel for years 2007-2012

In [None]:
process_panel(panel_07_12)

topic 0: ['act', 'rules', 'consumer', 'final', 'financial', 'credit', 'regulation', 'board', 'final rules', 'electronic']
topic 1: ['securities', 'swap', 'security', 'security based', 'act', 'rules', 'based', 'exchange', 'based swap', 'requirements']
topic 2: ['commission', 'exchange', 'act', 'securities', 'exchange act', 'securities exchange', 'rule', 'market', 'exchange commission', 'summary']
topic 3: ['rule', 'regulation', 'final', 'final rule', 'board', 'published', 'summary', 'federal register', 'federal', 'register']
topic 4: ['capital', 'risk', 'federal', 'insurance', 'fdic', 'occ', 'deposit insurance', 'deposit', 'summary', 'office']
topic 5: ['institutions', 'depository', 'fdic', 'depository institutions', 'institution', 'reserve', 'insured', 'rule', 'deposit', 'insured depository']
topic 6: ['reserve', 'federal reserve', 'federal', 'board', 'office', 'bank', 'reserve bank', 'regulation', 'amendments', 'office federal']
topic 7: ['disclosure', '2008', 'foreign', 'issuers', 'e

## Panels for years 2008, 2009, 2016, and 2017

In [None]:
# Build one panel per year for 2008, 2009, 2016 and 2017. Each SUBJECT text and
# each SUM (summary) text found in a document becomes its own entry in that
# year's list. SUPLINF is not collected.
panel_08, panel_09, panel_16, panel_17 = [], [], [], []

# (year label, glob patterns relative to BASE_DIR, target panel). The file-naming
# scheme differs between the two-digit years and the four-digit years.
panel_specs = [
    (yy, [f"docs/{yy}-*.xml", f"docs/E{yy[1]}*.xml"], target)
    for yy, target in [('08', panel_08), ('09', panel_09)]
]
panel_specs += [
    (yyyy, [
        f"docs/{yyyy}-*.xml",
        f"docs/C1-{yyyy}-*.xml",
        f"docs/C2-{yyyy}-*.xml",
        f"docs/R1-{yyyy}-*.xml",
    ], target)
    for yyyy, target in [('2016', panel_16), ('2017', panel_17)]
]

for s, patterns, panel in panel_specs:
    # Concatenate the matches in pattern order, as the separate loops did before.
    filelist = [path for pattern in patterns for path in glob.glob(BASE_DIR + pattern)]

    print(f"year: {s}, # of files: {len(filelist)}")
    for filename in tqdm(filelist):
        with open(filename, encoding="utf-8") as open_file:
            content = open_file.read()

        soup = BeautifulSoup(content, "xml")
        # Look each tag up once and skip it when absent; a document without a
        # SUBJECT no longer raises AttributeError.
        for tag_name in ('SUBJECT', 'SUM'):
            tag = soup.find(tag_name)
            if tag is not None:
                panel.append(tag.get_text())

year: 08, # of files: 163


100%|██████████| 163/163 [00:05<00:00, 31.46it/s]


year: 09, # of files: 164


100%|██████████| 164/164 [00:05<00:00, 28.42it/s]


year: 2016, # of files: 189


100%|██████████| 189/189 [00:51<00:00,  3.65it/s]


year: 2017, # of files: 128


100%|██████████| 128/128 [00:29<00:00,  4.40it/s]


## 20 Topics and Top 10 words for 2008

In [None]:
process_panel(panel_08)

topic 0: ['funds', 'collection', 'availability', 'investment', 'checks', 'truth', 'truth lending', 'lending', 'electronic', 'company act']
topic 1: ['federal', 'reserve', 'federal reserve', 'office', 'bank', 'board', 'reserve bank', 'amending', 'processing', 'regulation']
topic 2: ['capital', 'corporation', 'banks', 'exemption', 'comment', 'regulatory', 'certain', 'federal', 'relating', 'collectively']
topic 3: ['commission', 'futures', 'commodity', 'trading', 'regulations', 'commodity futures', 'futures trading', 'trading commission', 'commission commission', 'contracts']
topic 4: ['foreign', 'private', 'issuers', 'foreign private', 'financial', 'information', 'amendments', 'rules', 'revisions', 'form']
topic 5: ['investment', 'commission', 'securities', 'act', 'company', 'exchange', 'securities exchange', 'investment company', 'rules', 'exchange commission']
topic 6: ['act', 'rule', 'final', 'board', 'final rule', '2008', 'federal', 'mortgage', 'consumer', 'proposed']
topic 7: ['boar

## 20 Topics and Top 10 words for 2009

In [None]:
process_panel(panel_09)

topic 0: ['2009', 'published', 'federal register', 'register', 'federal', 'fr', '74', '74 fr', 'rule', 'document']
topic 1: ['credit', 'regulation', 'board', 'consumer', 'act', 'final', 'disclosures', 'summary', 'requirements', 'end']
topic 2: ['commission', 'rule', 'securities', 'exchange', 'amendments', 'act', 'regulation', 'public', 'securities exchange', 'proposed']
topic 3: ['funds', 'availability', 'collection', 'checks', 'certain', 'transactions', 'banks', 'exemption', 'designed', 'securities']
topic 4: ['securities', 'act', 'exchange act', 'exchange', 'securities act', 'rules', '1934', 'act 1934', 'amendments', '1933']
topic 5: ['capital', 'bank', 'companies', 'holding', 'board', 'rule', 'securities', 'final', '2008', 'final rule']
topic 6: ['rule', 'interim', 'final', 'final rule', 'insurance', 'deposit', '2009', '2010', 'deposit insurance', 'institution']
topic 7: ['accounting', 'commission', 'release', 'financial', 'standards', 'staff', 'securities', 'guidance', '2008', '200

## 20 Topics and Top 10 words for 2016

In [None]:
process_panel(panel_16)

topic 0: ['commission', 'commodity', 'futures', 'trading', 'commodity futures', 'regulations', 'trading commission', 'futures trading', 'cftc', 'commission cftc']
topic 1: ['final', 'rule', 'act', 'final rule', 'rules', 'regulations', 'summary', 'interim final', 'interim', 'information']
topic 2: ['rule', 'holding', 'assets', 'board', 'companies', 'bank', 'bank holding', 'holding companies', 'billion', 'foreign']
topic 3: ['act', 'securities', 'exchange', 'exchange act', 'securities exchange', '1934', 'act 1934', 'title', 'requirements', 'rule']
topic 4: ['institutions', 'insurance', 'deposit', 'insured', 'depository', 'fdic', 'depository institutions', 'insured depository', 'deposit insurance', 'institution']
topic 5: ['requirements', 'disclosure', 'disclosure requirements', 'important', 'systemically important', 'systemically', 'registrants', 'amendments', 'certain', 'liquidity']
topic 6: ['requirements', 'contracts', 'certain', 'recordkeeping', 'mortgage', 'guidance', 'compliance', 

## 20 Topics and Top 10 words for 2017

In [None]:
process_panel(panel_17)

topic 0: ['amendments', 'proposed', 'proposed amendments', 'regulation', 'disclosure', 'certain', 'summary', 'proposing', 'requirements', 'order']
topic 1: ['board', 'reserve', 'federal', 'federal reserve', 'summary board', 'stress', 'total', 'policy', 'summary', 'bank']
topic 2: ['board', 'federal', 'reserve', 'rate', 'federal reserve', 'credit', 'regulation', 'amendments', 'summary board', 'reserve board']
topic 3: ['financial', 'certain', 'requirements', 'contracts', 'qualified', 'related', 'fdic', 'institutions', 'definition', 'revisions']
topic 4: ['commission', '2017', 'agenda', 'comment', 'summary', 'published', 'regulatory', 'period', 'register', 'publishing']
topic 5: ['regulation', 'act', 'act regulation', 'truth lending', 'lending', 'truth', 'lending act', 'electronic', 'transfer', 'fund']
topic 6: ['federal', 'final', 'covered', 'occ', 'banks', 'rule', 'fdic', 'final rule', 'board', 'deposit']
topic 7: ['threshold', 'increase', 'exemption', 'annual', 'percentage', 'bureau',

## Panel for years 2020-2021

In [None]:
# Build the 2020-2021 panel. Each SUBJECT text and each SUM (summary) text found
# in a document becomes its own entry in the list.
panel_20_21 = []
# range() excludes its end value, so 2022 is needed to cover 2021 as well.
# (range(2020, 2021) processed 2020 only and left every 2021 file out.)
for i in range(2020, 2022):
    s = str(i)
    filelist = glob.glob(BASE_DIR+f"docs/{s}-*.xml")
    filelist += glob.glob(BASE_DIR+f"docs/C1-{s}-*.xml")
    filelist += glob.glob(BASE_DIR+f"docs/C2-{s}-*.xml")
    filelist += glob.glob(BASE_DIR+f"docs/R1-{s}-*.xml")

    print(f"year: {s}, # of files: {len(filelist)}")
    for filename in tqdm(filelist):
        with open(filename, encoding="utf-8") as open_file:
            content = open_file.read()

        soup = BeautifulSoup(content, "xml")
        # Look each tag up once and skip it when absent; a document without a
        # SUBJECT no longer raises AttributeError.
        for tag_name in ('SUBJECT', 'SUM'):
            tag = soup.find(tag_name)
            if tag is not None:
                panel_20_21.append(tag.get_text())

year: 2020, # of files: 286


100%|██████████| 286/286 [01:20<00:00,  3.54it/s]


## 20 Topics and Top 10 words for 2020-21

In [None]:
process_panel(panel_20_21)

topic 0: ['loan', 'qm', 'reporting', 'data', 'requirements', 'temporary', 'mortgage', 'loans', 'definition', 'regulation']
topic 1: ['commission', 'amendments', 'certain', 'regulation', 'exemption', 'commodity', 'disclosure', 'investors', 'summary', 'adopting']
topic 2: ['investment', 'companies', 'investment companies', 'certain', 'funds', 'technical', 'end', 'clearing', 'exempt', 'requirement']
topic 3: ['reserve', 'institutions', 'depository', 'regulation', 'depository institutions', 'federal', 'requirements', 'federal reserve', 'reserve requirements', 'board']
topic 4: ['fdic', 'insurance', 'insured', 'deposit', 'federal', 'regarding', 'policy', 'summary', 'regulations', 'deposit insurance']
topic 5: ['period', 'comment', '2020', 'proposed', 'notice', 'comment period', 'rulemaking', 'notice proposed', 'proposed rulemaking', 'extension']
topic 6: ['securities', 'exchange', 'commission', 'securities exchange', 'exchange commission', 'rules', 'act', 'summary securities', 'issuers', 's

## Non-negative Matrix Factorization

In [None]:
# TF-IDF features for NMF, with the same vocabulary settings as the count vectorizer.
# NOTE: this rebinds the global A_vec that process_panel() reads. Any process_panel()
# call made after this cell has run will therefore feed TF-IDF weights to LDA.
A_vec = TfidfVectorizer(max_features = MAX_FEATURES, # only the most frequent terms
                        lowercase = True,     # remove capitalization
                        ngram_range = (1,2),  # include 1, 2-word phrases
                        min_df=10,   # note: absolute count of documents
                        max_df=0.95, # note: % of docs
                        stop_words='english'
                        )
tfidf_documents = A_vec.fit_transform(panel_20_21)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on versions that predate it.
try:
    tfidf_feature_names = list(A_vec.get_feature_names_out())
except AttributeError:
    tfidf_feature_names = A_vec.get_feature_names()

nmf = NMF(n_components=N_TOPICS, random_state=RANDOM_STATE, init="nndsvd")
W = nmf.fit_transform(tfidf_documents)  # transformed documents
H = nmf.components_                     # one row of term weights per topic

for topic_index in range(N_TOPICS):
    # Term indices for this topic, ordered from highest to lowest weight
    top_indices = np.argsort(H[topic_index, :])[::-1]
    top_terms = [tfidf_feature_names[term_index]
                 for term_index in top_indices[0:N_TOP_WORDS]]
    print("topic ", topic_index, top_terms)

topic  0 ['rule', 'final', 'final rule', 'board', 'federal', 'interim', 'interim final', 'agencies', '2020', 'federal reserve']
topic  1 ['commission', 'commodity', 'trading', 'futures', 'futures trading', 'commodity futures', 'trading commission', 'summary commodity', 'cftc', 'regulations']
topic  2 ['swap', 'margin', 'dealers', 'swap dealers', 'swap participants', 'major', 'major swap', 'dealers major', 'participants', 'swaps']
topic  3 ['bureau', 'consumer', 'consumer financial', 'protection', 'financial', 'bureau consumer', 'protection bureau', 'financial protection', 'summary bureau', 'act']
topic  4 ['reserve', 'depository', 'institutions', 'depository institutions', 'reserve requirements', 'requirements depository', 'regulation reserve', 'regulation', 'federal reserve', 'requirements']
topic  5 ['federal savings', 'banks', 'national', 'savings', 'national banks', 'savings associations', 'associations', 'banks federal', 'federal', 'activities']
topic  6 ['regulation', 'definition