In [1]:
import os
import re
import numpy as np
import pandas as pd
# import nltk
from nltk.corpus import stopwords

from settings import resource_folder, covenant_folder, output_folder

### Text Preprocessing

In [2]:
test_file = 'temp_files/covenant/final-0000798949-04-000061.txt'
test_path = os.path.join(output_folder, test_file)

In [3]:
with open(test_path, 'r') as f:
    #     covenant = f.read()
    covenant = f.readlines()

In [4]:
f = lambda x: re.sub('^\s+', '', x)
covenant = [f(line) for line in covenant]

In [5]:
def reconstruct(text_list):
    result = []
    paragraph = ''
    for line in text_list:
        if re.match('^(\([a-z]\)|SECTION \d\.\d+?\.)\s', line):
            result.append(paragraph)
            paragraph = ''
        paragraph += line.replace('\n', ' ')
    return result


covenant = reconstruct(covenant)

In [6]:
covenant

['ARTICLE VII NEGATIVE COVENANTS ............................................ 46 7.1. Dividends ....................................................... 46 7.2. Indebtedness..................................................... 46 7.3. Limitation on Fundamental Changes ............................... 47 7.4. Sale of Assets................................................... 47 7.5. Investments and Acquisitions .................................... 48 7.6. Liens............................................................ 49 7.7. Affiliates....................................................... 50 7.8. Sale and Leaseback Transactions and other Off-Balance Sheet Liabilities...................................................... 50 7.9. Contingent Obligations .......................................... 50 7.10. Financial Contracts............................................. 51 7.11. Letters of Credit .............................................. 51 ii 7.12. Prohibited Contracts ...............

### Text Analysis

In [7]:
from gensim import corpora, models, similarities
from collections import defaultdict

In [8]:
stopwords = set(stopwords.words('english'))

In [9]:
texts = [[word for word in line.lower().strip().split() if word not in stopwords] for line in covenant]

In [10]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

corpus_tfidf = models.TfidfModel(corpus)[corpus]
num_topics = 60

lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
      alpha=0.01, eta=0.01, minimum_probability=0.001, update_every = 1, chunksize = 100, passes = 1)

  perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words


In [11]:
lda.get_topic_terms(topicid=0)

[(1071, 0.0006238302),
 (1072, 0.0006238302),
 (1068, 0.0006238302),
 (1069, 0.0006238302),
 (1070, 0.0006238302),
 (1066, 0.0006238302),
 (1075, 0.0006238302),
 (1074, 0.0006238302),
 (1064, 0.0006238302),
 (1073, 0.0006238302)]

In [12]:
for i in range(num_topics):
    print(f'{i}th topic:')
    terms_all = lda.get_topic_terms(topicid=i)
    term_distribution = np.array(terms_all[:7])
    term_id = term_distribution[:, 0].astype(np.int)
    for t in term_id:
        print(dictionary.id2token[t], end=' ')
    print('\n', term_distribution[:, 1])

0th topic:
agent's allocated action affidavit affirmative acceleration automatically 
 [0.00062383 0.00062383 0.00062383 0.00062383 0.00062383 0.00062383
 0.00062383]
1th topic:
agent's allocated action affidavit affirmative acceleration automatically 
 [0.00062383 0.00062383 0.00062383 0.00062383 0.00062383 0.00062383
 0.00062383]
2th topic:
necessary financial conduct respective statements use reasonably 
 [0.00734588 0.00538309 0.00501128 0.0045089  0.00427001 0.00402783
 0.00402783]
3th topic:
agent's allocated action affidavit affirmative acceleration automatically 
 [0.00062383 0.00062383 0.00062383 0.00062383 0.00062383 0.00062383
 0.00062383]
4th topic:
agent's allocated action affidavit affirmative acceleration automatically 
 [0.00062383 0.00062383 0.00062383 0.00062383 0.00062383 0.00062383
 0.00062383]
5th topic:
subsidiaries) liens. incur, 7.6. unit. (taking $5,000,000 
 [0.01618504 0.01618504 0.01618504 0.01291878 0.01291878 0.01291878
 0.01291878]
6th topic:
transaction 

In [13]:
doc_num = len(texts)
doc_topics = lda.get_document_topics(corpus_tfidf)

for i in range(doc_num):
    topic = np.array(doc_topics[i])
    topic_distribution = np.array(topic[:, 1])
    doc_topic_prob = list(topic_distribution)
    
    counter = 0
    prob_id = 0
    prob = 0
    for p in doc_topic_prob:
        counter += 1
        if prob >= p:
            continue
        else:
            prob = p
            prob_id = counter
    print(f'{i}th doc belongs to topic {prob_id} with probability {prob}')
    
#     print(f'{i}th doc probability distribustion is:')
#     print(list(topic_distribution), '\n')

0th doc belongs to topic 11 with probability 0.8210671544075012
1th doc belongs to topic 47 with probability 0.8887367248535156
2th doc belongs to topic 17 with probability 0.9343332052230835
3th doc belongs to topic 47 with probability 0.8657149076461792
4th doc belongs to topic 53 with probability 0.9157668352127075
5th doc belongs to topic 48 with probability 0.8680201768875122
6th doc belongs to topic 22 with probability 0.8668609261512756
7th doc belongs to topic 49 with probability 0.8856468200683594
8th doc belongs to topic 16 with probability 0.6818264722824097
9th doc belongs to topic 39 with probability 0.9203526377677917
10th doc belongs to topic 30 with probability 0.8657054901123047
11th doc belongs to topic 26 with probability 0.8632491827011108
12th doc belongs to topic 6 with probability 0.8881493806838989
13th doc belongs to topic 44 with probability 0.9080466032028198
14th doc belongs to topic 60 with probability 0.8358696699142456
15th doc belongs to topic 2 with pro