Biography Analysis - Generates the required files for Amendment Biography

Given two pieces of text, there are 3 modules running

Mod 1 : Compare the two pieces to find exact matches
Mod 2 : Find differences given lines

In [40]:
import nltk
from pprint import pprint

In [41]:
import difflib
# import itertools
from itertools import product
from difflib import SequenceMatcher
from shutil import copyfile

In [42]:
def split_para(text):
    """
    Break a paragraph (one regulation) into its sentences.

    The work is delegated to NLTK's sentence tokenizer; whatever it returns
    is handed back unchanged.
    """
    sentences = nltk.sent_tokenize(text)
    return sentences

In [43]:
from Levenshtein import ratio
def naive_string_similarity(a, b):
    """
    Return the Levenshtein similarity ratio between ``a`` and ``b``.

    Each argument may be a string or a list of strings. A list is joined
    with single spaces before the comparison. The two arguments are
    normalised independently, so a list can be compared against a plain
    string (previously joining only happened when *both* were lists).

    Returns whatever ``Levenshtein.ratio`` returns for the two strings.
    """
    if isinstance(a, list):
        a = " ".join(a)
    if isinstance(b, list):
        b = " ".join(b)

    return ratio(a, b)
    
def full_score(sent_pair, s1, s2, threshold=0.9):
    """
    Decide whether two items count as a "full" match.

    sent_pair : (i1, i2) indices into ``s1`` and ``s2``
    s1, s2    : indexable collections whose items are either plain strings
                or lists whose element ``[1]`` holds the text to compare
    threshold : minimum similarity (inclusive) for a full match

    Returns True when ``naive_string_similarity`` of the two texts is at
    least ``threshold``, otherwise False.

    Raises TypeError if an item is neither a ``str`` nor a ``list``
    (the original code failed with an UnboundLocalError in that case).
    """
    def text_of(item):
        # List items carry their text in slot 1; strings are the text itself.
        if isinstance(item, list):
            return item[1]
        if isinstance(item, str):
            return item
        raise TypeError(
            f"full_score expects str or list items, got {type(item).__name__}"
        )

    i1, i2 = sent_pair
    l1 = text_of(s1[i1])
    l2 = text_of(s2[i2])
    return naive_string_similarity(l1, l2) >= threshold

In [44]:
def get_fully_matching_sent_indices(s1, s2, threshold=0.9):
    """
    Return every index pair (i1, i2) for which ``full_score`` reports a
    full match between ``s1[i1]`` and ``s2[i2]``.

    s1, s2    : sequences of items accepted by ``full_score``
    threshold : similarity cut-off forwarded to ``full_score``; the default
                equals ``full_score``'s own default

    The pairs are sorted by their second component (the index into ``s2``);
    pairs sharing that index keep ascending order of the first component.

    Assumes the inputs contain no repeated items; a repeated item would
    produce several pairs that share an index.
    """
    index_pairs = product(range(len(s1)), range(len(s2)))
    full_matches = [
        sent_pair
        for sent_pair in index_pairs
        if full_score(sent_pair, s1, s2, threshold)
    ]

    return sorted(full_matches, key=lambda pair: pair[1])

In [45]:
import heapq

def get_maximum_matching(s1, s2, score_threshold=0.8):
    """
    Greedily pair items of ``s1`` with items of ``s2``, best score first.

    Every (s1 item, s2 item) combination is scored with
    ``naive_string_similarity``. Pairs are then accepted in descending
    score order, each index being used at most once, as long as the score
    is at least ``score_threshold``.

    s1, s2          : sequences of items accepted by
                      ``naive_string_similarity``
    score_threshold : minimum similarity (inclusive) for a pair to be kept

    Returns ``(candidate_tuples, matched_i1, matched_i2)`` where
    ``candidate_tuples`` is a list of ``(score, index_in_s1, index_in_s2)``
    in the order they were accepted, and the two sets hold the indices that
    were paired. At most ``min(len(s1), len(s2))`` pairs are returned.
    """
    # Scores are negated because heapq is a min-heap and we want the
    # highest similarity first.
    scores = [
        (-naive_string_similarity(s1[ind1], s2[ind2]), ind1, ind2)
        for ind1, ind2 in product(range(len(s1)), range(len(s2)))
    ]
    heapq.heapify(scores)

    max_tups = min(len(s1), len(s2))
    candidate_tuples = []
    matched_i1 = set()
    matched_i2 = set()

    while scores and len(candidate_tuples) < max_tups:
        score, ind1, ind2 = heapq.heappop(scores)
        if -score < score_threshold:
            # Everything still on the heap scores lower: nothing more to accept.
            break
        if ind1 in matched_i1 or ind2 in matched_i2:
            continue
        candidate_tuples.append((-score, ind1, ind2))
        matched_i1.add(ind1)
        matched_i2.add(ind2)

    return candidate_tuples, matched_i1, matched_i2

In [46]:
# ANSI escape sequences used to colour terminal output.
CRED = '\033[91m'
CEND = '\033[0m'
CBLUE = '\033[34m'
CGREEN = '\033[92m'


def printr(inp):
    """Return ``inp`` wrapped in the red colour codes (nothing is printed)."""
    return CRED + inp + CEND


def printb(inp):
    """Return ``inp`` wrapped in the blue colour codes (nothing is printed)."""
    return CBLUE + inp + CEND


def printg(inp):
    """Return ``inp`` wrapped in the green colour codes (nothing is printed)."""
    return CGREEN + inp + CEND


def print_join(inp):
    """Print the strings in ``inp`` joined by single spaces."""
    print(" ".join(inp))

from collections import deque

def lcs(a, b):
    """
    Word-level longest common subsequence (LCS) with terminal colouring.

    a : sequence of words
    b : sequence of words

    Returns ``(result, ansa, ansb)``:
      result : list of the words forming one LCS of ``a`` and ``b``
      ansa   : every word of ``a`` in order, passed through ``printr`` when
               it is part of the LCS alignment and ``printb`` otherwise
      ansb   : the same for ``b``

    The LCS is recovered by backtracking through the full table. Reading
    only the last column (as was done previously) can pick words in an
    order that is not a subsequence of ``b``, e.g. a=[C, A, B],
    b=[A, B, C] gave [C, B] and duplicated a word in ``ansb``.
    """
    # lengths[i][j] is the LCS length of a[:i] and b[:j].
    lengths = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            if x == y:
                lengths[i + 1][j + 1] = lengths[i][j] + 1
            else:
                lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

    # Walk back from the bottom-right corner, marking aligned positions.
    in_lcs_a = [False] * len(a)
    in_lcs_b = [False] * len(b)
    i, j = len(a), len(b)
    while i > 0 and j > 0:
        if lengths[i][j] == lengths[i - 1][j]:
            i -= 1
        elif lengths[i][j] == lengths[i][j - 1]:
            j -= 1
        else:
            # The length grew only because a[i-1] == b[j-1] were matched.
            in_lcs_a[i - 1] = True
            in_lcs_b[j - 1] = True
            i -= 1
            j -= 1

    result = [word for word, common in zip(a, in_lcs_a) if common]
    ansa = [printr(word) if common else printb(word)
            for word, common in zip(a, in_lcs_a)]
    ansb = [printr(word) if common else printb(word)
            for word, common in zip(b, in_lcs_b)]

    return result, ansa, ansb

In [47]:
from nltk import word_tokenize as wdtk

In [48]:
def match_two_regs(s1, s2, debug=False):
    """
    Compare two lists of sentences and classify them as fully matching,
    partially matching or unmatched.

    s1, s2 : lists of sentence strings (each is tokenised with ``wdtk``)
    debug  : all results are only computed when this is True; with the
             default False three empty containers are returned, as before

    Returns ``(full_matches, partial_matches, no_matches)``:
      full_matches    : one string per fully matching pair - the LCS words
                        of the two sentences joined by spaces
      partial_matches : one ``[lcs, coloured_s1, coloured_s2]`` triple of
                        space-joined strings per pair accepted by
                        ``get_maximum_matching`` inside a gap
      no_matches      : ``[unmatched sentences of s1, unmatched of s2]``

    The gaps between consecutive full matches are searched for partial
    matches. This assumes the full-match pairs are increasing in both
    components.
    TODO : Handle when the rows are jumbled.
    """
    full_matches = []
    partial_matches = []
    no_matches = [[], []]

    if not debug:
        # Nothing below contributes to the result unless debug is set.
        return full_matches, partial_matches, no_matches

    # The default threshold of get_fully_matching_sent_indices / full_score
    # is the 0.9 this function relies on.
    full_match_pairs = get_fully_matching_sent_indices(s1, s2)

    # Gaps before, between and after the full matches, each in the form
    # [[start_s1, end_s1], [start_s2, end_s2]]. Sentinels make the first and
    # last gap (and the "no full match at all" case) fall out of one rule.
    bounds = [(-1, -1)] + list(full_match_pairs) + [(len(s1), len(s2))]
    non_match_areas = [
        [[prev1 + 1, next1], [prev2 + 1, next2]]
        for (prev1, prev2), (next1, next2) in zip(bounds, bounds[1:])
    ]

    for i1, i2 in full_match_pairs:
        common_words, _, _ = lcs(wdtk(s1[i1]), wdtk(s2[i2]))
        full_matches.append(" ".join(common_words))

    for (p11, p12), (p21, p22) in non_match_areas:
        if p11 == p12 and p21 == p22:
            continue
        l1 = s1[p11:p12]
        l2 = s2[p21:p22]

        all_combos, mi1, mi2 = get_maximum_matching(l1, l2)

        for _score, b, c in all_combos:
            a1, a2, a3 = lcs(wdtk(l1[b]), wdtk(l2[c]))
            partial_matches.append([" ".join(a1), " ".join(a2), " ".join(a3)])

        no_matches[0].extend(ele for ind, ele in enumerate(l1) if ind not in mi1)
        no_matches[1].extend(ele for ind, ele in enumerate(l2) if ind not in mi2)

    return full_matches, partial_matches, no_matches
    

In [49]:
# a1 = """3.\n(1) No insider shall communicate, provide, or allow access to any unpublished price sensitive information, relating to a company or securities listed or proposed to be listed, to any person including other insiders except where such communication is in furtherance of legitimate purposes, performance of duties or discharge of legal obligations.\nNOTE\n: \nThis provision is intended to cast an obligation on all insiders who are essentially persons in possession of unpublished price sensitive information to handle such information with care and to deal with the information with them when transacting their business strictly on a \nneed-to-know\n basis. It is also intended to lead to organisations developing practices based on \nneed-to-know\n principles for treatment of information in their possession.\n(2)\nNo person shall procure from or cause the communication by any insider of unpublished price sensitive information, relating to a company or securities listed or proposed to be listed, except in furtherance of legitimate purposes, performance of duties or discharge of legal obligations.\nNOTE\n: This provision is intended to impose a prohibition on unlawfully procuring possession of unpublished price sensitive information. 
Inducement and procurement of unpublished price sensitive information not in furtherance of one’s legitimate duties and discharge of obligations would be illegal under this provision.\n(3)\nNotwithstanding anything contained in this regulation, an unpublished price sensitive information may be communicated, provided, allowed access to or procured, in connection with a transaction that would:–\n(i)\nentail an obligation to make an open offer under the takeover regulations where the board of directors of the company is of informed opinion that the proposed transaction is in the best interests of the company;\nNOTE\n: It is intended to acknowledge the necessity of communicating, providing, allowing access to or procuring UPSI for substantial transactions such as takeovers, mergers and acquisitions involving trading in securities and change of control to assess a potential investment. In an open offer under the takeover regulations, not only would the same price be made available to all shareholders of the company but also all information necessary to enable an informed divestment or retention decision by the public shareholders is required to be made available to all shareholders in the letter of offer under those regulations.\n(ii)\nnot attract the obligation to make an open offer under the takeover regulations but where the board of directors of the company is of informed opinion that the proposed transaction is in the best interests of the company and the information that constitute unpublished price sensitive information is disseminated to be made generally available at least two trading days prior to the proposed transaction being effected in such form as the board of directors may determine.\nNOTE\n: It is intended to permit communicating, providing, allowing access to or procuring UPSI also in transactions that do not entail an open offer obligation under the takeover regulations if it is in the best interests of the company. 
The board of directors, however, would cause public disclosures of such unpublished price sensitive information well before the proposed transaction to rule out any information asymmetry in the market.\n(4)\nFor purposes of \nsub-regulation\n (3), the board of directors shall require the parties to execute agreements to contract confidentiality and \nnon-disclosure\n obligations on the part of such parties and such parties shall keep information so received confidential, except for the purpose of \nsub-regulation\n"""
# a2 = """3.\n(1) No insider shall communicate, provide, or allow access to any unpublished price sensitive information, relating to a company or securities listed or proposed to be listed, to any person including other insiders except where such communication is in furtherance of legitimate purposes, performance of duties or discharge of legal obligations.\nNOTE\n: \nThis provision is intended to cast an obligation on all insiders who are essentially persons in possession of unpublished price sensitive information to handle such information with care and to deal with the information with them when transacting their business strictly on a \nneed-to-know\n basis. It is also intended to lead to organisations developing practices based on \nneed-to-know\n principles for treatment of information in their possession.\n(2)\nNo person shall procure from or cause the communication by any insider of unpublished price sensitive information, relating to a company or securities listed or proposed to be listed, except in furtherance of legitimate purposes, performance of duties or discharge of legal obligations.\nNOTE\n: This provision is intended to impose a prohibition on unlawfully procuring possession of unpublished price sensitive information. Inducement and procurement of unpublished price sensitive information not in furtherance of one’s legitimate duties and discharge of obligations would be illegal under this provision.\n7\n[(2A) The board of directors of a listed company shall make a policy for determination of “legitimate purposes” as a part of “Codes of Fair Disclosure and Conduct” formulated under regulation 8.\nExplanation – For the purpose of illustration, the term “legitimate purpose” shall include sharing of unpublished price sensitive information in the ordinary course of business by an insider with\n7\nInserted by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2018 (w.e.f. 
April 01, 2019).\npartners, collaborators, lenders, customers, suppliers, merchant bankers, legal advisors, auditors, insolvency professionals or other advisors or consultants, provided that such sharing has not been carried out to evade or circumvent the prohibitions of these regulations.]\n8\n[(2B) Any person in receipt of unpublished price sensitive information pursuant to a “legitimate purpose” shall be considered an “insider” for purposes of these regulations and due notice shall be given to such persons to maintain confidentiality of such unpublished price sensitive information in compliance with these regulations.]\n(3)\nNotwithstanding anything contained in this regulation, an unpublished price sensitive information may be communicated, provided, allowed access to or procured, in connection with a transaction that would:–\n(i)\nentail an obligation to make an open offer under the takeover regulations where the board of directors of the \n9\n[listed] company is of informed opinion that \n10\n[sharing of such information] is in the best interests of the company;\nNOTE\n: It is intended to acknowledge the necessity of communicating, providing, allowing access to or procuring UPSI for substantial transactions such as takeovers, mergers and acquisitions involving trading in securities and change of control to assess a potential investment. In an open offer under the takeover regulations, not only would the same price be made available to all shareholders of the company but also all information necessary to enable an informed divestment or retention decision by the public shareholders is required to be made available to all shareholders in the letter of offer under those regulations.\n8\nInserted by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2018 (w.e.f. April 01, 2019)\n9\nInserted by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2018 (w.e.f. 
April 01, 2019)\n10\nSubstituted for the words “the proposed transaction” by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2018 (w.e.f. April 01, 2019).\n(ii)\nnot attract the obligation to make an open offer under the takeover regulations but where the board of directors of the \n11\n[listed] company is of informed opinion \n12\n[that sharing of such information] is in the best interests of the company and the information that constitute unpublished price sensitive information is disseminated to be made generally available at least two trading days prior to the proposed transaction being effected in such form as the board of directors may determine \n13\n[to be adequate and fair to cover all relevant and material facts].\nNOTE\n: It is intended to permit communicating, providing, allowing access to or procuring UPSI also in transactions that do not entail an open offer obligation under the takeover regulations \n14\n[when authorised by the board of directors if sharing of such information] is in the best interests of the company. 
The board of directors, however, would cause public disclosures of such unpublished price sensitive information well before the proposed transaction to rule out any information asymmetry in the market.\n(4)\nFor purposes of \nsub-regulation\n (3), the board of directors shall require the parties to execute agreements to contract confidentiality and \nnon-disclosure\n obligations on the part of such parties and such parties shall keep information so received confidential, except for the purpose of \nsub-regulation\n (3), and shall not otherwise trade in securities of the company when in possession of unpublished price sensitive information.\n15\n[(5) The board of directors or head(s) of the organisation of every person required to handle unpublished price sensitive information shall ensure that a structured digital database is\n11\nInserted by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2018 (w.e.f. April 01, 2019)\n12\nSubstituted for the words “that the proposed transaction” by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2018 (w.e.f. April 01, 2019)\n13\nInserted by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2018 (w.e.f. April 01, 2019)\n14\nSubstituted for the words “if it” by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2018 (w.e.f. April 01, 2019)\n15\nSubstituted by Securities and Exchange Board of India (Prohibition of Insider Trading) (Amendment) Regulations, 2020 (w.e.f. July 17, 2020). 
Prior to the substitution, \nsub-regulation\n 5 read as follows: -\n“The board of directors shall ensure that a structured digital database is maintained containing the names of such persons or entities as the case may be with whom information is shared under this regulation along with the Permanent Account Number or any other identifier authorized by law where Permanent Account\nmaintained containing the nature of unpublished price sensitive information and the names of such persons who havThe  board  of  directors or  head(s)  of  the  organisation  of  every  person  required  to  handle e shared the information and also the names of such persons with whom information is shared under this regulation along with the Permanent Account Number or any other identifThe  board  of  directors or  head(s)  of  the  organisation  of  every  person  required  to  handle ier authorized by law where Permanent Account Number is not available. Such database shall not be outsourced and shall be maintained internally with adequate internal controls and checks such as time stamping and audit trails to ensure \nnon-tampering\n of the database.]\n16\n"""

# s1 = [ele.replace('\n','~~') for ele in split_para(a1) if len(ele) > 10 and 'Inserted' not in ele and 'Substituted' not in ele and 'NOTE' not in ele]
# s2 = [ele.replace('\n','~~') for ele in split_para(a2) if len(ele) > 10 and 'Inserted' not in ele and 'Substituted' not in ele and 'NOTE' not in ele]

# len(s1),len(s2)
# f,p,n = match_two_regs(s1,s2,debug=True)

# f
# for ele in p:
#     print(ele[0])
#     print(ele[1])
#     print(ele[2])
# print(n[0])
# print(n[1])

In [50]:
import json

`Script to compare two documents and pair up their most similar regulations.
Assumes each document is an array of regulations.`

In [51]:
def nested_list_to_linear_list(list_):
    """
    Flatten one level of nesting into a list of strings.

    Plain strings are kept as they are; any other element is treated as an
    iterable of strings and concatenated without a separator.
    """
    return [item if type(item) is str else ''.join(item) for item in list_]

In [52]:
def get_matching_regs(s1, s2, score_threshold=0.0):
    """
    Greedily pair the regulations of two documents, best score first.

    Each item of ``s1`` / ``s2`` is indexed with ``[1]`` to obtain the text
    handed to ``naive_string_similarity`` (presumably items are
    ``[reg_id, text]`` pairs - confirm against the caller). Pairs are
    accepted in descending score order, each index used at most once, while
    the score is at least ``score_threshold``.

    Returns ``(candidate_tuples, matched_i1, matched_i2, left_i1, left_i2)``:
      candidate_tuples : list of ``(score, index_in_s1, index_in_s2)`` in
                         acceptance order, at most ``min(len(s1), len(s2))``
      matched_i1/2     : sets of indices that were paired
      left_i1/2        : sets of indices that stayed unpaired
    """
    left_i1 = set(range(len(s1)))
    left_i2 = set(range(len(s2)))

    # Scores are negated because heapq is a min-heap and we want the
    # highest similarity first.
    scores = [
        (-naive_string_similarity(s1[ind1][1], s2[ind2][1]), ind1, ind2)
        for ind1, ind2 in product(range(len(s1)), range(len(s2)))
    ]
    heapq.heapify(scores)

    candidate_tuples = []
    matched_i1 = set()
    matched_i2 = set()

    # Once one side is exhausted no further pair can be accepted.
    while scores and left_i1 and left_i2:
        score, ind1, ind2 = heapq.heappop(scores)
        if -score < score_threshold:
            # Everything still on the heap scores lower.
            break
        if ind1 in left_i1 and ind2 in left_i2:
            candidate_tuples.append((-score, ind1, ind2))
            left_i1.remove(ind1)
            left_i2.remove(ind2)
            matched_i1.add(ind1)
            matched_i2.add(ind2)

    return candidate_tuples, matched_i1, matched_i2, left_i1, left_i2

In [53]:
# a,b,c,d,e = get_matching_regs(list_d1, list_d2)

# print(len(list_d1),len(list_d2))

# for ele in a:
#     i1, i2, i3 = ele
#     print(f"SCORE {i1}")
#     print(printr(list_d1[i2]))
#     print(printb(list_d2[i3]))
#     print("------------------------")

# print(f"LEFT OVERS")
# for ele in d:
#     print(printr(list_d1[ele]))
#     print("------------------------")

# for ele in e:
#     print(printr(list_d1[ele]))
#     print("------------------------")

In [54]:
from IPython.core.display import display, HTML

`Generate an HTML table given two documents`

In [55]:
def lcs_for_html(a, b):
    """
    Word-level longest common subsequence (LCS) with colour flags.

    a : sequence of words
    b : sequence of words

    Returns ``(result, ansa, ansb, color_a, color_b)``:
      result  : list of the words forming one LCS of ``a`` and ``b``
      ansa    : the words of ``a`` in their original order
      ansb    : the words of ``b`` in their original order
      color_a : list parallel to ``ansa``; 0 for a word that is part of the
                LCS alignment, 1 for a word that is not
      color_b : the same for ``ansb``

    The LCS is recovered by backtracking through the full table. Reading
    only the last column (as was done previously) can pick words in an
    order that is not a subsequence of ``b``, e.g. a=[C, A, B],
    b=[A, B, C] gave [C, B] and an ``ansb`` containing an extra word.
    """
    # lengths[i][j] is the LCS length of a[:i] and b[:j].
    lengths = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            if x == y:
                lengths[i + 1][j + 1] = lengths[i][j] + 1
            else:
                lengths[i + 1][j + 1] = max(lengths[i + 1][j], lengths[i][j + 1])

    # Colour flag per word: start as "different" (1) and clear the flag for
    # every position the backtrack aligns.
    color_a = [1] * len(a)
    color_b = [1] * len(b)
    i, j = len(a), len(b)
    while i > 0 and j > 0:
        if lengths[i][j] == lengths[i - 1][j]:
            i -= 1
        elif lengths[i][j] == lengths[i][j - 1]:
            j -= 1
        else:
            # The length grew only because a[i-1] == b[j-1] were matched.
            color_a[i - 1] = 0
            color_b[j - 1] = 0
            i -= 1
            j -= 1

    ansa = list(a)
    ansb = list(b)
    result = [word for word, flag in zip(ansa, color_a) if flag == 0]

    return result, ansa, ansb, color_a, color_b

In [56]:
def match_two_regs_for_html(tup_s1, tup_s2, debug=False):
    """
    Compare two lists of sentences and report, by index, which sentences
    match fully, which match partially and which have no counterpart.

    tup_s1, tup_s2 : lists of sentence strings (each is tokenised with
                     ``wdtk``)
    debug          : accepted for signature compatibility; not used

    Returns ``(full_matches, partial_matches, no_matches)``:
      full_matches    : list of ``[index_in_s1, index_in_s2]``
      partial_matches : list of ``[words1, words2, index_in_s1, index_in_s2,
                        color_map1, color_map2]`` built from
                        ``lcs_for_html`` for every pair accepted by
                        ``get_maximum_matching`` inside a gap
      no_matches      : ``[unmatched indices of s1, unmatched indices of s2]``

    The gaps between consecutive full matches are searched for partial
    matches. This assumes the full-match pairs are increasing in both
    components.
    TODO : Handle when the rows are jumbled.
    """
    s1 = tup_s1
    s2 = tup_s2

    partial_matches = []
    no_matches = [[], []]

    full_match_pairs = get_fully_matching_sent_indices(s1, s2)
    full_matches = [[i1, i2] for i1, i2 in full_match_pairs]

    # Gaps before, between and after the full matches, each in the form
    # [[start_s1, end_s1], [start_s2, end_s2]]. With no full match at all
    # the sentinels yield the single gap covering both lists entirely.
    bounds = [(-1, -1)] + list(full_match_pairs) + [(len(s1), len(s2))]
    non_match_areas = [
        [[prev1 + 1, next1], [prev2 + 1, next2]]
        for (prev1, prev2), (next1, next2) in zip(bounds, bounds[1:])
    ]

    for (p11, p12), (p21, p22) in non_match_areas:
        if p11 == p12 and p21 == p22:
            continue
        l1 = s1[p11:p12]
        l2 = s2[p21:p22]

        all_combos, mi1, mi2 = get_maximum_matching(l1, l2)

        for _score, b, c in all_combos:
            _common, a2, a3, c1, c2 = lcs_for_html(wdtk(l1[b]), wdtk(l2[c]))
            # Indices are local to the gap; shift them back to s1 / s2.
            partial_matches.append([a2, a3, p11 + b, p21 + c, c1, c2])

        no_matches[0].extend(ind + p11 for ind in range(len(l1)) if ind not in mi1)
        no_matches[1].extend(ind + p21 for ind in range(len(l2)) if ind not in mi2)

    return full_matches, partial_matches, no_matches
    

In [57]:
def generate_html(doc1, doc2, doc_names=None):
    """
    Build an HTML table comparing two documents side by side.

    doc1, doc2 : lists of regulations. Each entry is unpacked as
                 ``(reg_id, sentences)`` and ``sentences`` is passed to
                 ``match_two_regs_for_html`` (presumably a list of sentence
                 strings - confirm against the caller).
    doc_names  : optional two-item list of column headings; defaults to
                 ``["Document 1", "Document 2"]``

    Regulations are paired with ``get_matching_regs``. For each pair one row
    is emitted in which fully matching sentences are olive, sentences
    without a counterpart are red, and partially matching sentences are
    rendered word by word (olive = common, blue = different). Regulations
    left unpaired get a row of their own with the other cell empty.

    Returns the table as a single HTML string.

    NOTE(review): sentence text is interpolated into the markup unescaped.
    """
    if doc_names is None:
        doc_names = ["Document 1", "Document 2"]

    html_parts = [f"""
                <table width='100%'>
                <tr>
                <th>{doc_names[0]}</th>
                <th>{doc_names[1]}</th>
                </tr>
                """]

    matched_regs, _, _, only_in_doc1, only_in_doc2 = get_matching_regs(doc1, doc2)

    def colour_words(words, color_map):
        # 0 marks a word shared by both sentences, anything else a difference.
        spans = []
        for word, color in zip(words, color_map):
            color_string = "olive" if color == 0 else "blue"
            spans.append(f"<span style='color:{color_string}'>{word}</span>")
        return f"<p>{' '.join(spans)}</p>"

    for _score, i2, i3 in matched_regs:
        # Read from the arguments, not from notebook-level globals.
        _rid1, s1 = doc1[i2]
        _rid2, s2 = doc2[i3]
        a1 = [""] * len(s1)
        a2 = [""] * len(s2)

        f, p, n = match_two_regs_for_html(s1, s2)

        # full matches
        for e1, e2 in f:
            a1[e1] = f"<p style='color:olive;'>{s1[e1]}</p>"
            a2[e2] = f"<p style='color:olive;'>{s2[e2]}</p>"

        # non matches
        for e1 in n[0]:
            a1[e1] = f"<p style='color:red;'>{s1[e1]}</p>"
        for e2 in n[1]:
            a2[e2] = f"<p style='color:red;'>{s2[e2]}</p>"

        # partial matches
        for words1, words2, e1, e2, color_map1, color_map2 in p:
            a1[e1] = colour_words(words1, color_map1)
            a2[e2] = colour_words(words2, color_map2)

        left_cell = "".join(a1)
        right_cell = "".join(a2)
        html_parts.append(f"""<tr>
        <td>{left_cell}</td>
        <td>{right_cell}</td>
        </tr>""")

    # Unpaired regulations, in document order.
    for idx in sorted(only_in_doc1):
        html_parts.append(f"""<tr>
        <td>{doc1[idx]}</td>
        <td></td>
        </tr>""")

    for idx in sorted(only_in_doc2):
        html_parts.append(f"""<tr>
        <td></td>
        <td>{doc2[idx]}</td>
        </tr>""")

    html_parts.append("</table>")

    return "".join(html_parts)

In [58]:
# returned_html = generate_html(list_d1, list_d2)
# print(sorted(key_list))

In [59]:
# key1, key2 = 'PIT_2003','PIT_2015'
# d1 = datum[key1]
# d2 = datum[key2]
# list_d1 = nested_list_to_linear_list(list(d1.values()))
# list_d2 = nested_list_to_linear_list(list(d2.values()))

# list_d1 = list_d1[2:]
# list_d2 = list_d2[2:]
# returned_html = generate_html(list_d1, list_d2,doc_names=[key1,key2])
# display(HTML(returned_html))
# display(HTML("<br/>"))

# for ele in list_d1:
#     print(ele, "~~~")
# print('---------')
# for ele in list_d2:
#     print(ele,"~~~")

# key_list = sorted([ele for ele in list(datum.keys()) if 'PIT' in ele])

# # for i,key1 in enumerate(key_list):
# #     for key2 in key_list[i+1:]:
# #         print(key1,key2)

# returned_html = ""

# for i,key1 in enumerate(key_list[:-1]):
#         key2 = key_list[i+1]
#         print(key1,key2)
#         d1 = datum[key1]
#         d2 = datum[key2]
#         list_d1 = nested_list_to_linear_list(list(d1.values()))
#         list_d2 = nested_list_to_linear_list(list(d2.values()))

#         list_d1 = list_d1[2:]
#         list_d2 = list_d2[2:]
        
#         returned_html += generate_html(list_d1, list_d2,doc_names=[key1,key2]) + "<br/>"
# display(HTML(returned_html))
# with open('../../data/FOR_SENDING/all_combos.html','w') as f:
#     f.write(returned_html)

Stricter

Lenient

-- ? --

In [60]:
from itertools import groupby
import re

def process_regs_list(d1):
    """
    Reduce each ``(reg_num, sub_reg_num, cont)`` triple in ``d1`` to a
    ``[reg_num, cont]`` pair, dropping the sub-regulation number.
    """
    return [[reg_num, cont] for reg_num, _sub_reg_num, cont in d1]

In [61]:
def split_date_string(date, doc_type):
    """
    Turn a ``<3-char month><day>_<year>`` key into a display label.

    The part before the underscore is cut after its first three characters
    (month, then day); the result is ``"<doc_type> <day>-<month>-<year>"``.
    """
    month_day, year = date.split('_')
    month = month_day[:3]
    day = month_day[3:]
    return "{} {}-{}-{}".format(doc_type, day, month, year)

In [62]:
def generate_html_for_amendments(list_d1, list_d2, doc_names=None, doc_type=None):
    """
    Build a two-column HTML page comparing two versions of a regulation document.

    list_d1, list_d2 : lists of [regulation id, list of text pieces], one per version.
    doc_names        : two version names used as column headings (passed through
                       split_date_string together with doc_type).
    doc_type         : label prefixed to both column headings.

    Regulations paired by get_matching_regs share a table row. Inside a row the
    pieces are coloured from the result of match_two_regs_for_html:
      olive - pieces reported as full matches
      red   - pieces present on one side only
      blue  - words flagged (non-zero colour map entry) inside a partial match
    Regulations found in only one version are appended afterwards in rows with
    an empty opposite cell. The first red piece gets the 'partm' anchor and the
    first one-sided regulation gets the 'nomen' anchor used by the legend links.

    Returns the whole page as one string.
    """
    if doc_names is None:
        doc_names = ["Document 1", "Document 2"]
    
    doc_names_0 = split_date_string(doc_names[0] , doc_type)
    doc_names_1 = split_date_string(doc_names[1] , doc_type)
    html_obj = f"""
                <html>
                <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">

                <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>

                <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
                
                <style>
                ul.myList {{
                  list-style-type:disc;
                }}
                
                </style>
                <body>
                <br/>

                <div style="position: sticky; top: 0;">
                
                <ul>
                
                <li style='color:olive;'> <a href="#exactm"> Exact Match </a></li>
                
                <li style="list-style-type: none;"> Insertions or Deletions

                <ul class="myList">
                
                <li style='color:red;'> <a href="#partm">Subregulation</a> </li>
                <li style='color:black;'> <a href="#nomen"> Regulation</a></li >
                
                </ul>
                </li>
                
                <li style='color:blue;'>  <a href="#partm"> Edits </a></li>
                
                </ul>
                </div>
                
                
                <div class="container pt-3">
                <table class="table table-hover table-bordered">
                <tr>
                <th>{doc_names_0}</th>
                <th>{doc_names_1}</th>
                </tr>
                <div id="exactm"></div>
                """
    
    # Only the matched triples and the two "present on one side only" index
    # lists are used here; the 2nd and 3rd return values are ignored.
    matched, _, _, only_in_d1, only_in_d2 = get_matching_regs(list_d1, list_d2)
    
    rows = []
    # True until the single 'partm' anchor has been emitted.
    red_blue_flag = True
    
    for _, i2, i3 in matched:
        rid1 , s1 = list_d1[i2]
        rid2 , s2 = list_d2[i3]
        
        # One rendered slot per piece; slots not assigned below stay empty.
        a1 = [""]*len(s1)
        a2 = [""]*len(s2)
        
        f, p, n = match_two_regs_for_html(s1,s2)
        
        # full matches
        for e1,e2 in f:
            a1[e1] = f"<p style='color:olive;'>{s1[e1]}</p>"
            a2[e2] = f"<p style='color:olive;'>{s2[e2]}</p>"
        
        # non matches: n[0] indexes s1, n[1] indexes s2
        for e1 in n[0]:
            if red_blue_flag:
                a1[e1] = f" <div id='partm'> <p style='color:red;'>{s1[e1]}</p> </div> "
                red_blue_flag = False
            else:
                a1[e1] = f"<p style='color:red;'>{s1[e1]}</p>"

        for e2 in n[1]:
            if red_blue_flag:
                a2[e2] = f"<div id='partm'><p style='color:red;'>{s2[e2]}</p></div>"
                red_blue_flag = False
            else:
                a2[e2] = f"<p style='color:red;'>{s2[e2]}</p>"
        
        # partial matches: word lists plus a per-word colour map for each side
        for words1, words2, e1, e2, color_map1, color_map2 in p:
            a1[e1] = f"<p>{_colour_words(words1, color_map1)}</p>"
            a2[e2] = f"<p>{_colour_words(words2, color_map2)}</p>"
        
        rows.append(f"""<tr>
        <td>{rid1} {"".join(a1)}</td>
        <td>{rid2} {"".join(a2)}</td>
        </tr>""")
        
    # True until the single 'nomen' anchor has been emitted.
    check_nonem = True
    
    for ele in only_in_d1:
        rid, con = list_d1[ele]
        if check_nonem:
            rows.append(f"""<tr>
                    <td><div id="nomen">{rid} {"<br/>".join(con)}</div></td>
                    <td></td>
                    </tr>""")
            check_nonem = False
        else:
            rows.append(f"""<tr>
            <td>{rid} {"<br/>".join(con)}</td>
            <td></td>
            </tr>""")

    for ele in only_in_d2:
        rid, con = list_d2[ele]
        if check_nonem:
            rows.append(f"""<tr>
                    <td></td>
                    <td><div id="nomen">{rid} {"<br/>".join(con)}</div></td>
                    </tr>""")
            check_nonem = False
        else:
            rows.append(f"""<tr>
            <td></td>
             <td>{rid} {"<br/>".join(con)}</td>
            </tr>""")
    
    # Close the elements in the reverse of the order they were opened
    # (html > body > div.container > table).
    return html_obj + "".join(rows) + "</table></div></body></html>"


def _colour_words(words, color_map):
    """Wrap each word in a span: olive when its colour map entry is 0, blue otherwise."""
    spans = []
    for word, color in zip(words, color_map):
        color_string = "olive" if color == 0 else "blue"
        spans.append(f"<span style='color:{color_string}'>{word}</span>")
    return ' '.join(spans)

In [63]:
def generate_comparision_list(list_d1, list_d2, doc_names=None, doc_type=None):
    """
    Return the raw content behind a two-version comparison as a plain list.

    list_d1, list_d2 : lists of [regulation id, content], one per version.
    doc_names, doc_type : accepted so the signature mirrors
        generate_html_for_amendments; they do not influence the result. They
        are no longer formatted into headings that were thrown away, which
        made the default doc_names raise inside split_date_string.

    Each returned entry is [content_1, content_2, rid_1, rid_2]:
      - regulations paired by get_matching_regs carry both sides;
      - regulations found only in list_d1 are [content, "", rid, "0"];
      - regulations found only in list_d2 are ["", content, "0", rid].
    """
    # Only the matched triples and the two one-sided index lists are used.
    matched, _, _, only_in_d1, only_in_d2 = get_matching_regs(list_d1, list_d2)
    
    compare_arr = []
    
    for _, i2, i3 in matched:
        rid1 , s1 = list_d1[i2]
        rid2 , s2 = list_d2[i3]
        compare_arr.append([s1,s2,rid1,rid2])
    
    for ele in only_in_d1:
        rid, con = list_d1[ele]
        compare_arr.append([con, "", rid, "0"])

    for ele in only_in_d2:
        rid, con = list_d2[ele]
        compare_arr.append(["",con,"0",rid])
    
    return compare_arr

In [79]:
def standalone_html(list_d1, doc_names=None, doc_type=None):
    """
    Render one version of a regulation document as a single-column HTML table.

    list_d1   : list of [regulation id, list of text pieces, rationale]; an empty
                rationale string means the row has none.
    doc_names : one-element list with the version name used as the column
                heading (passed through split_date_string with doc_type).
    doc_type  : label prefixed to the heading.

    Rows with a rationale get it appended in blue under the anchor id
    "rationale<k>" (k counts such rows from 0), and a list of links to those
    anchors is emitted above the heading row.

    Returns the whole page as one string.
    """
    if doc_names is None:
        doc_names = ["Document 1"]
    
    doc_names_0 = split_date_string(doc_names[0] , doc_type)
    # One navigation link per row that carries a rationale.
    count_tot = sum(1 for _, _, match in list_d1 if match != "")
    count_string = "".join(
        f"""<li style='color:olive;'> <a href="#rationale{it}"> Rationale {it} </a></li>"""
        for it in range(count_tot)
    )
    
    # NOTE(review): the sticky <div> sits directly inside <table>, which is not
    # valid table content; it is kept where it was so the rendered page does not change.
    html_obj = f"""
                <html>
                <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">

                <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>

                <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
                
                <style>
                ul.myList {{
                  list-style-type:disc;
                }}
                
                </style>
                
                <body>
                <br/>
                
                <div class="container pt-3">
                <table class="table table-hover table-bordered">
                <div style="position: sticky; top: 0;">
                <ul>{count_string}</ul>
                </div>
                <tr>
                <th>{doc_names_0}</th>
                </tr>
                """
    
    rows = []
    count = 0
    for rid, ele, match in list_d1:
        condition_string = ""
        if match != "":
            condition_string = f"""<br/><br/><p style='color:blue;' id="rationale{count}">Rationale : {match}</p>"""
            count += 1
        rows.append(f"""<tr><td>{rid} {" ".join(ele)}""" + condition_string + """</td></tr>""")
    
    # Close the elements in the reverse of the order they were opened
    # (html > body > div.container > table).
    return html_obj + "".join(rows) + "</table></div></body></html>"

In [65]:
def generate_standalone_list(list_d1, doc_names=None, doc_type=None):
    """
    Flatten every [regulation id, pieces] row into [regulation id, text],
    where text is the pieces joined with single spaces.

    doc_names and doc_type are accepted but do not influence the result.
    """
    return [[rid, " ".join(ele)] for rid, ele in list_d1]

In [66]:
# with open('../../data/FOR_SENDING/amednments_comparision.html','w') as f:
#     f.write(returned_html)

In [67]:
# """
# """
# for file in glob('../home/buggi/data')
# with open('/home/buggi/RA/data/FOR_SENDING/all_pit_regulations.json','r') as f:
#     all_pit = json.load(f)

# from collections import defaultdict
# all_pit_cleaned = {}

# for ele in all_pit:
#     print(ele)
#     all_pit_cleaned[ele] = []
#     keyl = list(all_pit[ele].keys())[1:]
#     for chap in keyl:
#         for reg in all_pit[ele][chap]:
#             if reg != 'chap_title':
#                 content = all_pit[ele][chap][reg]["content"]
#                 if type(content) is str:
#                     cont = content
#                 else:
#                     cont = " ".join(content)
#                 all_pit_cleaned[ele].append([reg,"",cont.split(".",1)[1].strip()])

# with open('/home/buggi/RA/data/FOR_SENDING/all_pit_supressed.json','w') as f:
#     json.dump(all_pit_cleaned,f)

In [68]:
# all_pit_cleaned.keys()
from datetime import datetime
def date_convo(date_string):
    """
    Parse a version key such as "Aug7_2003" into a datetime.

    Anything from the first '.' onwards is ignored before parsing.
    """
    stem = date_string.partition('.')[0]
    return datetime.strptime(stem, '%b%d_%Y')

In [69]:
# Running code for Mutual Funds comparision
# import json
# with open('/home/buggi/Downloads/Prohibition_insider_trading.json') as file:
#     clatum = json.load(file)

# all_pit_cleaned = {ele : all_pit_cleaned[ele] for ele in all_pit_cleaned if "2019" not in all_pit_cleaned}

# datum = {}

# for ele in all_pit_cleaned:
#     datum[ele] = all_pit_cleaned[ele]
# for ele in clatum:
#     datum[ele] = clatum[ele]

# key_list = ['Aug7_2003', 'Aug16_2011', 'Jan15_2015', 'Dec31_2018', 'Jan21_2019', 'Sept17_2019', 'Nov11_2019', 'July17_2020', 'Oct29_2020']

# dc = {}
# for ele in key_list:
#     dc[ele] = datum[ele]

# with open('/home/buggi/RA/data/FOR_SENDING/all_pit_combined.json','r') as f:
#     datum = json.load(f)
# Disabled legacy driver: the condition below is always false, so nothing in
# this block runs. It is kept as a record of how the PIT and Mutual Funds demo
# pages under ../demo/All_HTMLS/ were produced.
if True == False:
    datum = {}
    from itertools import groupby

    key_list = list(datum.keys())
    print(key_list)

    def better_key(keyn):
        """Shorten the month names "Sept" and "July" in a key to "Sep" and "Jul"."""
        if "Sept" in keyn:
            return keyn.replace("Sept","Sep")
        elif "July" in keyn:
            return keyn.replace("July","Jul")
        else:
            return keyn

    # PIT: one comparison page per pair of consecutive keys.
    for i,key1 in enumerate(key_list[:-1]):
            key2 = key_list[i+1]
            d1 = datum[key1]
            d2 = datum[key2]

            list_d1 = process_regs_list(d1)
            list_d2 = process_regs_list(d2)
            generated_html_comparer = generate_html_for_amendments(list_d1, list_d2,doc_names=[better_key(key1),better_key(key2)],doc_type="PIT Regulations")
            with open(f"../demo/All_HTMLS/PIT_Regs/comparision/{better_key(key1)}vs{better_key(key2)}.html",'w') as f:
                f.write(generated_html_comparer)

    # PIT: one standalone page per key.
    # NOTE(review): process_regs_list yields 2-item rows while standalone_html
    # unpacks 3 items per row - confirm before re-enabling this block.
    for i,key1 in enumerate(key_list):
            d1 = datum[key1]

            list_d1 = process_regs_list(d1)        
            standalone_html_comparer = standalone_html(list_d1, doc_names=[better_key(key1)], doc_type="PIT Regulations")
            with open(f"../demo/All_HTMLS/PIT_Regs/standalone/{better_key(key1)}.html",'w') as f:
                f.write(standalone_html_comparer)

    # Running code for Mutual Funds comparision
    import json
    with open('/home/buggi/Downloads/Mutual_Funds.json') as file:
        datum = json.load(file)

    key_list = list(datum.keys())

    returned_html = ""
    from itertools import groupby

    # Mutual Funds: one comparison page per pair of consecutive keys.
    for i,key1 in enumerate(key_list[:-1]):
            key2 = key_list[i+1]
            d1 = datum[key1]
            d2 = datum[key2]

            list_d1 = process_regs_list(d1)
            list_d2 = process_regs_list(d2)

            generated_html_comparer = generate_html_for_amendments(list_d1, list_d2,doc_names=[better_key(key1),better_key(key2)],doc_type="Mutual Funds Regulations")
            with open(f"../demo/All_HTMLS/Mutual_Funds_Regs/comparision/{better_key(key1)}vs{better_key(key2)}.html",'w') as f:
                f.write(generated_html_comparer)

    # Mutual Funds: one standalone page per key.
    for i,key1 in enumerate(key_list):
            d1 = datum[key1]

            list_d1 = process_regs_list(d1)        
            standalone_html_comparer = standalone_html(list_d1, doc_names=[better_key(key1)],doc_type="Mutual Funds Regulations")
            with open(f"../demo/All_HTMLS/Mutual_Funds_Regs/standalone/{better_key(key1)}.html",'w') as f:
                f.write(standalone_html_comparer)


In [70]:
import hashlib
def get_file_name(url, ext):
    """
    Return the last path segment of url, cut at its first '.', with ext appended.
    """
    last_segment = url.rpartition('/')[2]
    stem = last_segment.partition('.')[0]
    return stem + ext


def get_file_name_hashed(url, ext):
    """
    Return the hexadecimal MD5 digest of url (encoded with str.encode's default)
    followed by ext.
    """
    digest = hashlib.md5(url.encode()).hexdigest()
    return digest + ext

In [71]:
from glob import glob
# Index every scraped document record so grep hits can later be looked up by
# file name. The key is the stem of "file_url" when the record has one,
# otherwise the MD5 of the page "url".
all_files_dict = {}
for file in glob('../document_scraping/script_folder/document_all_*.json'):
    print(file)
    with open(file,'r') as f:
        jsonf = json.load(f)
    for ele in jsonf:
        # Best effort: records that lack a field or have an unexpected shape
        # are skipped. Only those failures are swallowed, so interrupts and
        # genuine programming errors still surface.
        try:
            if ele["file_url"] is not None:
                key = get_file_name(ele["file_url"],"")
            else:
                key = get_file_name_hashed(ele["url"],"")
            record = {"time": ele["time"],
                      "text": ele["text"],
                      "url" : ele["url"],
                      "base": file}
        except (KeyError, TypeError, AttributeError):
            continue
        all_files_dict[key] = record

../document_scraping/script_folder/document_all_circulars.json
../document_scraping/script_folder/document_all_debt_offer_documents_draft_filed_with_se.json
../document_scraping/script_folder/document_all_acts.json
../document_scraping/script_folder/document_all_rules.json
../document_scraping/script_folder/document_all_public_issues_other_documents.json
../document_scraping/script_folder/document_all_final_placement_memorandum_filed_with_sebi.json
../document_scraping/script_folder/document_all_invit_public_offer_docs_filed_with_sebi.json
../document_scraping/script_folder/document_all_investor_survey.json
../document_scraping/script_folder/document_all_placement_memorandum_filed_with_sebi.json
../document_scraping/script_folder/document_all_filings_processing_status.json
../document_scraping/script_folder/document_all_takeovers_other_documents.json
../document_scraping/script_folder/document_all_annual_reports.json
../document_scraping/script_folder/document_all_offer_documents_fille

In [72]:
def references_html(list_d1, doc_names=None, doc_type=None):
    """
    Render the documents that reference a regulation version as an HTML table.

    list_d1   : list of dicts with the keys "dtm" (sort key), "time", "url",
                "text" and "content". "content" is inserted as-is, so any
                markup it carries (e.g. <mark> highlights) is rendered.
    doc_names : one-element list with the version name used as the table
                title (passed through split_date_string with doc_type).
    doc_type  : label prefixed to the title.

    Rows are emitted in ascending "dtm" order. The caller's list is left
    untouched; a sorted copy is used instead of sorting in place.

    Returns the whole page as one string.
    """
    if doc_names is None:
        doc_names = ["Document 1"]
    
    doc_names_0 = split_date_string(doc_names[0] , doc_type)
    html_obj = f"""
                <html>
                <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">

                <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>

                <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
                <body>
                <br/>
                
                <div class="container pt-3">
                <table class="table table-hover table-bordered">
                <tr>
                <th colspan="3" class="text-center">{doc_names_0}</th>
                </tr>
                
                <tr>
                <th>Time</th>
                <th>Document</th>
                <th>Match </th>
                </tr>
                """
    
    rows = []
    for ele in sorted(list_d1, key=lambda x: x["dtm"]):
        rows.append(f"""<tr>
        <td>{ele["time"]}</td>
        <td><a href='{ele["url"]}'>{ele["text"]}</a></td>
        <td>{ele["content"]}</td>
        </tr>""")
    
    # Close the elements in the reverse of the order they were opened
    # (html > body > div.container > table).
    return html_obj + "".join(rows) + "</table></div></body></html>"

In [73]:
def remove_dups(x):
    """Return list(set(x)) when x is exactly a list; return x itself otherwise."""
    if type(x) is not list:
        return x
    return list(set(x))
def clean_grep_out(inp):
    """
    Reduce one grep result group to (file stem, matched text).

    The file name is the part of the first row before its first '-' (or,
    when the row has no '-', before its first ':'). That many characters plus
    one separator are cut from the start of every row, rows containing "Page"
    are dropped, and the remainder is joined with single spaces.

    Returns (base name of the file cut at its first '.', joined text).
    """
    rows = inp.strip().split('\n')
    head = rows[0]

    separator = '-' if '-' in head else ':'
    file_name = head.partition(separator)[0]
    skip = len(file_name) + 1

    kept = []
    for row in rows:
        if "Page" in row:
            continue
        kept.append(row[skip:])

    stem = os.path.basename(file_name).partition(".")[0]
    return stem, " ".join(kept)

In [74]:
import re

def detect_pattern(content,other_patterns):
    """
    Wrap every match of the built-in or extra patterns in <mark>...</mark>.

    content        : text to highlight.
    other_patterns : iterable of additional regular-expression alternatives;
                     empty strings are ignored. They are used as regex source,
                     not escaped (NOTE(review): callers in this file pass plain
                     phrases - confirm none contain regex metacharacters).

    All text outside the matches is preserved, including the text between two
    consecutive matches. Returns the highlighted string.
    """
    alternatives = [
        # NOTE(review): the original listed this alternative twice; the second
        # copy may have been meant as "rationale.*amendment" - confirm.
        r"amendment.*rationale",
        r"regulation.*amendment",
        r"amendment.*regulation",
        r"SEBI \(.*?\) Regulations",
    ]
    # Skipping empty extras avoids a trailing "|" alternative, which would
    # match the empty string at every position.
    alternatives.extend(pat for pat in other_patterns if pat)
    pattern = re.compile("|".join(alternatives))
    return pattern.sub(lambda match: f"<mark>{match.group()}</mark>", content)

In [75]:
# with open('../../data/glossary.json','r') as f:
#     glossary = json.load(f)
# with open('../../data/all_reguls_list.json','r') as f:
#     regs_names = json.load(f)

# file_names = [os.path.basename(file).split('.')[0].replace('_',' ') for file in tqdm(sorted(glob('../../data/ALL_REGULATIONS_JSON_FLATTENED/*.json')))]
# # print(file_names)

# dict_fretaly = {}
# glossary_reved = {}

# for key in glossary:
#     for elem in glossary[key]:
#         if elem in glossary_reved:
#             glossary_reved[elem].append(key)
#         else:
#             glossary_reved[elem] = [key]

# for ele in glossary_reved:
#     key = regs_names[ele]
#     dict_fretaly[key] = glossary_reved[ele] + [key.lower()]

# print(dict_fretaly)
# with open('../../data/filename_to_refs.json','w') as f:
#     json.dump(dict_fretaly)

# from custom_functions import write_file
# filename_to_refs = write_file('../../data/filename_to_refs.json',dict_fretaly)

# Load the lookup used by the comments step further down, which calls
# filename_to_refs.get(<regulation title>, <fallback list>) to obtain the
# phrases searched for in the grep output.
with open('../../data/filename_to_refs.json','r') as f:
    filename_to_refs = json.load(f)

In [80]:
## Iterates all jsons and generates HTML
from glob import glob
from itertools import groupby
output_folder = '../../data/ALL_REGULATIONS_HTML/'
from tqdm import tqdm as tqdm
import os
import pickle

# Switches selecting which artefacts are (re)generated for every regulation file.
STANDALONES = True
COMPARISION = False
COMMENTS = False
COMPARISION_CONTENT = False
STANDALONE_CONTENT  = False

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load this pickle from a trusted source.
with open('../../data/regulation_rationale_match.pkl','rb') as f:
    reg_rat_dict = pickle.load(f)
# print(reg_rat_dict)

for file in tqdm(reversed(sorted(glob('../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/*.json')))):
    print(file)
    with open(file,'r') as f:
        datum = json.load(f)
    
    # Version keys in chronological order.
    key_list = sorted(list(datum.keys()), key=date_convo)
    
    folder_name = os.path.basename(file).split('.')[0]
    
    output_subfolder = os.path.join(output_folder, folder_name)
    # makedirs also creates output_folder itself when it is missing and does
    # not fail when the folder already exists.
    os.makedirs(output_subfolder, exist_ok=True)
    
    doc_t = " ".join(folder_name.split('_'))
    
    if COMPARISION_CONTENT:
        # Raw comparison content (JSON) for every pair of consecutive versions.
        for key1, key2 in zip(key_list, key_list[1:]):
            d1 = datum[key1]
            d2 = datum[key2]
            
            list_d1 = process_regs_list(d1)
            list_d2 = process_regs_list(d2)
            
            generated_html_comparer_arr = generate_comparision_list(list_d1, list_d2,doc_names=[key1,key2],doc_type=doc_t)
            fol_name = os.path.join(output_subfolder,'comparision_content')
            os.makedirs(fol_name, exist_ok=True)

            with open(os.path.join(fol_name,f"{key1}vs{key2}.json"),'w') as f:
                json.dump(generated_html_comparer_arr,f)
                    
    if COMPARISION:
        # Side-by-side HTML for every pair of consecutive versions.
        for key1, key2 in zip(key_list, key_list[1:]):
            d1 = datum[key1]
            d2 = datum[key2]
            
            list_d1 = process_regs_list(d1)
            list_d2 = process_regs_list(d2)
            
            generated_html_comparer = generate_html_for_amendments(list_d1, list_d2,doc_names=[key1,key2],doc_type=doc_t)
            fol_name = os.path.join(output_subfolder,'comparision')
            os.makedirs(fol_name, exist_ok=True)

            with open(os.path.join(fol_name,f"{key1}vs{key2}.html"),'w') as f:
                f.write(generated_html_comparer)
    
    if STANDALONES:
        # One HTML page per version, with rationales attached where known.
        for key1 in key_list:
            d1 = datum[key1]

            list_d1 = process_regs_list(d1)
            # Third slot holds the rationale text; empty means none.
            new_list_d1 = [ele + [""] for ele in list_d1]
            
            # reg_rat_dict keys are indexed as (document title, version key, row index).
            for rk in reg_rat_dict:
                if rk[0] == doc_t and rk[1] == key1:
                    new_list_d1[rk[2]][2] = reg_rat_dict[rk]
            
            standalone_html_comparer = standalone_html(new_list_d1, doc_names=[key1], doc_type=doc_t)
            
            fol_name = os.path.join(output_subfolder,'standalone')
            os.makedirs(fol_name, exist_ok=True)

            with open(os.path.join(fol_name,f"{key1}.html"),'w') as f:
                f.write(standalone_html_comparer)
    
    if STANDALONE_CONTENT:
        # Raw per-version content (JSON).
        for key1 in key_list:
            d1 = datum[key1]

            list_d1 = process_regs_list(d1)
            standalone_html_content = generate_standalone_list(list_d1, doc_names=[key1], doc_type=doc_t)
            
            fol_name = os.path.join(output_subfolder,'standalone_content')
            os.makedirs(fol_name, exist_ok=True)

            with open(os.path.join(fol_name,f"{key1}.json"),'w') as f:
                json.dump(standalone_html_content,f)
                    
    if COMMENTS:
        from datetime import datetime
        
        # Collect grep hits that mention this regulation.
        list_ = []
        key_for_glossary = os.path.basename(folder_name.replace('_',' '))
        matchdct = filename_to_refs.get(key_for_glossary, [key_for_glossary.lower()])
        for reg_file in glob('../document_scraping/*.txt'):
            with open(reg_file,'r') as f:
                all_matches = f.read().split('--')
            all_matches = [clean_grep_out(ele) for ele in all_matches]
            for filen, line in all_matches:
                if any(ele in line for ele in matchdct):
                    line = detect_pattern(line,matchdct)
                    list_.append((filen, line, reg_file))
        
        # Assign each hit to the earliest version dated on or after the
        # referencing document; set_indices remembers hits already placed or
        # discarded so they are not considered again.
        final_dict = {}
        set_indices = set()
        
        for key_date in key_list:
            # Same parser as the one used to sort key_list, so any key that
            # sorted successfully is also accepted here.
            dt1 = date_convo(key_date)
            final_dict[key_date] = []
            
            for i,ele in enumerate(list_):
                if i in set_indices:
                    continue

                # Hit from a file that was never indexed: drop it.
                if ele[0] not in all_files_dict:
                    set_indices.add(i)
                    continue

                tim = all_files_dict[ele[0]]["time"]
                dt2 = datetime.strptime(tim, '%b %d, %Y')
                if dt2 <= dt1:
                    # Copy the record: several hits can come from the same
                    # file, and writing into the shared all_files_dict entry
                    # would make them all show the last hit's content.
                    cpd_dict = dict(all_files_dict[ele[0]])
                    cpd_dict["content"] = ele[1]
                    cpd_dict["dtm"] = dt2
                    final_dict[key_date].append(cpd_dict)
                    set_indices.add(i)
        
        # TODO: hits dated after the last version are currently left out.
        
        fol_name = os.path.join(output_subfolder,'comments')
        os.makedirs(fol_name, exist_ok=True)
                
        for key1 in key_list:
            standalone_html_comparer = references_html(final_dict[key1], doc_names=[key1], doc_type=doc_t)
            print_name = os.path.join(fol_name,f"Comments_{key1}.html")
            with open(print_name,'w') as f:
                f.write(standalone_html_comparer)
                
#     break
        

47it [00:00, 846.62it/s]

../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Underwriters.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Substantial_Acquisition_of_Shares_and_Takeovers.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Stock_Exchanges_and_Clearing_Corporations.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Stock_Brokers.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Share_Based_Employee_Benefits.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Settlement_of_Administrative_and_Civil_Proceedings.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Settlement_Proceedings.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Self_Regulatory_Organisations.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Research_Analysts.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Regulatory_Fee_on_Stock_Exchanges.json
../../data/ALL_REGULATIONS_JSON_FLATTENED_SPLIT_SUBREG/Registrars_to_an_Issue_and_Share_Transfer_




In [38]:
# list_ = []
# import os
# for reg_file in glob('../document_scraping/*.txt'):
#     with open(reg_file,'r') as f:
#         all_matches = f.read().split('--')
#         all_matches = [clean_grep_out(ele) for ele in all_matches]
#         for file, line in all_matches:
#             matchdct = [pit]
#             if any([ele in line for ele in matchdct]):
#                 line = detect_pattern(line,matchdct)
#                 list_.append((file, line, reg_file))

# from datetime import datetime 

# count  = 0
# for ele in list_:
#     if ele not in all_files_dict:
#         count += 1
        
# mfsl = ['Aug3_2006', 'Oct31_2007', 'Apr16_2008', 'Apr8_2009', 'Aug30_2011', 'Feb21_2012', 'Sep26_2012', 'Apr16_2013', 'Jun19_2013', 'Aug19_2013', 'May6_2014', 'Dec30_2014', 'May15_2015', 'Mar13_2018', 'Dec6_2018', 'Dec13_2018', 'Sep23_2019', 'Mar6_2020', 'Oct29_2020']

# final_dict = {}
# # setlis = set(list_)
# set_indices = set()

# while len(mfsl) != 0:
#     key_date = mfsl.pop(0)
#     dt1 = datetime.strptime(key_date, '%b%d_%Y')
#     print(dt1)    
#     final_dict[key_date] = []
    
    
#     for i,ele in enumerate(list_):
#         if i in set_indices:
#             continue
        
#         # File not found
#         if ele[0] not in all_files_dict:
#             set_indices.add(i)
#             continue
        
#         tim = all_files_dict[ele[0]]["time"] 
#         dt2 = datetime.s(tim, '%b %d, %Y')
#         if dt2 <= dt1:
#             cpd_dict = all_files_dict[ele[0]]
#             cpd_dict["content"] = ele[1]
#             final_dict[key_date].append(cpd_dict)
#             set_indices.add(i)
            
# key_list = ['Aug3_2006', 'Oct31_2007', 'Apr16_2008', 'Apr8_2009', 'Aug30_2011', 'Feb21_2012', 'Sept26_2012', 'Apr16_2013', 'Jun19_2013', 'Aug19_2013', 'May6_2014', 'Dec30_2014', 'May15_2015', 'Mar13_2018', 'Dec6_2018', 'Dec13_2018', 'Sept23_2019', 'Mar6_2020', 'Oct29_2020']

# dte = {'Sep23_2019': 'Comments2',
#  'Aug19_2013': 'Comments9',
#  'Jun19_2013': 'Comments10',
#  'Dec30_2014': 'Comments7',
#  'Apr8_2009': 'Comments15',
#  'Sep26_2012': 'Comments12',
#  'Oct29_2020': 'Comments0',
#  'Apr16_2013': 'Comments11',
#  'Aug30_2011': 'Comments14',
#  'May15_2015': 'Comments6',
#  'Mar6_2020': 'Comments1',
#  'Dec6_2018': 'Comments4',
#  'Dec13_2018': 'Comments3',
#  'Aug3_2006': 'Comments18',
#  'Oct31_2007': 'Comments17',
#  'Feb21_2012': 'Comments13',
#  'May6_2014': 'Comments8',
#  'Mar13_2018': 'Comments5',
#  'Apr16_2008': 'Comments16'}

# if False:
#     for i,key1 in enumerate(key_list):
#             bk = better_key(key1)
#             standalone_html_comparer = references_html(final_dict[bk], doc_names=[better_key(key1)], doc_type="Mutual Funds Regulations")

#             with open(f"../demo/All_HTMLS/Mutual_Funds_Regs/comments/{dte[better_key(key1)]}.html",'w') as f:
#                 f.write(standalone_html_comparer)
                
# list_ = []
# import os
# for reg_file in glob('../document_scraping/*.txt'):
#     with open(reg_file,'r') as f:
#         all_matches = f.read().split('--')
#         all_matches = [clean_grep_out(ele) for ele in all_matches]
#         for file, line in all_matches:
#             matchdct = ["Prohibition of  Insider  Trading", "PIT","Insider Trading","insider trading"]
#             if any([ele in line for ele in matchdct]):
#                 line = detect_pattern(line,matchdct)
#                 list_.append((file, line, reg_file))

# print(len(list_))
# from datetime import datetime 

# count  = 0
# for ele in list_:
#     if ele not in all_files_dict:
#         count += 1
# print(count, len(list_))        

# pitsl = ['Aug7_2003', 'Aug16_2011', 'Jan15_2015', 'Dec31_2018', 'Jan21_2019', 'Sep17_2019', 'Nov11_2019', 'Jul17_2020', 'Oct29_2020'] 

# final_dict = {}
# # setlis = set(list_)
# set_indices = set()

# while len(pitsl) != 0:
#     key_date = pitsl.pop(0)
#     dt1 = datetime.strptime(key_date, '%b%d_%Y')
#     print(key_date,dt1)    
#     final_dict[key_date] = []
    
    
#     for i,ele in enumerate(list_):
#         if i in set_indices:
#             continue
        
#         # File not found
#         if ele[0] not in all_files_dict:
#             set_indices.add(i)
#             continue
        
#         tim = all_files_dict[ele[0]]["time"] 
#         dt2 = datetime.strptime(tim, '%b %d, %Y')
#         if dt2 <= dt1:
#             cpd_dict = all_files_dict[ele[0]]
#             cpd_dict["content"] = ele[1]
#             final_dict[key_date].append(cpd_dict)
#             set_indices.add(i)
            
# key_list = ['Aug7_2003', 'Aug16_2011', 'Jan15_2015', 'Dec31_2018', 'Jan21_2019', 'Sept17_2019', 'Nov11_2019', 'July17_2020', 'Oct29_2020']

# dte = {'Aug7_2003': 'Comments8',
#  'Aug16_2011': 'Comments6',
#  'Jan15_2015': 'Comments3',
#  'Dec31_2018': 'Comments5',
#  'Jan21_2019': 'Comments4',
#  'Sep17_2019': 'Comments2',
#  'Nov11_2019': 'Comments1',
#  'Jul17_2020': 'Comments0',
#  'Oct29_2020': 'Comments7'}


# if False:
#     for i,key1 in enumerate(key_list):
#             bk = better_key(key1)

#             standalone_html_comparer = references_html(final_dict[bk], doc_names=[better_key(key1)], doc_type="PIT Regulations")

#             with open(f"../demo/All_HTMLS/PIT_Regs/comments/{dte[better_key(key1)]}.html",'w') as f:
#                 f.write(standalone_html_comparer)

#             if better_key(key1) == "Sep17_2019":
#                 print(dte[bk])
#                 copyfile("../demo/safecopy.html", f"../demo/All_HTMLS/PIT_Regs/comments/{dte[better_key(key1)]}.html")

In [41]:
"REGULATIONS MATCH"

import re
# Print every collected hit whose text cites a numbered regulation
# ("regulation 12", "regulation 3(2)", "regulation 7a", ...) and count them.
REG_CITATION = re.compile(r"regulation [1-9][0-9]*(\([1-9][0-9]*\)|[a-z]{0,2})")

c = 0
for ele in list_:
    # Compare on lower-cased text with the highlight tags stripped.
    cnt = ele[1].lower().replace('<mark>','').replace('</mark>','')
    regspecmat = [hit.group() for hit in REG_CITATION.finditer(cnt)]
    if not regspecmat:
        continue
    print(regspecmat)
    print(cnt)
    print("-----------------------------")
    c += 1

print(c)

NameError: name 'list_' is not defined

In [40]:
import os
from glob import glob

# For every file matched by the glob: split its contents on '--', parse each
# chunk with clean_grep_out into a (filename, line) pair, and append to list_
# the pairs whose line contains at least one of the reference terms.
# NOTE(review): the '--' separator and the helper name suggest the file is
# grep output with context separators -- confirm.
for reg_file in glob('../document_scraping/all_amend_rational.match.txt'):
    with open(reg_file, 'r') as f:
        raw_chunks = f.read().split('--')
    all_matches = [clean_grep_out(chunk) for chunk in raw_chunks]

    # Glossary key: folder_name with underscores turned into spaces, last path component only.
    key_for_glossary = os.path.basename(folder_name.replace('_', ' '))
    # Terms to look for; defaults to the lower-cased key when the key has no entry.
    matchdct = filename_to_refs.get(key_for_glossary, [key_for_glossary.lower()])

    for filen, line in all_matches:
        if not any([term in line for term in matchdct]):
            continue
        line = detect_pattern(line, matchdct)
        list_.append((filen, line, reg_file))

NameError: name 'filename_to_refs' is not defined

In [55]:
# Read the match file line by line and, for each line, record two prefixes in
# `a` (in this order): the text before the first '-' and the text before the
# first ':'. A line without the separator contributes the whole line.
with open('../document_scraping/all_amend_rational.match.txt') as f:
    all_matches = f.read().split('\n')

a = []
for ele in all_matches:
    # partition(sep)[0] is everything before the first sep (or the full string).
    a.extend((ele.partition('-')[0], ele.partition(':')[0]))

In [None]:
# set(a)

In [42]:
# FOR LATEST HTMLS

# list_ = []
# import os
# for reg_file in glob('../document_scraping/*.txt'):
#     with open(reg_file,'r') as f:
#         all_matches = f.read().split('--')
#         all_matches = [clean_grep_out(ele) for ele in all_matches]
#         for file, line in all_matches:
#             matchdct = ["Prohibition of  Insider  Trading", "PIT","Insider Trading","insider trading"]
#             if any([ele in line for ele in matchdct]):
#                 line = detect_pattern(line,matchdct)
#                 list_.append((file, line, reg_file))

# print(len(list_))
# from datetime import datetime 

# count  = 0
# for ele in list_:
#     if ele not in all_files_dict:
#         count += 1
# print(count, len(list_))        

# pitsl = ['Jan28_2021']
# # , 'Aug16_2011', 'Jan15_2015', 'Dec31_2018', 'Jan21_2019', 'Sep17_2019', 'Nov11_2019', 'Jul17_2020', 'Oct29_2020'] 

# final_dict = {}
# # setlis = set(list_)
# set_indices = set()

# while len(pitsl) != 0:
#     key_date = pitsl.pop(0)
#     dt1 = datetime.strptime(key_date, '%b%d_%Y')
#     print(key_date,dt1)    
#     final_dict[key_date] = []
    
    
#     for i,ele in enumerate(list_):
#         if i in set_indices:
#             continue
        
#         # File not found
#         if ele[0] not in all_files_dict:
#             set_indices.add(i)
#             continue
        
#         tim = all_files_dict[ele[0]]["time"] 
#         dt2 = datetime.strptime(tim, '%b %d, %Y')
#         if dt2 <= dt1:
#             cpd_dict = all_files_dict[ele[0]]
#             cpd_dict["content"] = ele[1]
#             cpd_dict["dtm"] = dt2
#             final_dict[key_date].append(cpd_dict)
#             set_indices.add(i)
            
# key_list = ['Jan28_2021']

# dte = {'Jan28_2021': 'Comments0'}


# if True:
#     for i,key1 in enumerate(key_list):
#             bk = better_key(key1)

#             standalone_html_comparer = references_html(final_dict[bk], doc_names=[better_key(key1)], doc_type="PIT Regulations")
#             print(f"../../data/FOR_SENDING/{dte[better_key(key1)]}.html")
#             with open(f"../../data/FOR_SENDING/{dte[better_key(key1)]}.html",'w') as f:
#                 f.write(standalone_html_comparer)

# al = final_dict['Jan28_2021']
# al.sort(key=lambda x: x["dtm"])

# # from custom_functions import write_file
# stand_html = references_html(list_, doc_names=["Jan30_2021"], doc_type="PIT Regulations")
# with open('../../data/FOR_SENDING/all_pit_refs.html','w') as f:
#     f.write(stand_html)