## Jaccard similarity

In [109]:
from ltp import LTP
# Create an LTP instance with its default constructor arguments; it is used
# below for word segmentation.
ltp = LTP()

# Two sample phrases to compare.
# NOTE(review): sentence1 ends with a closing '》' that has no matching '《';
# it comes out as a separate token in the printed segmentation.
sentence1 = '上海市市级科技重大专项》'
sentence2= '上海市国家级科研重大项目'

# seg() is called with a one-element list per sentence; [0][0] selects the
# token list for that single sentence (see the printed lists).
segment1 = ltp.seg([sentence1])[0][0]
segment2 = ltp.seg([sentence2])[0][0]
print(segment1)
print(segment2)

['上海市', '市级', '科技', '重大', '专项', '》']
['上海市', '国家级', '科研', '重大', '项目']


In [3]:
def jaccard(text1, text2):
    """Return the Jaccard similarity of two token collections.

    The similarity is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets
    of distinct items in ``text1`` and ``text2``.

    Parameters
    ----------
    text1, text2 : iterable of hashable items
        Typically lists of word tokens. Duplicates are ignored. Passing a
        plain string compares its set of characters.

    Returns
    ----------
    float
        A value in ``[0, 1]``. ``0.0`` is returned when both inputs are empty
        (the ratio would otherwise be 0/0).
    """
    # Use the arguments, not notebook-level variables, so the result depends
    # only on what the caller passes in.
    tokens1, tokens2 = set(text1), set(text2)
    union = tokens1 | tokens2
    if not union:
        return 0.0
    return len(tokens1 & tokens2) / len(union)

# Token lists typed in by hand: the same tokens as the segmentation printed
# earlier in the notebook, without the trailing '》' token.
segment1 = ['上海市', '市级', '科技', '重大', '专项']
segment2 = ['上海市','国家级', '科研', '重大', '项目']
# 2 shared tokens ('上海市', '重大') out of 8 distinct tokens -> 0.25.
jaccard(segment1, segment2)

0.25

## The degree of inclusion of text 2 in text 1

In [8]:
class Lexical_analysis:
    """Token-level helpers for comparing a left sentence with a right sentence.

    Both methods are static and operate on token lists (word segments) such
    as those produced by a word segmenter.
    """

    @staticmethod
    def inclusion_rate(left_segment, right_segment):

        '''
        Calculate how much of right_segment is contained in left_segment:
        the number of distinct shared words divided by the number of distinct
        words in right_segment.

        Parameters
        ----------
        input: left_segment, right_segment
            Lists of word tokens. Duplicates are ignored.
        Returns
        ----------
        similarity: float
            Value in [0, 1]. 0.0 when right_segment is empty (avoids 0/0).
        '''
        right_words = set(right_segment)
        if not right_words:
            # Nothing to look for, so nothing is included.
            return 0.0
        intersection = set(left_segment) & right_words
        similarity = len(intersection) / len(right_words)
        return similarity

    @staticmethod
    def get_search_str(left_segment, right_segment, left_sentence):
        '''
        Find the search string that is used to add a footnote in left_sentence:
        the prefix of left_sentence that ends right after the last word shared
        with right_segment (plus one directly following closing mark, if any).

        Parameters
        ----------
        input: left_segment, right_segment, left_sentence
            left_segment is expected to be the segmentation of left_sentence.
        Returns
        ----------
        search_str: string
            Empty string when the two segments share no word, or when the
            chosen word cannot be located in left_sentence.
        '''
        intersection = set(left_segment) & set(right_segment)
        if not intersection:
            # max() over an empty sequence would raise ValueError.
            return ''

        # Largest first-occurrence index of a shared word in left_segment.
        last_token_index = max(left_segment.index(word) for word in intersection)

        # Map that token index to a character offset by walking the tokens in
        # order. This keeps the offset tied to the SAME occurrence that was
        # picked above, even if the word appears again later in the sentence.
        end = 0
        for token in left_segment[:last_token_index + 1]:
            start = left_sentence.find(token, end)
            if start == -1:
                end = -1
                break
            end = start + len(token)

        if end == -1:
            # Tokens do not line up with the sentence text; fall back to
            # locating the word on its own.
            position_word = left_segment[last_token_index]
            found = left_sentence.rfind(position_word)
            if found == -1:
                return ''
            end = found + len(position_word)

        # Keep a closing quote/bracket that directly follows the last word.
        if end < len(left_sentence) and left_sentence[end] in '》”’）':
            end += 1
        search_str = left_sentence[:end]
        return search_str

class Dependency_Parser:
    """Empty placeholder class; it defines no attributes or methods yet."""
    pass

if __name__=="__main__":

    # Demo input: a full sentence (left) and a fragment quoted from it (right).
    # NOTE(review): '20l5' contains a lowercase letter 'l', possibly meant to
    # be '2015'; both strings use the same spelling, so they still match.
    left_sentence = '2015年8月3日，信永中和出具《验资报告》（XYZH[20l5]NJA30092号），对鸿基有限整体变更为股份公司的出资情况进行了验资确认。'
    right_sentence= '《验资报告》（XYZH[20l5]NJA30092号）'


    from ltp import LTP
    ltp = LTP()

    # seg() is called with a one-element list per sentence; [0][0] selects the
    # token list for that single sentence (see the printed lists).
    left_segment = ltp.seg([left_sentence])[0][0]
    right_segment = ltp.seg([right_sentence])[0][0]
    print(left_segment)
    print(right_segment)

    # Share of the right fragment's distinct tokens that also occur on the left.
    similarity = Lexical_analysis.inclusion_rate(left_segment, right_segment)
    print(similarity)

    # Prefix of left_sentence ending at the last token shared with the fragment.
    search_str = Lexical_analysis.get_search_str(left_segment, right_segment, left_sentence)
    print(search_str)


['2015年', '8月', '3日', '，', '信永中和', '出具', '《', '验资', '报告', '》', '（', 'XYZH', '[', '20l5]NJA30092', '号', '）', '，', '对', '鸿基', '有限', '整体', '变更', '为', '股份公司', '的', '出资', '情况', '进行', '了', '验资', '确认', '。']
['《', '验资', '报告', '》', '（', 'XYZH', '[', '20l5]NJA30092', '号', '）']
1.0
2015年8月3日，信永中和出具《验资报告》（XYZH[20l5]NJA30092号）


In [10]:
from itertools import permutations

# Full name and short name of the company used in the examples below.
full_name = '江苏鸿基节能新技术股份有限公司'
short_name = '鸿基节能'

# Every ordered pair of distinct aliases ('发行人', full name, short name).
# The result is materialised into a list and bound to a descriptive name, so
# the builtin `iter` is not shadowed for the rest of the kernel session and
# the pairs can be inspected more than once.
name_pairs = list(permutations(['发行人', full_name, short_name], 2))
name_pairs

[('发行人', '江苏鸿基节能新技术股份有限公司'),
 ('发行人', '鸿基节能'),
 ('江苏鸿基节能新技术股份有限公司', '发行人'),
 ('江苏鸿基节能新技术股份有限公司', '鸿基节能'),
 ('鸿基节能', '发行人'),
 ('鸿基节能', '江苏鸿基节能新技术股份有限公司')]

In [None]:

def replace_company_name(left_sentence, right_sentence,
                         company_full_name=None, company_short_name=None,
                         issuer_alias='发行人'):
    """Rewrite the company alias in right_sentence to the one left_sentence uses.

    Three aliases are considered: ``issuer_alias``, the full company name and
    the short company name. The first matching rule below wins, and every
    occurrence of the source alias in ``right_sentence`` is replaced:

    1. issuer alias in right, full name in left   -> issuer alias => full name
    2. issuer alias in right, short name in left  -> issuer alias => short name
    3. full name in right, issuer alias in left   -> full name => issuer alias
    4. full name in right, short name in left     -> full name => short name
    5. short name in right, issuer alias in left  -> short name => issuer alias
    6. short name in right, full name in left     -> short name => full name

    A short name only counts as present where it stands on its own, not where
    it is merely a part of the full name. Without this, two sentences that
    both already use the full name would match rule 4 and the right one would
    be wrongly shortened.

    Parameters
    ----------
    left_sentence, right_sentence : str
    company_full_name, company_short_name : str, optional
        Default to the notebook-level ``full_name`` / ``short_name`` variables
        when omitted.
    issuer_alias : str, optional
        Generic alias for the company, '发行人' by default.

    Returns
    ----------
    str
        ``right_sentence`` with the alias rewritten, or unchanged when no rule
        applies.
    """
    # The globals are only looked up when the caller does not supply a value.
    full = full_name if company_full_name is None else company_full_name
    short = short_name if company_short_name is None else company_short_name

    def pieces(text):
        # Text between occurrences of the full name (str.split rejects '').
        return text.split(full) if full else [text]

    def has_short(text):
        # True only if the short name occurs outside of any full-name occurrence.
        return any(short in piece for piece in pieces(text))

    def replace_short(text, new):
        # Replace standalone short names, leaving full-name occurrences intact.
        return full.join(piece.replace(short, new) for piece in pieces(text))

    if issuer_alias in right_sentence and full in left_sentence:
        return right_sentence.replace(issuer_alias, full)
    if issuer_alias in right_sentence and has_short(left_sentence):
        return right_sentence.replace(issuer_alias, short)
    if full in right_sentence and issuer_alias in left_sentence:
        return right_sentence.replace(full, issuer_alias)
    if full in right_sentence and has_short(left_sentence):
        return right_sentence.replace(full, short)
    if has_short(right_sentence) and issuer_alias in left_sentence:
        return replace_short(right_sentence, issuer_alias)
    if has_short(right_sentence) and full in left_sentence:
        return replace_short(right_sentence, full)
    return right_sentence
