## Jaccard similarity

In [109]:
from ltp import LTP
# Build an LTP instance with its default arguments.
ltp = LTP()

# Two example phrases to compare.
# NOTE(review): sentence1 ends with a stray '》'; the recorded output shows it
# surviving as its own token, which would distort any token-overlap score.
sentence1 = '上海市市级科技重大专项》'
sentence2= '上海市国家级科研重大项目'

# Each phrase is passed to seg() as a one-element batch; [0][0] is then taken.
# The recorded output shows the result is the list of word tokens for that
# phrase. Presumably seg() returns (segments, extra) — confirm against the
# installed LTP version.
segment1 = ltp.seg([sentence1])[0][0]
segment2 = ltp.seg([sentence2])[0][0]
print(segment1)
print(segment2)

['上海市', '市级', '科技', '重大', '专项', '》']
['上海市', '国家级', '科研', '重大', '项目']


In [3]:
def jaccard(text1, text2):
    """Return the Jaccard similarity of two token collections.

    The score is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    distinct tokens in ``text1`` and ``text2``.

    Parameters
    ----------
    text1, text2 : iterable of hashable
        Token sequences (e.g. lists of segmented words). Duplicates are
        ignored because both inputs are converted to sets. Passing a plain
        string compares its individual characters.

    Returns
    ----------
    result : float
        Value in ``[0.0, 1.0]``; ``0.0`` when both inputs are empty.
    """
    # Work on the arguments themselves, never on module-level variables, so
    # the result depends only on what the caller passed in.
    tokens1, tokens2 = set(text1), set(text2)
    union = tokens1 | tokens2
    if not union:
        # Both inputs are empty: avoid dividing by zero.
        return 0.0
    return len(tokens1 & tokens2) / len(union)

# Hand-written token lists for the two example phrases.
# They share 2 of 8 distinct tokens, matching the recorded result of 0.25.
segment1 = [
    '上海市',
    '市级',
    '科技',
    '重大',
    '专项',
]
segment2 = [
    '上海市',
    '国家级',
    '科研',
    '重大',
    '项目',
]
jaccard(segment1, segment2)

0.25

## Inclusion rate: the share of text 2's words that also appear in text 1

In [2]:
class Lexical_analysis:
    '''
    Word-overlap helpers that operate on pre-segmented sentences.
    '''

    @staticmethod
    def inclusion_rate(left_segment, right_segment):
        '''
        Calculate how much of right_segment is covered by left_segment.

        The score is the number of distinct words shared by both segments
        divided by the number of distinct words in right_segment.

        Parameters
        ----------
        left_segment: list of str, words of the left sentence
        right_segment: list of str, words of the right sentence

        Returns
        ----------
        similarity: float in [0.0, 1.0]; 0.0 when right_segment is empty
        '''
        right_words = set(right_segment)
        if not right_words:
            # Nothing to cover: avoid dividing by zero.
            return 0.0
        return len(set(left_segment) & right_words) / len(right_words)

    @staticmethod
    def get_search_str(left_segment, right_segment, left_sentence):
        '''
        Find the search string used to add a footnote in left_sentence.

        The result is the prefix of left_sentence that ends right after the
        last occurrence of the "position word": the word shared by both
        segments whose first occurrence in left_segment comes latest. If the
        character right after that word is a closing mark (》, ” or ’), it is
        included in the prefix.

        Parameters
        ----------
        left_segment: list of str, words of left_sentence
        right_segment: list of str, words of the sentence being matched
        left_sentence: str, the raw text that left_segment was cut from

        Returns
        ----------
        search_str: string

        Raises
        ----------
        ValueError: if the two segments share no word, or if the position
            word does not occur in left_sentence
        '''
        common_words = set(left_segment) & set(right_segment)
        if not common_words:
            # Previously surfaced as an opaque "max() arg is an empty
            # sequence" ValueError; keep the type, make the cause explicit.
            raise ValueError('left_segment and right_segment share no words')

        # The shared word with the largest (first-occurrence) index in left_segment
        position_word = left_segment[max(left_segment.index(word) for word in common_words)]

        start = left_sentence.rfind(position_word)
        if start == -1:
            # Without this check the -1 would silently yield a wrong prefix.
            raise ValueError('position word %r not found in left_sentence' % position_word)

        # Exclusive end offset of the position word inside left_sentence
        end = start + len(position_word)
        # Keep a directly following closing mark attached to the result.
        if end < len(left_sentence) and left_sentence[end] in '》”’':
            end += 1
        # Every path now returns a value, including the case where the word is
        # followed by ordinary text.
        return left_sentence[:end]

class Dependency_Parser:
    '''Empty placeholder class; it defines no attributes or methods yet.'''

if __name__ == "__main__":

    # Pre-segmented inputs: two "left" variants checked against one "right" phrase.
    right_segment = ['上海市', '国家级', '科研', '重大', '项目']
    left_segment = ['市级', '科技', '重大', '专项', '上海市']
    left_segment2 = ['上海市', '市级', '科技', '重大', '项目']

    # Raw sentences matching the segments above (right_sentence is defined
    # for reference only; nothing below reads it).
    left_sentence = '《市级科技重大专项上海市》黄埔区'
    left_sentence2 = '上海市市级科技重大项目'
    right_sentence = '上海市国家级科研重大项目'

    # Share of the right phrase's words that also occur in the first left phrase.
    similarity = Lexical_analysis.inclusion_rate(left_segment, right_segment)
    print(similarity)

    # Footnote search string for each left variant, printed in order.
    demo_cases = (
        (left_segment, left_sentence),
        (left_segment2, left_sentence2),
    )
    for demo_segment, demo_sentence in demo_cases:
        search_str = Lexical_analysis.get_search_str(demo_segment, right_segment, demo_sentence)
        print(search_str)

0.4
《市级科技重大专项上海市》
上海市市级科技重大项目
