## Jaccard similarity

In [97]:
from ltp import LTP
# One LTP pipeline instance, reused for both segmentation calls below.
ltp = LTP()

sentence1 = '上海市市级科技重大专项》'
sentence2 = '上海市国家级科研重大项目'

# ltp.seg takes a batch (list) of sentences; [0][0] picks the token list of the
# single sentence that was passed in.
segment1, segment2 = [ltp.seg([sentence])[0][0] for sentence in (sentence1, sentence2)]
print(segment1, segment2, sep='\n')

['上海市', '市级', '科技', '重大', '专项', '》']
['上海市', '国家级', '科研', '重大', '项目']


In [3]:
def jaccard(text1, text2):
    """Return the Jaccard similarity of two token collections.

    The similarity is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets
    of distinct items in ``text1`` and ``text2``.

    Parameters
    ----------
    text1, text2 : iterable of hashable
        Token sequences (e.g. lists of words). Duplicates are ignored.

    Returns
    -------
    float
        A value in ``[0, 1]``. When both inputs are empty the union is empty
        and ``0.0`` is returned instead of dividing by zero.
    """
    # Use the arguments, not module-level variables, so the function works for
    # any pair of inputs.
    tokens1, tokens2 = set(text1), set(text2)
    union = tokens1 | tokens2
    if not union:
        return 0.0
    return len(tokens1 & tokens2) / len(union)

# Hand-written token lists: the tokens printed by the segmentation cell above,
# without the trailing '》' token.
segment1 = ['上海市', '市级', '科技', '重大', '专项']
segment2 = ['上海市','国家级', '科研', '重大', '项目']
# 2 shared tokens ('上海市', '重大') out of 8 distinct tokens -> 0.25
jaccard(segment1, segment2)

0.25

In [122]:
import string
from zhon.hanzi import punctuation
import re
from ltp import LTP
# Module-level LTP pipeline; Custom_segment below reads this global directly.
ltp = LTP()
# Two ways the same company is written in the documents being compared.
full_name = '江苏鸿基节能新技术股份有限公司'
short_name = '鸿基节能'

# Register both names as custom words with the segmenter.
# NOTE(review): presumably this keeps each name as a single token (the recorded
# output further down shows the full name as one token) — confirm against the
# LTP add_words documentation, including the meaning of max_window.
ltp.add_words(words=[full_name, short_name], max_window=len(full_name))

class Custom_segment:
    """Sentence splitter and tokenizer built on the module-level ``ltp`` pipeline.

    Calling an instance on a text returns the split sentences together with
    one filtered token list per sentence. Filtering drops punctuation tokens
    first and stop words second.
    """

    def __init__(self):
        # Read once at construction; every del_stopwords call reuses this list.
        self.stopwords = self.get_stopwords()

    def get_stopwords(self):
        """Return the lines of the stop-word file, each stripped of whitespace."""
        with open('../text_process/baidu_stopwords.txt','r', encoding='utf-8') as f:
            return [line.strip() for line in f.readlines()]

    def cut_sentence(self, text):
        """Split ``text`` into sentences with ``ltp.sent_split``."""
        return ltp.sent_split([text])

    def tokenizer(self, sent):
        """Segment one sentence and return its tokens as a list, with
        punctuation and stop words removed (in that order)."""
        raw_tokens = ltp.seg([sent])[0][0]
        without_punctuation = self.remove_punctuation(raw_tokens)
        return list(self.del_stopwords(without_punctuation))

    def remove_punctuation(self, words):
        """Return a generator over ``words`` that skips every token found in
        ``zhon.hanzi.punctuation`` or ``string.punctuation``."""
        return (
            token
            for token in words
            if not (token in punctuation or token in string.punctuation)
        )

    def del_stopwords(self, words):
        """Return a generator over ``words`` that skips tokens listed in
        ``self.stopwords``."""
        return (token for token in words if token not in self.stopwords)

    def __call__(self, text):
        """Return ``(sentences, token_lists)`` for ``text``; ``token_lists[i]``
        holds the filtered tokens of ``sentences[i]``."""
        sentences = self.cut_sentence(text)
        return sentences, [self.tokenizer(sentence) for sentence in sentences]


if __name__=="__main__":
    # Demo: split a three-sentence text and tokenize every sentence.
    text = "2021年他叫汤姆去拿《外衣》。汤姆生病了。他去了医院的住院部。"
    # NOTE: this instance is reused by the inclusion-rate demo further down.
    custom_segment = Custom_segment()
    sents, segments = custom_segment(text)
    print(f'sents:{sents}, segments:{segments}')
    # The tokenizer can also be called directly on a single sentence.
    segments = custom_segment.tokenizer('2021年他叫汤姆去拿《外衣》。')
    print(segments)

sents:['2021年他叫汤姆去拿《外衣》。', '汤姆生病了。', '他去了医院的住院部。'], segments:[['2021年', '汤姆', '去', '外衣'], ['汤姆', '生病'], ['去', '医院', '住院部']]
['2021年', '汤姆', '去', '外衣']


## Inclusion rate: how much of text 2 is contained in text 1

In [120]:
class Lexical_analysis:
    """Token-overlap helpers for matching one segmented text against another."""

    @staticmethod
    def inclusion_rate(left_segment, right_segment):
        '''
        Calculate which fraction of the distinct items of right_segment
        also occur in left_segment.

        Parameters
        ----------
        left_segment, right_segment: iterables of hashable items
            Normally token lists. Passing plain strings compares them
            character by character.

        Returns
        ----------
        similarity: float
            len(left & right) / len(right) over distinct items; 0.0 when
            right_segment is empty (instead of dividing by zero).
        '''
        right_items = set(right_segment)
        if not right_items:
            return 0.0
        return len(set(left_segment) & right_items) / len(right_items)

    @staticmethod
    def get_search_str(left_segment, right_segment, left_sentence):
        '''
        Find the search string used to add a footnote in left_sentence.

        The result is always a prefix of left_sentence. If a shared token is
        directly followed by '》' or '）', the prefix ends right after that
        bracket (the last shared token with such a match wins). Otherwise the
        prefix ends after the last occurrence of the last shared token, and
        includes one directly following closing mark out of '》”’）'.

        Parameters
        ----------
        left_segment: list of tokens of left_sentence
        right_segment: list of tokens of the text being matched
        left_sentence: the raw sentence the tokens in left_segment came from

        Returns
        ----------
        search_str: string
            '' when the two segments share no token, or when the last shared
            token does not occur in left_sentence.
        '''
        # Shared tokens, kept in left_segment order.
        intersection = [seg for seg in left_segment if seg in right_segment]
        # Trace of the shared tokens (it appears in the recorded notebook output).
        print(intersection)
        if not intersection:
            # Nothing to anchor on; the fallback below would index an empty list.
            return ''

        search_str = None
        for word in intersection:
            # '》' takes precedence over '）' for the same word.
            for closer in '》）':
                position = left_sentence.find(word + closer)
                if position != -1:
                    # Cut right after the closing bracket.
                    search_str = left_sentence[:position + len(word) + 1]
                    break
        if search_str is not None:
            return search_str

        # Fallback: anchor on the last shared token.
        position_word = intersection[-1]
        start = left_sentence.rfind(position_word)
        if start == -1:
            # The token is not part of this sentence, so no prefix can be cut.
            return ''
        end = start + len(position_word)
        # Keep a closing quote/bracket that directly follows the token.
        if end < len(left_sentence) and left_sentence[end] in '》”’）':
            end += 1
        return left_sentence[:end]

class Dependency_Parser:
    """Placeholder class; no behaviour is implemented yet."""
    pass

if __name__=="__main__":

    # Demo inputs: a body sentence (note the trailing '》') and a heading.
    left_sentence = '本项目建设单位和实施主体为江苏鸿基节能新技术股份有限公司，不涉及选址和用地情况》'
    right_sentence= '江苏鸿基节能新技术股份有限公司诉讼情况'


    # custom_segment is the Custom_segment instance created in the earlier demo cell.
    left_segment = custom_segment.tokenizer(left_sentence)
    right_segment =custom_segment.tokenizer(right_sentence)
    print(left_segment)
    print(right_segment)

    # Raw strings are passed here, so set() splits them into characters:
    # this is a character-level inclusion rate.
    similarity = Lexical_analysis.inclusion_rate(left_sentence, right_sentence)
    print(similarity)

    # Token-level inclusion rate on the segmented inputs.
    similarity = Lexical_analysis.inclusion_rate(left_segment, right_segment)
    print(similarity)

    # Prefix of left_sentence that the footnote should be attached to.
    search_str = Lexical_analysis.get_search_str(left_segment, right_segment, left_sentence)
    print(search_str)


['项目', '建设', '单位', '实施', '主体', '江苏鸿基节能新技术股份有限公司', '不', '涉及', '选址', '情况']
['江苏鸿基节能新技术股份有限公司', '诉讼', '情况']
0.8947368421052632
0.6666666666666666
['江苏鸿基节能新技术股份有限公司', '情况']
本项目建设单位和实施主体为江苏鸿基节能新技术股份有限公司，不涉及选址和用地情况》


In [82]:
from itertools import permutations

full_name = '江苏鸿基节能新技术股份有限公司'
short_name = '鸿基节能'

# Every ordered pair of the three ways the company is referred to.
# Stored as `name_pairs` rather than `iter`, so the builtin iter() is not
# shadowed for the rest of the session, and materialised once so the pairs can
# be inspected more than once (a permutations object is exhausted after one pass).
name_pairs = list(permutations(['发行人', full_name, short_name], 2))
name_pairs

[('发行人', '江苏鸿基节能新技术股份有限公司'),
 ('发行人', '鸿基节能'),
 ('江苏鸿基节能新技术股份有限公司', '发行人'),
 ('江苏鸿基节能新技术股份有限公司', '鸿基节能'),
 ('鸿基节能', '发行人'),
 ('鸿基节能', '江苏鸿基节能新技术股份有限公司')]

In [91]:

def replace_company_name(left_sentence, right_sentence):
    """Rewrite the company reference in right_sentence to the form used in left_sentence.

    Three aliases are considered: the literal '发行人' and the module-level
    ``full_name`` and ``short_name``. The first alias (in that order) found in
    right_sentence is replaced by the first *other* alias (same order) found in
    left_sentence. If no such pair exists, right_sentence is returned unchanged.

    When the short name is a substring of the full name, occurrences of the
    short name inside the full name are not counted as mentions of the short
    name and are never rewritten. Without this, a heading that already uses
    the full name would be shortened even though left_sentence also uses the
    full name.

    Parameters
    ----------
    left_sentence : str
        Sentence whose way of naming the company should be adopted.
    right_sentence : str
        Sentence to rewrite.

    Returns
    -------
    str
        right_sentence with the company alias replaced, or unchanged.
    """
    aliases = ('发行人', full_name, short_name)
    # True when a plain substring test for the short name would also hit the
    # full name.
    short_inside_full = (
        bool(full_name) and short_name != full_name and short_name in full_name
    )

    def mentions(sentence, alias):
        # Mask full-name occurrences before looking for the short name; the
        # placeholder keeps the surrounding text from being glued together.
        if short_inside_full and alias == short_name:
            sentence = sentence.replace(full_name, '\0')
        return alias in sentence

    def swap(sentence, source, target):
        # Replace the short name only outside full-name occurrences.
        if short_inside_full and source == short_name:
            pieces = sentence.split(full_name)
            return full_name.join(piece.replace(source, target) for piece in pieces)
        return sentence.replace(source, target)

    for source_index, source in enumerate(aliases):
        if not mentions(right_sentence, source):
            continue
        for target_index, target in enumerate(aliases):
            if target_index != source_index and mentions(left_sentence, target):
                return swap(right_sentence, source, target)
    return right_sentence

# Demo: the body sentence uses the full company name, the heading says '发行人',
# so the heading is rewritten to use the full name.
left_sentence = '本项目建设单位和实施主体为江苏鸿基节能新技术股份有限公司，不涉及选址和用地情况'
right_sentence= '发行人诉讼情况'
print(replace_company_name(left_sentence, right_sentence))

江苏鸿基节能新技术股份有限公司诉讼情况


In [8]:


# For every 《...》 span quoted in left_sentence that contains
# right_sentence_replace, add one footnote-insertion row to result_df.
# Iterating over an empty findall() result is a no-op, so no separate
# emptiness check is needed.
for special_str in re.findall('《.+?》', left_sentence):
    if right_sentence_replace in special_str:
        # NOTE(review): index(special_str)+1 is the first character *inside*
        # the 《...》 span, so this test is almost never true. It presumably was
        # meant to look at the character right after the span
        # (index + len(special_str)) — confirm the intent before changing it.
        if left_sentence[left_sentence.index(special_str)+1] in '》）':
            search_str = left_sentence[:left_sentence.index(special_str)+2]
        else:
            search_str = special_str
        index = right_sentences.index(right_sentence)
        insert_footnotes_sentence = right_sentence + '，' + contents_df.loc[index, 'dirIndex']
        new_row = pd.DataFrame({
            'paragraph_number': [paragraph_number],
            'search_str': [search_str],
            'similarity': [1.1],
            'insert_footnotes_sentence': [insert_footnotes_sentence],
        })
        # DataFrame.append was removed in pandas 2.0; pd.concat builds the
        # same frame (existing rows followed by the new row, indexes kept).
        result_df = pd.concat([result_df, new_row])


'《a'