In [1]:
# pip install sumy

from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# A single language setting drives the tokenizer, the stemmer and the
# stop-word list so the three always agree.
LANG = "chinese"

stemmer = Stemmer(LANG)
tokenizer = Tokenizer(LANG)

# `Summarizer` is the LexRank summarizer (see the import alias above); it is
# built around the stemmer and then given the stop words for LANG.
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words(LANG)

In [2]:
def get_summary(text, max_length=510):
    """Shorten ``text`` to roughly ``max_length`` characters.

    A text that already fits (``len(text) <= max_length``) is returned
    unchanged. Otherwise the module-level ``summarizer`` is asked for
    summaries of decreasing sentence count, starting with every sentence of
    the parsed document and stopping at five, and the first summary whose
    length is within 110% of ``max_length`` is returned. If none qualifies,
    the tail of the original text is returned instead.

    Args:
        text: The string to shorten.
        max_length: Target length in characters.

    Returns:
        ``text`` itself, a space-joined summary of at most
        ``1.1 * max_length`` characters, or a ``max_length``-character slice
        taken from the end of ``text``.
    """
    # A text of exactly max_length characters already fits, so it must not be
    # summarized or truncated.
    if len(text) <= max_length:
        return text

    document = PlaintextParser.from_string(text, tokenizer).document
    # Summaries may overshoot the target length by up to 10%.
    length_limit = 1.1 * max_length

    # Try the largest summary first and drop one sentence per attempt; the
    # range stops before 4, so summaries never have fewer than five sentences.
    for sentence_count in range(len(document.sentences), 4, -1):
        selected = summarizer(document, sentence_count)
        summary = ' '.join(str(sentence) for sentence in selected)
        if len(summary) <= length_limit:
            return summary

    # No candidate fit: keep the last max_length characters that precede the
    # final character of the text.
    # NOTE(review): the final character is dropped on purpose by the original
    # slice (presumably trailing punctuation or a newline) - confirm intent.
    return text[-(max_length + 1):-1]

In [3]:
import pandas as pd

# Read the test split and pull its `fact` column out as a plain Python list.
data = pd.read_csv('data/testA.csv')
facts = data['fact'].tolist()

In [4]:
# tqdm.auto picks the notebook or console progress bar automatically and does
# not emit the experimental warning that tqdm.autonotebook raises on import.
from tqdm.auto import tqdm

# Summarize every fact in order; tqdm wraps the list to report progress.
summ_facts = [get_summary(fact) for fact in tqdm(facts)]

  from tqdm.autonotebook import tqdm


  0%|          | 0/25001 [00:00<?, ?it/s]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.393 seconds.
Prefix dict has been built successfully.


### Save to CSV

In [5]:
# Store the summaries next to the facts they were generated from.
data['summary'] = summ_facts

# Sanity check: distribution of summary lengths in characters.
data['summary'].str.len().describe()

count    25001.000000
mean       484.982081
std         68.692515
min        177.000000
25%        445.000000
50%        508.000000
75%        539.000000
max        561.000000
Name: summary, dtype: float64

In [6]:
# Write the frame, including the new `summary` column, without the row index.
data.to_csv(path_or_buf='data/testA_summary.csv', index=False)