In [22]:
!pip install jieba



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [2]:
import numpy as np
import json
import jieba

# Load word vectors

In [124]:
# functions to load the vectors
import io

def load_vectors(fname, limit=50):
    """Load word vectors from a text file into a dict.

    The first line of the file must be a header holding two integers; every
    following line is ``token v1 v2 ...`` separated by single spaces.

    :param fname: Path of the vector file.
    :param limit: Maximum number of vector lines to read. ``None`` reads the
                  whole file.
    :return: Dict mapping each token to a 1-D float ``np.array``, in file order.
    :raises ValueError: If the header line is not exactly two integers.
    """
    data = {}
    # The context manager guarantees the handle is closed, even on a parse error.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Parse the header only to validate the format and skip past it.
        _n_words, _dim = map(int, fin.readline().split())
        for line_no, line in enumerate(fin):
            # Test `limit` itself so that limit=None means "no limit".
            if limit is not None and line_no >= limit:
                break
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array([float(value) for value in tokens[1:]])
    return data

In [8]:
# Sanity check: read only the first two vectors and display the resulting dict.
load_vectors('../data/cc.zh.300.vec', limit=2)

{'，': array([ 8.000e-03,  3.360e-02,  5.720e-01, -1.105e-01, -2.000e-02,
         1.950e-02, -1.290e-02, -2.580e-02,  2.490e-02, -1.000e-03,
         1.820e-02, -7.400e-03, -2.880e-02, -1.160e-02, -3.550e-02,
         4.820e-02,  4.110e-02,  8.000e-04,  2.610e-02, -3.870e-02,
         1.040e-02, -3.500e-03,  7.150e-02, -1.000e-04, -3.970e-02,
        -3.530e-02,  5.460e-02, -4.620e-02, -4.390e-02,  6.210e-02,
        -2.760e-02,  1.510e-02, -5.280e-02, -1.600e-03,  7.400e-03,
        -3.120e-02,  3.200e-03, -8.430e-02, -6.980e-02, -3.090e-02,
         1.780e-02,  8.100e-03, -2.030e-02,  1.040e-02, -1.739e-01,
        -8.100e-03, -9.520e-02, -1.180e-02,  2.110e-02,  2.140e-02,
        -3.240e-02,  8.500e-03, -1.140e-02, -4.785e-01, -2.160e-02,
         4.690e-02, -2.600e-03,  1.520e-02, -4.380e-02,  5.540e-02,
        -4.410e-02, -2.560e-02, -8.400e-03, -1.210e-02, -3.720e-02,
        -1.090e-02,  7.040e-02,  2.080e-02, -4.570e-02, -2.520e-02,
        -1.150e-02,  4.140e-02,  7.800e-03,

In [19]:
# take a look at the vectors, the rows are ordered by frequency
# vectors = load_vectors('../data/cc.zh.300.vec', limit=5)

# dataset

In [11]:
import json

In [190]:
# Parse the first 50 lines of the training file as JSON records.
with open('../data/train_with_summ.txt', 'r', encoding='utf8') as f:
    # zip() stops after 50 lines or at end of file, whichever comes first, so a
    # short file no longer crashes on json.loads('').
    for _, raw in zip(range(50), f):
        raw = raw.strip()
        if not raw:
            continue  # tolerate blank lines instead of failing to decode them
        line = json.loads(raw)
#         print(preprocess_article(line).__len__())

In [171]:
import re

# Parenthesised run of 2-20 characters sitting at the very end of the text.
reporter_at_end = r'\(.{2,20}\)$'

def preprocess_article(s, 
                       min_paragraph_length=40,
                       min_sentence_length=30):
    """Split an article into a flat list of sufficiently long sentences.

    :param s: Article text, or a dict holding the text under the 'article' key.
    :param min_paragraph_length: Paragraphs must be strictly longer than this
                                 (in characters) to be kept.
    :param min_sentence_length: Sentences must be strictly longer than this
                                (in characters) to be kept.
    :return: List of sentences in document order, without the '。' delimiter.
    """
    text = s['article'] if isinstance(s, dict) else s

    # Drop a trailing parenthesised note matched by `reporter_at_end`.
    text = re.sub(reporter_at_end, '', text)

    sentences = []
    for paragraph in text.split('<Paragraph>'):
        if len(paragraph) <= min_paragraph_length:
            continue
        for sentence in paragraph.split('。'):
            if len(sentence) > min_sentence_length:
                sentences.append(sentence)
    return sentences

# embedding

In [125]:
# Load the first VOCAB_SIZE entries of the word-vector file into the
# module-level `vectors` dict (token -> vector) that generate_summary reads.
VOCAB_SIZE = 10000

vectors = load_vectors('../data/cc.zh.300.vec', limit=VOCAB_SIZE)

In [180]:
from typing import List, Dict
import numpy as np

def get_embedding_matrix(sents: List[str], vectors: Dict[str, np.array], 
                         min_match_rate=0.6,
                         agg='average'):
    """Embed each sentence by aggregating the vectors of its known tokens.

    Sentences are tokenised with ``jieba.cut_for_search``. A sentence is kept
    only if the fraction of its tokens found in ``vectors`` is strictly greater
    than ``min_match_rate``; sentences with no tokens or no matching tokens are
    always skipped.

    :param sents: Sentences to embed.
    :param vectors: Mapping from token to its word vector.
    :param min_match_rate: Exclusive lower bound on the matched-token fraction.
    :param agg: How token vectors are combined along the token axis:
                'average'/'avg'/'mean', 'add'/'sum', 'max' or 'min'.
    :return: Tuple ``(embeddings, match_rate, sent_index)`` of equal-length lists:
             one aggregated vector of shape (embedding_size,) per kept sentence,
             its match rate, and its index in ``sents``.
    :raises ValueError: If ``agg`` is not one of the supported names. This is
                        checked up front, before any sentence is processed.
    """
    aggregators = {
        'average': np.average, 'avg': np.average, 'mean': np.average,
        'add': np.sum, 'sum': np.sum,
        'max': np.max,
        'min': np.min,
    }
    # Exact-name lookup: the previous `agg in ('max')` was a substring test on
    # the string 'max', so values such as 'm' or '' were silently accepted.
    if agg not in aggregators:
        raise ValueError('agg method not supported')
    aggregate = aggregators[agg]

    match_rate = list()
    embeddings = list()
    sent_index = list()

    for idx, sent in enumerate(sents):
        tokens = list(jieba.cut_for_search(sent))
        matched = [vectors[token] for token in tokens if token in vectors]

        # An empty sentence would divide by zero, and a sentence without any
        # known token has nothing to aggregate.
        if not tokens or not matched:
            continue

        mr = len(matched) * 1.0 / len(tokens)   # match rate of the current sentence
        if mr > min_match_rate:
            match_rate.append(mr)
            sent_index.append(idx)
            embeddings.append(aggregate(matched, 0))

    return embeddings, match_rate, sent_index

In [269]:
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

In [273]:
def generate_summary(article: str, n_topic=3, min_len=20, random_state=0):
    """Build an extractive summary by clustering sentence embeddings.

    The article is split into sentences, each sentence is embedded as the
    average of its word vectors (from the module-level ``vectors``), the
    standardised embeddings are clustered with K-Means, and the sentence
    closest to each cluster centroid is selected.

    :param article: Article text (anything accepted by ``preprocess_article``).
    :param n_topic: Number of topics in the article. For each topic, the sentence
                    with the closest distance to the topic centroid is selected.
    :param min_len: Minimum number of sentences in the article in order to generate a summary.
    :param random_state: Seed passed to K-Means so that repeated runs give the
                         same summary; pass ``None` for unseeded behaviour.
    :return: The selected sentences in document order, each terminated by '。',
             or ``None`` when the article has too few usable sentences.
    """
    sents = preprocess_article(article)
    if len(sents) < min_len:
        return None

    embeddings, _, sent_index = get_embedding_matrix(sents, vectors, agg='average')
    # get_embedding_matrix drops sentences with a low match rate, so the number
    # of embeddings can be smaller than len(sents) -- even zero. K-Means needs at
    # least n_topic samples and stacking an empty list fails, so bail out here.
    if len(embeddings) < max(n_topic, 1):
        return None

    embeddings = np.vstack(embeddings)   # (n sentences, embedding size)
    embeddings = StandardScaler().fit_transform(embeddings)

    # find the centroids
    KM = KMeans(n_clusters=n_topic, random_state=random_state)
    KM.fit(embeddings)

    centroids = KM.cluster_centers_   # (n centroids, embedding size)

    # for each centroid, find the sentence with the closest distance; a set keeps
    # a sentence from being repeated if two centroids share a nearest sentence
    chosen = set()
    for centroid in centroids:
        nearest = np.linalg.norm(embeddings - centroid, ord=2, axis=1).argmin()
        chosen.add(sent_index[nearest])

    summary = '。'.join(sents[idx] for idx in sorted(chosen))+'。'
    return summary

# Get some samples

In [284]:
import pandas as pd

# Generate summaries for the first 50 training records and pair each one with
# the reference summary stored in the record.
with open('../data/train_with_summ.txt', 'r', encoding='utf8') as f:
    result = list()
    # zip() stops after 50 lines or at end of file, whichever comes first, so a
    # short file no longer crashes on json.loads('').
    for _, raw in zip(range(50), f):
        raw = raw.strip()
        if not raw:
            continue  # tolerate blank lines instead of failing to decode them
        line = json.loads(raw)
        actual_summary = line['summarization']
        gen_summary = generate_summary(line['article'], n_topic=2, min_len=25)
        # generate_summary returns None for articles it cannot summarise.
        if gen_summary is not None:
            result.append((gen_summary, actual_summary))
    

In [286]:
# Tabulate generated vs. reference summaries. Passing `columns=` to the
# constructor also works when no summary was generated (empty `result`), where
# assigning two column names afterwards would raise a length-mismatch error.
result = pd.DataFrame(result, columns=['generated', 'actual'])
result

Unnamed: 0,generated,actual
0,对此，出版《计算机应用基础》的高等教育出版社表示，教材的编写错误率一般控制在万分之一，如果真...,广州一教师因挑出教科书68处错误受处分 学校称其混淆是非
1,凌晨3时30分许，杨某某在张某的陪同下随李某某等人先后来到海淀区金源时代购物中心的金鼎轩餐厅...,法院：李天一案判定李10年刑期属从轻处罚；李律师称要上诉：不能以口供定罪
2,东江科技一名参与原种场职工安置工作的前员工李双告诉《路标》君，该公司很早就关注原种场了，通过...,曲婉婷母亲张明杰负责国企改制时，涉嫌以6160万价格贱卖23亿元国有资产；曾因职工不同意改制...
3,湘财证券的一位医药行业分析师向21世纪网表示：“可能还是市场对于乙肝疫苗的恐慌心理造成股价的...,康泰生物已全面停产 掌舵人此前已举家移民加拿大
4,工作两年却相当于上了5年班，出意外到非定点医院治疗遭遇报销难题，老板“溜”了工人能否讨回工资...,约多上3年班，起诉获赔4.8万；公司合同实行标准工时制度，但近两年时间内，177天休息日都在...
5,老人已经九十多岁，听力衰退，但识得字，于是黄海波举着一块白色的写字板，在上面写下要问的问题：...,嫖娼案发生后的一年零两个月，曾经的“国民女婿”黄海波在做什么
6,公司一位姓程负责人指着一名40多岁的老外向其介绍，对方是海外买家，而且已看中丁先生其中的一幅...,艺术品公司设“雅局”套钱：70元画估价500万，涉事公司107家；谎称能送往港澳台拍卖，聘老...


In [289]:
# Copy the comparison table to the system clipboard.
result.to_clipboard()