In [26]:
import re

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import datetime

import MeCab

import urllib.parse
import json

import time

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

In [27]:
class Gijiroku():
    """Meeting minutes assembled from a speaker-tagged transcript text file.

    The transcript is expected to contain speaker lines that start with
    ``参加者`` followed by space-separated fields (speaker number, then the
    start time). Every other line after the first speaker line is speech text.

    Attributes:
        date: Value given to the constructor (stored; not read by the methods here).
        name: Value given to the constructor (stored; not read by the methods here).
        paragraph_list: ``Paragraph`` objects in transcript order.
        keywords: Keyword strings collected by ``merge_wiki_words``.
        html: Full HTML page produced by ``render``.
    """

    def __init__(self, clova_file, name, date):
        """Store the metadata and load ``clova_file`` into ``paragraph_list``."""
        self.date = date
        self.name = name
        self.paragraph_list = []
        self.keywords = []
        self.html = ""
        self.read_clova_txt(clova_file)

    def read_clova_txt(self, clova_txt_path):
        """Parse the transcript at ``clova_txt_path`` into ``self.paragraph_list``.

        Everything before the first ``参加者`` line is treated as a header and
        skipped. Consecutive speaker lines of the same speaker are folded into
        one paragraph that keeps the first start time. The input file is only
        read; it is never rewritten.

        Raises:
            ValueError: if the file contains no usable speaker line.
        """
        with open(clova_txt_path) as f:
            raw_lines = f.readlines()

        current_key = None   # "参加者" + speaker number of the open paragraph
        paragraph = None     # paragraph currently being filled
        for raw in raw_lines:
            line = raw.rstrip()  # drop the trailing newline
            if re.match(r'^参加者', line):
                fields = line.split(' ')
                key = ''.join(fields[:2])
                if key == current_key:
                    # Same speaker keeps talking: ignore the repeated header.
                    continue
                if len(fields) >= 3:
                    if paragraph is not None:
                        self.paragraph_list.append(paragraph)
                    current_key = key
                    paragraph = Paragraph('参加者 ' + fields[1], fields[2], '')
                    continue
                # Fewer than three fields: not a real speaker header, so it
                # falls through and is handled like ordinary text.
            if paragraph is None:
                continue  # still inside the file header
            paragraph.text += line

        if paragraph is None:
            raise ValueError(
                "no speaker line starting with '参加者' found in " + repr(clova_txt_path)
            )
        self.paragraph_list.append(paragraph)

    def _drop_empty_paragraphs(self):
        """Remove, in place, every paragraph whose text is the empty string."""
        self.paragraph_list[:] = [p for p in self.paragraph_list if p.text != ""]

    def clenging(self):
        """Clean every paragraph's text, then drop paragraphs left empty."""
        for paragraph in self.paragraph_list:
            paragraph.text_clenging()
        self._drop_empty_paragraphs()

    def show_all(self):
        """Print speaker, start time and text of every paragraph."""
        for paragraph in self.paragraph_list:
            print(paragraph.speaker, paragraph.time)
            print(paragraph.text)

    def delete_aizuti(self, keep=4):
        """Drop the short interjections of the most frequent speaker.

        The speaker with the most paragraphs (the first one to reach the
        maximum count) is taken to be the facilitator. Only that speaker's
        ``keep`` longest paragraphs survive; ties keep the earlier paragraph.
        Other speakers are untouched.

        Args:
            keep: Number of the facilitator's paragraphs to retain. Defaults
                to 4, the value the original loop used (its comment said 3).
        """
        if not self.paragraph_list:
            return

        # Identify who speaks most often.
        counts = {}
        top_speaker, top_count = None, 0
        for paragraph in self.paragraph_list:
            n = counts.get(paragraph.speaker, 0) + 1
            counts[paragraph.speaker] = n
            if n > top_count:
                top_count = n
                top_speaker = paragraph.speaker

        own = [i for i, p in enumerate(self.paragraph_list) if p.speaker == top_speaker]
        # Stable sort: equal lengths stay in transcript order.
        longest_first = sorted(
            own, key=lambda i: len(self.paragraph_list[i].text), reverse=True
        )
        doomed = set(longest_first[max(keep, 0):])
        self.paragraph_list[:] = [
            p for i, p in enumerate(self.paragraph_list) if i not in doomed
        ]

    def text_merge(self):
        """Concatenate adjacent paragraphs of the same speaker into the first one."""
        merged = []
        for paragraph in self.paragraph_list:
            if merged and merged[-1].speaker == paragraph.speaker:
                merged[-1].text = merged[-1].text + paragraph.text
            else:
                merged.append(paragraph)
        self.paragraph_list[:] = merged

    def delete_less100(self):
        """Blank paragraphs of 100 characters or fewer, then drop the empty ones."""
        for paragraph in self.paragraph_list:
            paragraph.remove_less100()
        self._drop_empty_paragraphs()

    def summarize(self):
        """Replace every paragraph's text with its summary."""
        for paragraph in self.paragraph_list:
            paragraph.summarize()

    def _has_wikipedia_summary(self, word, timeout=10):
        """Return True if ja.wikipedia.org has a usable summary page for ``word``.

        A page is rejected when the request or decoding fails, when its
        ``type`` is ``disambiguation``, or when its ``extract`` is shorter
        than 7 characters.
        """
        # safe='' so that a '/' inside the title is percent-encoded too.
        request_url = ('https://ja.wikipedia.org/api/rest_v1/page/summary/'
                       + urllib.parse.quote(word, safe=''))
        try:
            with urlopen(Request(request_url), timeout=timeout) as res:
                payload = res.read()
            wiki = json.loads(payload.decode('utf-8'))
        except (OSError, ValueError):
            # OSError covers URLError/HTTPError and socket timeouts;
            # ValueError covers invalid UTF-8 and invalid JSON.
            return False
        if not isinstance(wiki, dict):
            return False
        if wiki.get('type') == 'disambiguation':
            return False
        return len(wiki.get('extract') or '') >= 7

    def merge_wiki_words(self):
        """Collect paragraph keywords and keep those with a Wikipedia summary.

        Each paragraph extracts its own keywords first. The union is checked
        word by word, in sorted order so the result is reproducible, and
        ``self.keywords`` is replaced by the list of accepted words.
        """
        for paragraph in self.paragraph_list:
            paragraph.get_wiki_words()
            self.keywords.extend(paragraph.keywords)

        checked_words = []
        for word in sorted(set(self.keywords)):
            if self._has_wikipedia_summary(word):
                checked_words.append(word)
            time.sleep(0.01)  # small pause between requests
        self.keywords = checked_words

    def _link_keywords(self, body):
        """Wrap each keyword occurrence in ``body`` with ``<span class="wiki">``.

        Replacement is done in one pass, longest keyword first, and only in
        the text between tags, so markup such as ``class="..."`` is never
        rewritten and a keyword inside an already wrapped one is not wrapped
        a second time.
        """
        words = sorted(sorted({w for w in self.keywords if w}), key=len, reverse=True)
        if not words:
            return body
        pattern = re.compile('|'.join(re.escape(w) for w in words))

        def wrap(match):
            return "<span class=\"wiki\">" + match.group(0) + "</span>"

        # With one capturing group, re.split alternates text / tag / text ...
        pieces = re.split(r'(<[^>]*>)', body)
        for i in range(0, len(pieces), 2):
            pieces[i] = pattern.sub(wrap, pieces[i])
        return ''.join(pieces)

    def render(self):
        """Build the HTML page, store it in ``self.html`` and write it to disk.

        The page heading and the output file name both use the current date
        (``self.date`` is not consulted). The file is written to
        ``../output/public/gijiroku/gijiroku(YYYY.MM.DD).html``.
        """
        for paragraph in self.paragraph_list:
            paragraph.render()
        # Runs after the per-paragraph HTML exists: it cannot change that
        # HTML, but it drops paragraphs whose text ends up empty.
        self.clenging()

        body = ''.join(paragraph.html for paragraph in self.paragraph_list)
        body = self._link_keywords(body)

        # One timestamp for both heading and file name.
        dt_now = datetime.datetime.now()
        date = dt_now.strftime('%Y年%m月%d日') + "の議事録"

        self.html = '''<!DOCTYPE html>
        <html lang="ja">
        <head>
            <meta charset="UTF-8">
            <title>議事録</title>
            <link rel="stylesheet" href="static/style.css">
            <link rel="stylesheet" href="static/nobata.css">
        </head>
            <body>
                <h1>{date}</h1>
                <h1>画像アップロード</h1>
                <form action="/upload" method="post" enctype="multipart/form-data" class="form-img">
                    <div id="drop-zone" style="border: 1px solid; padding: 30px; border-color: white;">
                        <p>ファイルをドラッグ＆ドロップもしくは</p>
                        <input type="file" name="file" id="file-input">
                    </div>
                    <h2>プレビュー</h2>
                    <div id="preview"></div>
                    <h2>アップロードした画像</h2>
                    <div id="uploaded"></div>
                    <input type="submit" style="margin-top: 50px">
                </form>
                <audio controls src="audio/output.mp3" id="audio"></audio>
                {body}
                <script src="https://cdn.jsdelivr.net/npm/axios/dist/axios.min.js"></script>
                <script src="static/wikipedia-preview.development.js"></script>
                <script src="static/file.js"></script>
                <script type="text/javascript">wikipediaPreview.init({{lang: 'ja'}});</script>
            </body>
        </html>'''.format(date=date, body=body)

        out_path = ("../output/public/gijiroku/gijiroku"
                    + dt_now.strftime('(%Y.%m.%d)') + '.html')
        with open(out_path, 'w', encoding='utf-8') as f:
            f.write(self.html)

# Create the module-level MeCab Tagger (configured with the "-Ochasen" option)
# that Paragraph.get_wiki_words reads through the global name `tokenizer`.
tokenizer = MeCab.Tagger("-Ochasen")
# NOTE(review): parsing an empty string right after construction is presumably
# the usual MeCab binding workaround that keeps node.surface readable in
# parseToNode — confirm before removing.
tokenizer.parse("")

class Paragraph:
    """One speaker turn of the transcript.

    Attributes:
        speaker: Speaker label.
        time: Start time as a colon-separated string such as ``"MM:SS"`` or
            ``"H:MM:SS"``.
        text: What was said; rewritten in place by the cleaning and
            summarising methods.
        keywords: Keyword strings found by ``get_wiki_words``.
        html: HTML fragment produced by ``render``.
    """

    def __init__(self, speaker, time, text):
        self.speaker = speaker
        self.time = time
        self.text = text
        self.keywords = []
        self.html = ""

    def text_clenging(self):
        """Normalise the spoken-style text in ``self.text`` in place.

        Steps, in order: turn spaces into ``、``; add ``。`` after common
        sentence endings; rewrite colloquial forms; delete filler words;
        collapse immediate repetitions; finally strip very short fragments
        left between punctuation marks.
        """
        text = self.text.replace(' ', '、')  # spaces become commas

        # Force a full stop after common sentence endings.
        for ending in ('です', 'ます', 'でした', 'ません', 'さい'):
            text = text.replace(ending, ending + '。')

        # Colloquial form -> written form.
        for spoken, written in (('っていうこと', 'こと'), ('っていう', 'という'),
                                ('ていう', 'という'), ('かなと', 'かと')):
            text = text.replace(spoken, written)

        # Filler words. 'えーと' must come before 'えー': regex alternation takes
        # the first alternative that matches, so the shorter one listed first
        # would leave a stray 'と' behind.
        # NOTE(review): 'っていうの' looks unreachable here because 'っていう' was
        # already rewritten to 'という' above; kept as harmless.
        text = re.sub(r'(えーと|えー|えっと|そうですね|まあ|じゃあ|なんか|ちょっと|あの|ということで|っていうの|んじゃないか|一応|とりあえず)', '', text)
        text = re.sub(r'ま([^\u3040-\u309F])', r'\1', text)  # 'ま' before a non-hiragana character
        text = re.sub(r'という([、。])', r'\1', text)         # dangling 'という' before punctuation
        text = re.sub(r'(.)(.)(.)\1\2\3', r'\1\2\3', text)   # a 3-character run said twice
        # A pair of CJK ideographs said twice.
        text = re.sub(r'([\u3400-\u9FFF\uF900-\uFAFF]|[\uD840-\uD87F][\uDC00-\uDFFF])([\u3400-\u9FFF\uF900-\uFAFF]|[\uD840-\uD87F][\uDC00-\uDFFF])\1\2', r'\1\2', text)

        # Repeat until stable: every pass that enters the loop shortens the text.
        while re.search(r'([ねえま][、。]|[、。].{0,2}[、。])', text):
            text = re.sub(r'([ねえま]、)', '、', text)      # trailing particle before a comma
            text = re.sub(r'([ねえま]。)', '。', text)      # trailing particle before a full stop

            text = re.sub(r'[、].{0,2}[、。]', '、', text)  # <=2 characters between punctuation
            text = re.sub(r'[。].{0,2}[、。]', '。', text)
            text = re.sub(r'^.{0,2}[、。]', '', text)       # <=2 characters at the very start

        self.text = text

    def remove_less100(self):
        """Blank the text when it is 100 characters or shorter."""
        if len(self.text) <= 100:
            self.text = ""

    @staticmethod
    def _is_wiki_candidate(surface):
        """Return True if a noun surface form is worth looking up on Wikipedia.

        Rejected: hiragana-only words, digit-only words, single characters,
        katakana words of two characters or fewer, words made only of
        hiragana/kanji, and words starting with 宮崎ゼミ / 宮崎 / ゼミ.
        """
        if re.fullmatch(r'[\u3040-\u309F]+|[0-9]+|.', surface) is not None:
            return False
        if re.fullmatch(r'[ァ-ヶ]{0,2}', surface) is not None:
            return False
        if re.fullmatch(r'[\u3040-\u309F\u2E80-\u2FDF\u3005-\u3007\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\U00020000-\U0002EBEF]+', surface) is not None:
            return False
        if re.match(r'(宮崎ゼミ|宮崎|ゼミ)', surface) is not None:
            return False
        return True

    def get_wiki_words(self):
        """Extract candidate keywords from ``self.text`` into ``self.keywords``.

        Walks the nodes produced by the module-level ``tokenizer`` and keeps
        the surface form of every node whose first feature field is 名詞 and
        that passes ``_is_wiki_candidate``. Duplicates are removed and the
        order of first appearance is kept, so the result is the same on
        every run.
        """
        found = {}  # dict used as an insertion-ordered set
        node = tokenizer.parseToNode(self.text)
        while node:
            if node.feature.split(",")[0] == u"名詞" and self._is_wiki_candidate(node.surface):
                found[node.surface] = None
            node = node.next
        self.keywords = list(found)

    def summarize(self):
        """Replace ``self.text`` with an LSA summary of about 25% of its sentences.

        The sentence count is ``round(n / 4)`` where ``n`` is the number of
        pieces obtained by splitting the text on ``。``.
        """
        LANGUAGE = "japanese"
        sentences_count = round(len(self.text.split('。')) / 4)  # keep 25%

        parser = PlaintextParser.from_string(self.text, Tokenizer(LANGUAGE))
        summarizer = Summarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)

        self.text = ''.join(
            str(sentence) for sentence in summarizer(parser.document, sentences_count)
        )

    @staticmethod
    def _time_to_seconds(stamp):
        """Convert ``"SS"``, ``"MM:SS"`` or ``"H:MM:SS"`` to a number of seconds.

        Raises:
            ValueError: if a field is not an integer.
        """
        seconds = 0
        for part in stamp.split(':'):
            seconds = seconds * 60 + int(part)
        return seconds

    def render(self):
        """Build the HTML fragment for this paragraph and store it in ``self.html``.

        The fragment holds the speaker as ``<h2>``, the first third of the
        keywords as ``<h3>``, and the text in a ``<div>`` whose ``time``
        attribute is the start time in seconds. Texts longer than 500
        characters additionally get the ``long`` CSS class.
        """
        sec = self._time_to_seconds(self.time)

        # Long texts are flagged so that they can be reviewed.
        css_class = "confirm_value long" if len(self.text) > 500 else "confirm_value"

        key_word = ' '.join(self.keywords[:round(len(self.keywords) / 3)])

        div = (
            '\n'
            '            <div class="{css}" time="{time}">\n'
            '                <p>{value}</p>\n'
            '            </div>'
        ).format(css=css_class, value=self.text, time=sec)

        self.html = (
            '\n'
            '        <h2>{speaker}</h2>\n'
            '        <h3>{key_word}</h3>{div}'
        ).format(speaker=self.speaker, key_word=key_word, div=div)

In [None]:
# NOTE: an unfinished `class S` stub used to be the only content of this cell.
# A bare `class S` is a SyntaxError, so the cell could never run and would
# break "Restart Kernel -> Run All". It is reduced to this comment until the
# intended class is actually written.

In [28]:
# Load the transcript, then run the clean-up stages strictly in this order.
gijiroku = Gijiroku('output.txt', '宮崎ゼミ', '10/28')

pipeline = (
    gijiroku.clenging,          # normalise text, drop empty paragraphs
    gijiroku.delete_aizuti,     # trim the most frequent speaker's interjections
    gijiroku.delete_aizuti,     # second pass, for whoever is most frequent now
    gijiroku.text_merge,        # join neighbouring paragraphs of one speaker
    gijiroku.delete_less100,    # drop paragraphs of 100 characters or fewer
    gijiroku.text_merge,
    gijiroku.clenging,
    gijiroku.summarize,         # replace each text with its summary
    gijiroku.delete_less100,
    gijiroku.text_merge,
    gijiroku.merge_wiki_words,  # collect keywords and check them on Wikipedia
)
for step in pipeline:
    step()

In [29]:
# Build the HTML page and write it to
# ../output/public/gijiroku/gijiroku(YYYY.MM.DD).html (date = today).
gijiroku.render()