In [34]:
import time
import re

import pandas as pd 
import MeCab
from wordcloud import WordCloud

# Font file handed to WordCloud as `font_path` in make_wc.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted on any machine where this font file is not at this location.
FONT_PATH = "/System/Library/Fonts/ヒラギノ丸ゴ ProN W4.ttc"
# Display name of the notebook owner; make_wc embeds it in the names of the
# PNG files it writes.
# NOTE(review): this is a personal name — consider redacting before sharing.
myNAME = "成瀬 大悟"

In [2]:
def parse_str(string, tagger=None):
    """Extract the nouns of ``string`` as one space-separated string.

    The text is analysed with MeCab and every token whose part of speech is
    ``名詞`` (noun) contributes its base form (原形) to the result.

    Args:
        string: Text to analyse.
        tagger: Optional object with a ``parse(str) -> str`` method (for
            example an already constructed ``MeCab.Tagger``).  When omitted, a
            new tagger for the mecab-ipadic-neologd dictionary is created,
            which reloads the dictionary on every call; pass a shared tagger
            when calling this function repeatedly.

    Returns:
        The selected words joined by single spaces, or ``""`` when nothing
        was selected.
    """
    if tagger is None:
        tagger = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

    parsed = tagger.parse(string)
    if not parsed:
        return ""

    words = []
    for line in parsed.split('\n'):
        # Skip the "EOS" terminator, blank lines and anything that is not a
        # "surface<TAB>features" token line.
        if line in ("", "EOS") or '\t' not in line:
            continue

        # Split off the surface form first so that a comma inside the surface
        # cannot shift the feature columns.
        surface, feature_csv = line.split('\t', 1)
        features = feature_csv.split(',')  # 0: part of speech, 6: base form

        if features[0] != "名詞":
            continue

        # Tokens without a usable base form (too few features, or the "*"
        # placeholder) fall back to the surface form instead of raising
        # IndexError or adding a literal "*" to the output.
        base_form = features[6] if len(features) > 6 else "*"
        words.append(surface if base_form in ("", "*") else base_form)

    return " ".join(words)

In [3]:
def append_msg(talk_history, date, time, name, msg, flag):
    """Return a new frame consisting of ``talk_history`` plus one message row.

    ``talk_history`` itself is not modified.  ``DataFrame.append`` was removed
    in pandas 2.0, so the row is added with ``pd.concat`` instead.

    Args:
        talk_history: DataFrame with exactly five columns; the values are
            assigned to them in the order ``date, time, name, msg, flag``.
        date: Value for the first column.
        time: Value for the second column.
        name: Value for the third column.
        msg: Value for the fourth column.
        flag: Value for the fifth column.

    Returns:
        A DataFrame with the same columns, a fresh 0..n-1 index and the new
        row in last position.
    """
    row = pd.DataFrame([[date, time, name, msg, flag]],
                       columns=talk_history.columns)
    if talk_history.empty:
        # Nothing to keep from the old frame; returning the row directly also
        # avoids the FutureWarning newer pandas emits when an empty frame
        # takes part in a concat.
        return row
    return pd.concat([talk_history, row], ignore_index=True)

In [5]:
def make_wc(friend_name):
    """Render three word clouds for a conversation and save them as PNG files.

    One image is built from the owner's words, one from the friend's words and
    one from both combined.  The files are written to the current directory as
    ``{myNAME}_to_{friend_name}.png``, ``{friend_name}_to_{myNAME}.png`` and
    ``{friend_name}_and_{myNAME}.png``.

    Args:
        friend_name: Name of the conversation partner; used only in the output
            file names.
    """
    # NOTE(review): parse_text, my_texts and friend_texts are not defined in
    # the visible cells (only parse_str is).  They must already exist in the
    # kernel for this function to run — confirm where they come from.
    my_words = parse_text(my_texts)
    friend_words = parse_text(friend_texts)

    # Read the stop-word list with a context manager so the file is closed.
    with open("MySlothLib.txt", encoding="utf8") as stop_file:
        stop_words = {line.rstrip('\n') for line in stop_file}
    stop_words.update(["笑", "笑笑", "通話", "時間", "今日", "明日", "ん", "の", "する", "ある", "やる", "いい", "こと", "そう", "それ", "おれ", "なん", "俺", "オレ", "これ", "http", "https"])

    def build_cloud(text):
        # All three images share the same rendering settings.
        return WordCloud(max_font_size=100,
                         background_color="white",
                         stopwords=stop_words,
                         width=400, height=400,
                         font_path=FONT_PATH).generate(text)

    # Generate every cloud before writing anything, so a failure in one of
    # them leaves no partial set of files behind.
    my_wc = build_cloud(my_words)
    fr_wc = build_cloud(friend_words)
    # Join with a space: plain concatenation would fuse the last word of one
    # text with the first word of the other.
    all_wc = build_cloud(my_words + " " + friend_words)

    my_wc.to_file(f"{myNAME}_to_{friend_name}.png")
    fr_wc.to_file(f"{friend_name}_to_{myNAME}.png")
    all_wc.to_file(f"{friend_name}_and_{myNAME}.png")

In [120]:
def parse_talk_txt(talk_txt):
    """Parse the lines of an exported LINE talk file into a DataFrame.

    Args:
        talk_txt: Sequence of text lines.  Line 0 is returned as the title,
            line 1 as the save date, line 2 is skipped and everything from
            line 3 on is parsed as talk data.  Lines may or may not carry
            their trailing newline.

    Returns:
        A tuple ``(title, save_date, talk_history)``.  ``title`` and
        ``save_date`` are the first two input lines unchanged;
        ``talk_history`` is a DataFrame with the columns
        ``日付, 時刻, 発言者, 内容, flag`` and one row per message.

    Raises:
        IndexError: If ``talk_txt`` has fewer than two lines.
    """
    title = talk_txt[0]
    save_date = talk_txt[1]
    # Every pattern below expects a trailing newline.  Adding a missing one
    # keeps a final line without "\n" from being rejected.
    talk_data = [line if line.endswith("\n") else line + "\n"
                 for line in talk_txt[3:]]

    # The further down a pattern is, the looser it is, so it must be tried later.
    p_date = re.compile(r"(\d{4}\/\d{2}\/\d{2}\((月|火|水|木|金|土|日)\))\n")
    p_video = re.compile(r"(\d{2}:\d{2})\t([\w|\s]+)\t(\[動画\])\n")
    p_photo = re.compile(r"(\d{2}:\d{2})\t([\w|\s]+)\t(\[写真\])\n")
    p_stamp = re.compile(r"(\d{2}:\d{2})\t([\w|\s]+)\t(\[スタンプ\])\n")
    p_msg = re.compile(r'(\d{2}:\d{2})\t([\w|\s]+)\t"?(.+)\n')
    p_deleted = re.compile(r".*メッセージの送信を取り消しました$")
    p_textonly = re.compile(r'(.+)\n')
    p_br  = re.compile(r"\n")

    # flag
    # -1 : saved data
    # 10 : talk message
    # 11 : deleted message
    # 20 : stamp
    # 21 : photo
    # 22 : video
    # 30 : call
    # 31 : missed call
    # 32 : canceled call
    # 33 : no answer call
    # 50 : system message unsent
    # 60 : file
    # 70 : create and add album
    # 71 : changed the name of the album
    # 72 : deleted the album
    #
    # Only -1, 10, 20, 21 and 22 are actually assigned by this function.

    # Placeholder lines have the same layout as an ordinary message, so the
    # specific patterns are tried before the generic one.
    typed_patterns = ((p_stamp, 20), (p_photo, 21), (p_video, 22), (p_msg, 10))

    columns = ["日付", "時刻", "発言者", "内容", "flag"]
    # Rows are collected here and turned into a DataFrame once at the end,
    # rather than growing the frame one row at a time.
    records = []
    date = hhmm = name = msg = ""
    flag = -1  # -1 means "no message pending"
    max_i = len(talk_data)

    for i, line in enumerate(talk_data):
        date_match = p_date.match(line)
        if date_match:
            # A date header ends the pending message; it still belongs to the
            # previous date, so flush before updating `date`.
            if flag != -1:
                records.append((date, hhmm, name, msg, flag))
                flag = -1
            date = date_match.group(1)
        elif p_msg.match(line):
            # A new "time<TAB>name<TAB>text" line ends the pending message.
            if flag != -1:
                records.append((date, hhmm, name, msg, flag))
                flag = -1
            for pattern, kind in typed_patterns:
                typed_match = pattern.match(line)
                if typed_match:
                    flag = kind
                    hhmm, name, msg = typed_match.groups()
                    break
        elif p_deleted.match(line):
            # "Message unsent" notices are ignored.
            pass
        elif p_textonly.match(line):
            # Continuation line of a multi-line message; drop the closing quote.
            msg += '\n'
            msg += p_textonly.match(line).group(1).rstrip('"')
        elif p_br.match(line):
            # An empty line belongs to the message unless it is the separator
            # in front of the next date header or the very last line.
            if i + 1 < max_i and not p_date.match(talk_data[i + 1]):
                msg += "\n"
        else:
            print("\n   exception occurs!!")
            print(f"line :{i}")
            print(line)

    # Flush the message still pending when the input ends; without this the
    # last message of every file would be lost.
    if flag != -1:
        records.append((date, hhmm, name, msg, flag))

    talk_history = pd.DataFrame(records, columns=columns)
    return title, save_date, talk_history

In [None]:
def main(friend_index=7):
    """Load one friend's exported talk file, parse it and show the first rows.

    Args:
        friend_index: Zero-based line number in ``./名前.txt`` of the friend
            whose talk file is loaded.  Defaults to 7, the value that used to
            be hard-coded.

    Raises:
        IndexError: If ``./名前.txt`` has no line at ``friend_index``.
    """
    with open("./名前.txt", encoding="UTF-8") as f:
        names = f.read().splitlines()
    friend_name = names[friend_index]

    with open(f"./トーク/[LINE] {friend_name}とのトーク.txt", encoding="UTF-8") as f:
        talk_txt = f.readlines()

    title, save_date, talk_history = parse_talk_txt(talk_txt)

    # These display limits are set globally and stay in effect after main()
    # returns.
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)

    print(f"title: {title}")
    print(f"save date: {save_date}")
    display(talk_history[0:50])

# Time the whole run with the monotonic high-resolution clock, which system
# clock adjustments cannot distort.  `start` and `end` are perf_counter
# readings, not epoch timestamps; only their difference is meaningful.
start = time.perf_counter()
main()
end = time.perf_counter()
print(end-start, '秒')