In [288]:
import time
import re
import math

import pandas as pd 
import MeCab
from wordcloud import WordCloud

# Font file passed to WordCloud as ``font_path`` in make_wc.
# NOTE(review): looks like an absolute macOS system-font path - adjust on other machines.
FONT_PATH = "/System/Library/Fonts/ヒラギノ丸ゴ ProN W4.ttc"
# Name compared against the "発言者" (speaker) column to select the notebook
# owner's own messages; also recorded as the speaker of messages the owner unsent.
myNAME = "成瀬 大悟"

In [None]:
def parse_str(string):
    """Extract the nouns from ``string`` with MeCab.

    Args:
        string: Text to analyse.

    Returns:
        The nouns found in ``string`` joined by single spaces. The base form
        (field 7) is used when MeCab provides one; otherwise the surface form
        (field 0) is used so that unknown words are not turned into ``*``.
    """
    mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
    # The last two items of the split output are discarded
    # (presumably the trailing "EOS" marker and an empty string).
    lines = mecab.parse(string).split('\n')[0:-2]
    words = []

    for line in lines:
        # fields[0]: surface form, fields[1]: part of speech, fields[7]: base form
        fields = re.split('\t|,', line)
        if len(fields) < 2 or fields[1] != "名詞":
            continue
        base_form = fields[7] if len(fields) > 7 else "*"
        # "*" is the placeholder for a missing feature; fall back to the surface form.
        words.append(fields[0] if base_form == "*" else base_form)

    return " ".join(words)

In [226]:
def make_wc(talk_history, friend_name, num):
    """Render three word-cloud PNGs for one conversation.

    Only plain talk messages (flag == 10) are used. Files are written to
    ``./png/`` with ``num`` zero-padded to two digits as a prefix:
    the owner's words, the friend's words, and both combined.

    Args:
        talk_history: DataFrame with at least the columns "flag", "発言者"
            and "内容".
        friend_name: Speaker name of the conversation partner.
        num: Number used as the file-name prefix.
    """
    def _noun_text(speaker):
        # A boolean mask is used instead of a string-built query so that
        # names containing quote characters cannot break the expression.
        is_talk = (talk_history["flag"] == 10) & (talk_history["発言者"] == speaker)
        return parse_str("\n".join(talk_history.loc[is_talk, "内容"]))

    my_words = _noun_text(myNAME)
    friend_words = _noun_text(friend_name)

    # Read the stop-word list; the context manager closes the file handle.
    with open("MySlothLib.txt", encoding="utf8") as stop_file:
        stop_words = [w.rstrip('\n') for w in stop_file]
    mylist = ["URL", "笑", "笑笑", "通話", "時間", "今日", "明日", "ん", "の", "する", "ある", "やる", "いい", "こと", "そう", "それ", "おれ", "なん", "俺", "オレ", "これ", "http", "https"]
    stop_words.extend(mylist)

    def _cloud(text):
        # One shared configuration for all three clouds.
        return WordCloud(max_font_size=100,
                         background_color="white",
                         stopwords=set(stop_words),
                         width=400, height=400,
                         font_path=FONT_PATH).generate(text)

    my_wc = _cloud(my_words)
    fr_wc = _cloud(friend_words)
    # Join with a space so the last word of one speaker and the first word of
    # the other are not fused into a single token.
    all_wc = _cloud(my_words + " " + friend_words)

    prefix = f"./png/{str(num).zfill(2)}"
    my_wc.to_file(f"{prefix}{myNAME}_to_{friend_name}.png")
    fr_wc.to_file(f"{prefix}{friend_name}_to_{myNAME}.png")
    all_wc.to_file(f"{prefix}{friend_name}_and_{myNAME}.png")

In [307]:
def parse_talk_txt(talk_txt):
    """Parse the lines of a LINE talk-history export.

    Args:
        talk_txt: List of lines (each normally ending in a newline). Line 0 is
            taken as the title, line 1 as the save date, and everything from
            line 3 onward as talk data.

    Returns:
        A tuple ``(title, save_date, talk_history)`` where ``talk_history`` is
        a DataFrame with the columns "日付", "時刻", "発言者", "内容" and "flag".

    Lines that match no pattern are reported with ``print`` and skipped.
    """
    title = talk_txt[0].rstrip("\n")
    save_date = talk_txt[1].rstrip("\n")
    talk_data = talk_txt[3:]

    # Patterns further down are less strict, so they must be tried later.
    p_date = re.compile(r"(\d{4}\/\d{2}\/\d{2}\((月|火|水|木|金|土|日)\))\n")

    p_delete_msg1 = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)がメッセージの送信を取り消しました")  # unsent by the other party
    p_delete_msg2 = re.compile(r"(\d{2}:\d{2})\tメッセージの送信を取り消しました")  # unsent by the owner

    p_video = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(\[動画\])")
    p_photo = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(\[写真\])")
    p_stamp = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(\[スタンプ\])")
    p_address = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(\[連絡先\])")
    p_calls = re.compile(r"\d{2}:\d{2}\t[\w\s\.]+\t☎ ")
    p_call = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(☎ 通話時間 (\d\d?:){1,3}\d\d)")
    p_missed_call = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(☎ 不在着信)\n")
    p_canceled_call = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(☎ 通話をキャンセルしました)")
    p_no_answer_call = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(☎ 通話に応答がありませんでした)")
    p_invited_call = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(☎ グループ音声通話に招待されました。)")
    p_transfer_error = re.compile(r"(\d{2}:\d{2})\t([\w\s\.]+)\t(ⓘ このメッセージは、利用していた端末から移行されなかったため表示できません。)")
    p_msg = re.compile(r'(\d{2}:\d{2})\t([\w\s\.]+)\t"?(.+)')

    p_textonly = re.compile(r'(.+)\n')
    p_br = re.compile(r"\n")

    # flag legend
    # -1 : no pending entry
    # 10 : talk message
    # 11 : message unsent by the other party
    # 12 : message unsent by the owner
    # 20 : stamp
    # 21 : photo
    # 22 : video
    # 23 : contact (address)
    # 30 : call
    # 31 : missed call
    # 32 : canceled call
    # 33 : unanswered call
    # 34 : invitation to a group voice call
    # 50 : system message unsent
    # 51 : transfer error
    # 60 : file
    # 70 : create and add album
    # 71 : changed the name of the album
    # 72 : deleted the album

    # Typed messages are tried in this order before falling back to flag 10.
    media_patterns = [(p_stamp, 20), (p_photo, 21), (p_video, 22), (p_address, 23)]
    call_patterns = [
        (p_call, 30),
        (p_missed_call, 31),
        (p_canceled_call, 32),
        (p_no_answer_call, 33),
        (p_invited_call, 34),
    ]
    deleted_text = "メッセージの送信を取り消しました"

    def classify(msg_line):
        """Return ``(flag, match)`` for a line matching p_msg, or None for an unknown call line."""
        for pattern, code in media_patterns:
            found = pattern.match(msg_line)
            if found:
                return code, found
        if p_calls.match(msg_line):
            for pattern, code in call_patterns:
                found = pattern.match(msg_line)
                if found:
                    return code, found
            return None
        found = p_transfer_error.match(msg_line)
        if found:
            return 51, found
        return 10, p_msg.match(msg_line)

    records = []
    # "clock" holds the HH:MM string; it is not called "time" so the imported
    # time module is not shadowed.
    date = clock = name = msg = ""
    flag = -1
    total = len(talk_data)

    def flush():
        """Store the pending entry, if there is one, and reset the pending state."""
        nonlocal flag
        if flag != -1:
            records.append([date, clock, name, msg, flag])
            flag = -1

    for i, line in enumerate(talk_data):
        found = p_date.match(line)
        if found:
            flush()
            date = found.group(1)
            continue

        found = p_delete_msg1.match(line)
        if found:
            flush()
            flag = 11
            clock, name = found.group(1), found.group(2)
            msg = deleted_text
            continue

        found = p_delete_msg2.match(line)
        if found:
            flush()
            flag = 12
            clock = found.group(1)
            name = myNAME  # this line carries no speaker name
            msg = deleted_text
            continue

        if p_msg.match(line):
            flush()
            classified = classify(line)
            if classified is None:
                # A "☎ " line of a kind none of the call patterns recognises.
                print("error in calls")
                print(line)
            else:
                flag, found = classified
                clock, name, msg = found.group(1), found.group(2), found.group(3)
            continue

        found = p_textonly.match(line)
        if found:
            # Continuation line of a multi-line message; drop the closing quote.
            msg += '\n'
            msg += found.group(1).rstrip('"')
            continue

        if p_br.match(line):
            # A blank line inside a message is kept as a line break; a blank
            # line before a date header or at the very end of the data is
            # skipped. The bounds check prevents an IndexError on a trailing
            # blank line.
            if i + 1 < total and not p_date.match(talk_data[i + 1]):
                msg += "\n"
            continue

        print("\n   exception occurs in LINE parser!!")
        print(f"line :{i}")
        print(line)

    # Store the last entry. This append is unconditional, so when nothing is
    # pending at the end the final row carries flag -1.
    records.append([date, clock, name, msg, flag])

    talk_history = pd.DataFrame(records, columns=["日付", "時刻", "発言者", "内容", "flag"])

    return title, save_date, talk_history

In [308]:
def cleanse_txt(talk_txt):
    """Return a new list in which every URL in each line is replaced by ``[URL]``.

    Args:
        talk_txt: Iterable of text lines.

    Returns:
        A list with one cleansed string per input line, in the same order.
    """
    url_pattern = re.compile(r"https?:\/\/[\w:%#\$&\?\(\)~\.=\+\-\/@]+")
    return [url_pattern.sub('[URL]', row) for row in talk_txt]

In [309]:
def vocabulary_check(talk_history, fr_name):
    """Count distinct and total MeCab tokens for the owner and for a friend.

    Only plain talk messages (flag == 10) are counted.

    Args:
        talk_history: DataFrame with at least the columns "flag", "発言者"
            and "内容".
        fr_name: Speaker name of the conversation partner.

    Returns:
        A tuple ``(owner_distinct, friend_distinct, owner_total, friend_total)``
        of token counts based on surface forms.
    """
    mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

    def _surface_forms(speaker):
        # A boolean mask is used instead of a string-built query so that
        # names containing quote characters cannot break the expression.
        is_talk = (talk_history["flag"] == 10) & (talk_history["発言者"] == speaker)
        text = "\n".join(talk_history.loc[is_talk, "内容"])
        # The last two items of the split output are discarded
        # (presumably the trailing "EOS" marker and an empty string).
        token_lines = mecab.parse(text).split('\n')[0:-2]
        # The surface form is everything before the first tab; splitting on
        # the tab only keeps surface forms that themselves contain a comma.
        return [token_line.split('\t', 1)[0] for token_line in token_lines]

    my_words = _surface_forms(myNAME)
    fr_words = _surface_forms(fr_name)

    return len(set(my_words)), len(set(fr_words)), len(my_words), len(fr_words)

In [328]:
def print_voc_info(voc, total_num):
    """Print the vocabulary size and a size-normalised vocabulary score.

    The score is ``voc / log10(total_num)``. It is undefined when
    ``total_num`` is 1 (division by zero) or less (log of a non-positive
    number); ``nan`` is printed in that case instead of raising.

    Args:
        voc: Number of distinct words.
        total_num: Total number of words.
    """
    print(f"\t\t語彙数: {voc}")
    if total_num > 1:
        score = voc / math.log10(total_num)
    else:
        score = float("nan")
    print(f"\t\t語彙pt: {score} pt")

In [329]:
def main():
    """Parse every friend's talk export and print message counts.

    Friend names are read from ``./名前.txt`` (one per line; blank lines are
    skipped). For each name the file ``./トーク/[LINE] <name>とのトーク.txt``
    is read, cleansed, parsed and summarised.

    Returns:
        The DataFrame parsed for the last friend, or ``None`` when the name
        list is empty.
    """
    with open("./名前.txt", encoding="UTF-8") as f:
        friend_names = [name for name in f.read().splitlines() if name]

    # Defined up front so the return below works for an empty name list.
    talk_history = None

    for i, fr_name in enumerate(friend_names):
        with open(f"./トーク/[LINE] {fr_name}とのトーク.txt", encoding="UTF-8") as f:
            talk_txt = f.readlines()

        # Cleanse the talk history (replace URLs).
        cleansed_talk_txt = cleanse_txt(talk_txt)
        # Parse the talk history.
        title, save_date, talk_history = parse_talk_txt(cleansed_talk_txt)
        # Optional: build the word clouds.
#         make_wc(talk_history, fr_name, i)
        # Measure the vocabulary size (only reported when the optional
        # print_voc_info calls below are enabled).
        my_voc, fr_voc, total_my_num, total_fr_num = vocabulary_check(talk_history, fr_name)

        is_talk = talk_history["flag"] == 10
        print("------------------------------------------------")
        print(f"title: {title}")
        print(f"save date: {save_date}\n")

        print("\t相手の発言数:", int((is_talk & (talk_history["発言者"] == fr_name)).sum()))
#         print_voc_info(fr_voc, total_fr_num)

        # Use the shared myNAME constant rather than a second hard-coded copy of the name.
        print("\tあなたの発言数:", int((is_talk & (talk_history["発言者"] == myNAME)).sum()))
#         print_voc_info(my_voc, total_my_num)
        print("-------------------------------------------------")

    return talk_history

In [None]:
# Run the whole pipeline and report the elapsed time in seconds.
# perf_counter is a monotonic interval timer, so the measurement is not
# affected by system clock adjustments.
start = time.perf_counter()
talk_history = main()
end = time.perf_counter()
print(end-start, '秒')

In [None]:
# flag legend (values of the "flag" column)
# -1 : saved data
# 10 : talk message
# 11 : delete message 相手が取り消した場合 (unsent by the other party)
# 12 : delete message 自分が取り消した場合 (unsent by the owner)
# 20 : stamp
# 21 : photo
# 22 : video
# 23 : address
# 30 : call
# 31 : missed call 不在着信
# 32 : canceled call 通話をキャンセルしました
# 33 : no answer call 通話に応答がありませんでした
# 34 : invited call ☎ グループ音声通話に招待されました。
# 50 : system message unsent
# 51 : transfer_error
# 60 : file
# 70 : create and add album
# 71 : changed the name of the album
# 72 : deleted the album