## **Line Chat Analyzer -- Playground**

### **Text to Dataframe**

In [1]:
import os
import pandas as pd
import os
import re
from time import strptime
from time import process_time
from datetime import datetime
from matplotlib import pyplot as plt
from matplotlib import dates as mdates

def timeParser(string):
    """Convert a LINE-style time such as ``上午10:30`` into ``10:30 AM``.

    The first two characters are the meridiem marker and the next five are
    ``HH:MM``. Anything that does not start with ``上午`` is labelled ``PM``.
    """
    meridiem = 'AM' if string.startswith("上午") else 'PM'
    clock = string[2:7]
    return clock + ' ' + meridiem


def datetimetotimestamp(date, time):
    """Combine a ``YYYY-MM-DD`` date and a LINE-style time into a ``datetime``.

    ``time`` is first normalised by ``timeParser`` (e.g. ``下午03:15`` ->
    ``03:15 PM``) and the pair is then parsed as a 12-hour clock value.
    """
    stamp_text = ' '.join([date, timeParser(time)])
    return datetime.strptime(stamp_text, '%Y-%m-%d %I:%M %p')


def actionMsg(now_date, line):
    """Parse a system "action" line (invite / join / leave / remove / unsend).

    Parameters
    ----------
    now_date : str
        Date of the conversation day, formatted ``YYYY-MM-DD``.
    line : str
        Raw line of the export: ``<time>\\t<action text>`` (a trailing newline
        is tolerated).

    Returns
    -------
    dict
        ``full_time`` (datetime), ``member`` and ``sender`` (the text in front
        of the action keyword) and ``msg`` (the keyword plus everything after
        it). ``member`` and ``msg`` stay empty when no rule matches.

    Notes
    -----
    * Every rule is tested against ``action`` (the text after the tab). The
      original tested the last two rules against the raw ``line`` instead.
    * ``sender`` is an alias of ``member``: the DataFrame built by
      ``msg_txt_to_csv`` uses a ``sender`` column, so rows that only carried
      ``member`` lost their actor. ``member`` is kept for existing callers.
    """
    t, action = line.rstrip("\n").split("\t", 1)

    # (pattern tested against the action text, keyword the text is split on).
    # Order matters: the first matching rule wins.
    rules = (
        (r"(.*)邀請(.*)(加入|加入群組)$", "邀請"),   # A invited B to the group
        (r"(.*)加入群組$", "加入"),                  # A joined the group
        (r"(.*)已退出群組$", "已退出"),              # A left the group
        (r"(.*)已讓(.*)退出群組$", "已讓"),          # A removed B from the group
        (r"(.*)已收回訊息$", "已收回"),              # A unsent a message
    )

    member = ''
    msg = ''
    for pattern, keyword in rules:
        if re.match(pattern, action):
            member, rest = action.split(keyword, 1)
            msg = keyword + rest
            break

    return {
        'full_time': datetimetotimestamp(now_date, t),
        'member': member,
        'sender': member,
        'msg': msg,
    }


def findLastDate(txt_path):
    """Locate the date header in the chat export from which parsing should resume.

    The text file is scanned for date header lines such as ``2022/07/19（二）``,
    skipping the first three lines of the file. If a CSV with the same name
    (``.txt`` replaced by ``.csv``) exists and has rows, the header matching
    the date of its last row is returned; otherwise the first header is.

    Parameters
    ----------
    txt_path : str
        Path of the exported chat text file.

    Returns
    -------
    dict
        ``{'index': <0-based line index of the header>, 'date': 'YYYY-MM-DD'}``.

    Raises
    ------
    IndexError
        If the text file has no date header, or the last date stored in the
        CSV does not appear in the text file.
    """
    date_header = re.compile(r"^\d{4}/\d{2}/\d{2}（(一|二|三|四|五|六|日)）$")

    headers = []
    # The context manager closes the file even if reading fails.
    with open(txt_path, "r", encoding="utf-8") as f:
        for index, line in enumerate(f):
            # Skip the first 3 lines of the export.
            if index < 3:
                continue
            # Record every date header together with its line index.
            if date_header.match(line):
                headers.append({
                    'index': index,
                    'date': line.rstrip("\n")[0:10].replace('/', '-')
                })

    if not headers:
        raise IndexError("no date header line found in " + txt_path)

    csv_path = txt_path.replace('.txt', '.csv')
    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path, encoding='utf-8')
        # A CSV without rows holds nothing to resume from: fall through to the first header.
        if len(df) > 0:
            now_date = str(df.full_time.iloc[-1])[0:10].strip()
            for header in headers:
                if header['date'] == now_date:
                    return header
            raise IndexError(
                "last date in " + csv_path + " (" + now_date + ") not found in " + txt_path)

    return headers[0]


def msg_txt_to_csv(txt_path):
    """Parse a LINE chat export into a DataFrame and write it next to the text file.

    If a CSV with the same name (``.txt`` replaced by ``.csv``) already exists
    it is loaded, the rows of its last day are dropped, and the text file is
    re-read from that day's header onward, so repeated runs extend the CSV.

    Parameters
    ----------
    txt_path : str
        Path of the exported chat text file.

    Returns
    -------
    pandas.DataFrame
        Columns ``full_time``, ``sender`` and ``msg``; the same frame is
        written to the CSV (``utf_8_sig``, without the index).

    Notes
    -----
    * An action line is ``<time>\\t<action>`` with exactly one tab. The action
      pattern therefore uses ``[^\\t]*`` so that an ordinary chat message
      (``<time>\\t<sender>\\t<text>``) that merely ends in one of the action
      keywords is no longer mistaken for an action.
    * ``actionMsg`` rows are mapped onto the ``sender`` column explicitly;
      a row that only carries a ``member`` key would otherwise lose its actor.
    """
    print('-----\nhandle msg from:\t' + txt_path)

    date_header = re.compile(r"^\d{4}/\d{2}/\d{2}（(一|二|三|四|五|六|日)）$")
    action_line = re.compile(
        r"^(上午|下午)\d{2}:\d{2}\t[^\t]*(加入|加入群組|退出群組|收回訊息)$")
    chat_line = re.compile(r"^(上午|下午)\d{2}:\d{2}\t(.*)\t(.*)")

    # Load the existing CSV when there is one.
    csv_path = txt_path.replace('.txt', '.csv')
    if os.path.isfile(csv_path):
        df = pd.read_csv(csv_path, encoding='utf-8')
    else:
        df = pd.DataFrame(columns=['full_time', 'sender', 'msg'])
    if len(df) > 0:
        now_date = str(df.full_time.iloc[-1])[0:10].strip()
    else:
        now_date = '1970-01-01'  # init date

    # Drop the last stored day from the frame; it is re-read from the text file below.
    last_date = findLastDate(txt_path)
    is_last_day = df.full_time.astype(str).str.contains(last_date['date'], regex=False)
    df = df[~is_last_day]

    with open(txt_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    rows = []
    for line in lines[last_date['index']:]:
        if date_header.match(line):
            # Date header: only ever move the current date forward.
            new_date = line.rstrip("\n")[0:10].replace('/', '-')
            if strptime(now_date, "%Y-%m-%d") < strptime(new_date, "%Y-%m-%d"):
                now_date = new_date
        elif action_line.match(line):
            # System action (join / leave / unsend ...).
            action = actionMsg(now_date, line)
            rows.append({
                'full_time': action['full_time'],
                'sender': action.get('sender', action.get('member', '')),
                'msg': action['msg']
            })
        elif chat_line.match(line):
            # Ordinary chat message: time, sender, text.
            time_text, member, msg = line.rstrip("\n").split("\t", 2)
            rows.append({
                'full_time': datetimetotimestamp(now_date, time_text),
                'sender': member,
                'msg': msg
            })
        elif rows:
            # Continuation of a multi-line message: append to the previous row.
            rows[-1]['msg'] = rows[-1]['msg'].strip(
                '"') + '\n' + line.strip('\n"')

    df1 = pd.DataFrame(rows, columns=['full_time', 'sender', 'msg'])
    df = pd.concat([df, df1], ignore_index=True)
    df.to_csv(csv_path, encoding="utf_8_sig", index=False)
    return df


# .txt to .csv
# dir_path = os.getcwd()
# filename = '/chat_history_20220719.txt'
# tStart = process_time()
# msg_txt_to_csv(dir_path + filename)
# tEnd = process_time()
# print('generate:\t\t' + dir_path + '/' + filename.replace('.txt', '.csv') + '\ncast:\t\t\t' + str(tEnd - tStart) + ' sec')

### **Load Dataframe**

In [2]:
import pandas as pd
# Load the parsed chat log. This CSV is the file msg_txt_to_csv writes for
# chat_history_20220719.txt (see the commented-out ".txt to .csv" lines above);
# it has to exist in the working directory before this cell is run.
df = pd.read_csv("./chat_history_20220719.csv")

FileNotFoundError: [Errno 2] No such file or directory: './chat_history_20220719.csv'

### **Overall Statistics**
1. Number of messages sent by A and B respectively.
2. Number of total days, total messages, total calls and total call duration.
3. The day with the highest number of messages (how many?), and the day with the longest call duration (and how long?)
4. Number of messages, stickers, images, videos, and voicemail sent by A and B respectively.
5. Word cloud for all messages.
6. Number of messages vs. days.
7. Call duration vs. days.

In [75]:
# Preview the first three rows whose message starts with the ☎ marker.
df[df.msg.str.startswith("☎", na=False)].head(3)

Unnamed: 0,full_time,sender,msg
111,2020-07-12 23:34:00,吳証恩,☎ 通話時間1:02:55
147,2020-07-13 23:06:00,吳証恩,☎ 通話時間0:21
148,2020-07-13 23:56:00,吳証恩,☎ 通話時間50:06


In [104]:
from datetime import datetime, timedelta

def validate_date_format(d, format):
    """Return True when ``datetime.strptime(d, format)`` succeeds, False on ValueError."""
    try:
        datetime.strptime(d, format)
    except ValueError:
        return False
    else:
        return True

def str2timedelta(string: str):
    """Convert a call-duration string into a ``timedelta``.

    Accepts ``H:MM:SS`` (tried first) or ``M:SS``, e.g. ``"1:02:55"``,
    ``"50:06"`` or ``"0:21"``.

    Returns
    -------
    datetime.timedelta or None
        The parsed duration. When the string fits neither format a hint is
        printed and ``None`` is returned.
    """
    format1 = "%H:%M:%S"
    format2 = "%M:%S"

    # Parse once per candidate format instead of validating and then parsing again.
    for fmt in (format1, format2):
        try:
            parsed = datetime.strptime(string, fmt)
        except ValueError:
            continue
        # For the M:SS format strptime leaves the hour at 0.
        return timedelta(hours=parsed.hour, minutes=parsed.minute, seconds=parsed.second)

    print(f"should use date format {format1} or {format2}")
    return None
    

# str1 = '1:02:55'
# str2 = '0:21'
# str3 = '50:06'

# str2timedelta(str3)

datetime.timedelta(seconds=6012)

In [105]:
# Rows whose message contains the CALL text (literal substring match; missing messages never match).
# CALL is assigned in the "# constants" cell further down this notebook.
df.loc[df.msg.str.contains(CALL, na=False, regex=False)]

Unnamed: 0,full_time,sender,msg
111,2020-07-12 23:34:00,吳証恩,☎ 通話時間1:02:55
147,2020-07-13 23:06:00,吳証恩,☎ 通話時間0:21
148,2020-07-13 23:56:00,吳証恩,☎ 通話時間50:06
230,2020-07-15 22:40:00,吳証恩,☎ 通話時間23:52
700,2020-07-20 00:14:00,Christine Yu,☎ 通話時間48:53
...,...,...,...
81589,2022-07-17 16:51:00,Christine Yu,☎ 通話時間4:24
81651,2022-07-17 19:37:00,吳証恩,☎ 通話時間9:59
81878,2022-07-18 22:47:00,吳証恩,☎ 通話時間0:33
81881,2022-07-18 22:48:00,吳証恩,☎ 通話時間0:37


In [None]:
# df[df.msg.str.startswith("☎", na=False)]
# df["msg"][df["msg"].str.startswith("[",na=False) & df["msg"].str.contains("]",na=False, regex=False)].unique()

In [None]:
# constants
# Sender names; compared against df.sender to split statistics per person.
User_A = "吳証恩"
User_B = "Christine Yu"
# Literal substrings searched for in df.msg to count each kind of non-text message.
IMAGE = "[照片]"      # photo
STICKER = "[貼圖]"    # sticker
VIDEO = "[影片]"      # video
ALBUM = "[相簿]"      # album
FILE = "[檔案]"       # file
GIFT = "[禮物]"       # gift
NOTE = "[記事本]"     # note
CI = "[聯絡資訊]"     # contact information
CALL = "通話時間"     # "call duration" text found in call messages (e.g. "☎ 通話時間1:02:55")

#### **Number of messages sent by A and B respectively**

In [None]:
# Summing the boolean mask counts the rows sent by each user.
print(f"Total messages sent by {User_A}: {(df.sender == User_A).sum()}")
print(f"Total messages sent by {User_B}: {(df.sender == User_B).sum()}")

#### **Total Numbers**

In [None]:
def _count_msgs_containing(token):
    """Count the rows of ``df`` whose ``msg`` contains ``token`` literally (missing messages never match)."""
    return int(df.msg.str.contains(token, na=False, regex=False).sum())


print(f"Number of total days: {pd.to_datetime(df.full_time).dt.normalize().nunique()}")
print(f"Number of total messages: {len(df)}")
# One line per message kind: (label printed, substring searched for).
for _label, _token in (
    ("images", IMAGE),
    ("stickers", STICKER),
    ("videos", VIDEO),
    ("albums", ALBUM),
    ("files", FILE),
    ("gifts", GIFT),
    ("notes", NOTE),
    ("C.I.", CI),
    ("calls", CALL),
):
    print(f"Number of total {_label}: {_count_msgs_containing(_token)}")
# print(f"Total call duration: {df[df.msg.str.contains(CALL,na=False, regex=False)].shape[0]}")

#### **The day with the highest number of messages (how many?), and the day with the longest call duration (and how long?)**

### **Statistics for A Specific Word**