In [None]:
import csv
import re
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

0. 将QQ导出的聊天记录从txt格式转换为csv格式

如何导出：

一、点开电脑版QQ左下角的“三条杠”，再打开右上角的“消息管理”

二、选择任意一个群聊，右键“导出消息记录”

三、选择为“txt”格式导出，放在当前路径下

In [None]:
# Set TXT to the name of your group chat (the exported file's base name).
TXT = '你的群名'
# Input (exported chat log) and output (converted table) file names,
# both derived from the same base name.
myTXT = f'{TXT}.txt'
myCSV = f'{TXT}.csv'

In [None]:
# Convert the exported TXT chat log into a four-column CSV.
# The first 8 lines of the file are dropped before parsing
# (presumably the export's preamble -- confirm against a real export).
with open(myTXT, 'r', encoding='utf-8') as f:
    lines = f.readlines()[8:]

# A message header looks like "<date> <time> <nickname>(<digits>)" or
# "<date> <time> <nickname><...>". Any four-digit year and a one- or two-digit
# hour are accepted so logs outside the 2020s, or with unpadded hours, still parse.
header_pattern = re.compile(
    r'(\d{4}-\d{2}-\d{2} \d{1,2}:\d{2}:\d{2}) (.*?)(\(\d+\)|\<.*\>)'
)

with open(myCSV, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['时间', '用户ID', '用户名', '聊天'])
    # Reset the "current header" state on every run so that body lines seen
    # before the first header are never attributed to values left over from a
    # previous execution of this cell (and never raise NameError on a fresh kernel).
    timestamp = userid = username = None
    for line in lines:
        match = header_pattern.search(line)
        if match:
            # Header line: remember who is speaking and when.
            timestamp, username, userid = match.groups()
        elif timestamp is not None:
            # Body line: one CSV row per text line, tagged with the latest header.
            writer.writerow([timestamp, userid, username, line.strip()])


In [None]:
# Pre-processing: load the converted CSV, parsing the '时间' column as datetimes.
data = pd.read_csv(myCSV, parse_dates=['时间'])
# Report how many missing values each column contains.
print(data.isnull().sum(axis=0))
# Keep only the rows whose '聊天' value is present (not null / NaN).
data = data[data['聊天'].notna()]
# Write the cleaned table back over the same CSV file.
data.to_csv(myCSV, index=False)

In [None]:
# Sanity check: display the first ten rows of the cleaned table.
data.head(n=10)

In [None]:
# Normalise column dtypes. astype() returns a new frame, so the result must be
# assigned back; the datetime dtype needs an explicit unit ('datetime64[ns]')
# because pandas 2.x rejects the unit-less 'datetime64'.
data = data.astype({'时间': 'datetime64[ns]', '用户ID': 'str', '聊天': 'str'})
# Derived columns used by the per-day and per-hour aggregations.
data['date'] = data['时间'].dt.date
data['hour'] = data['时间'].dt.hour

In [None]:
# Number of distinct users (by 用户ID) that appear in the chat log.
data['用户ID'].nunique()

In [None]:
# Number of distinct users who posted on each day.
df = data.groupby('date')['用户ID'].nunique().reset_index()

# Configure the theme and the CJK-capable font BEFORE creating the figure.
# sns.set() rewrites matplotlib rcParams (including font.sans-serif), so the
# font has to be assigned after it and before anything is drawn.
sns.set(style='darkgrid', context='notebook', font_scale=1.2)
mpl.rcParams['font.sans-serif'] = ['SimHei']

plt.figure(figsize=(16, 5))
sns.lineplot(data=df, x='date', y='用户ID')
plt.title('群聊每日发言用户数量', pad=15, fontdict={'fontsize': 20})
# One tick per day present in the data, rotated to limit label overlap.
plt.xticks(df['date'], rotation=70)
plt.show()

In [None]:
# Number of messages posted on each day.
df = data.groupby('date')['聊天'].count().reset_index()

# Configure the theme and the CJK-capable font BEFORE creating the figure.
# sns.set() rewrites matplotlib rcParams (including font.sans-serif), so the
# font has to be assigned after it and before anything is drawn.
sns.set(style='darkgrid', context='notebook', font_scale=1.2)
mpl.rcParams['font.sans-serif'] = ['SimHei']

plt.figure(figsize=(16, 5))
sns.lineplot(data=df, x='date', y='聊天')
plt.title('群聊每日消息数', pad=15, fontdict={'fontsize': 20})
# One tick per day present in the data, rotated to limit label overlap.
plt.xticks(df['date'], rotation=70)
plt.show()

In [None]:
# Total number of messages per hour of day, over the whole log.
df = data.groupby('hour')['聊天'].count().reset_index()
df['hour'] = df['hour'].astype('int64')
# Sort without inplace=True so re-running the cell never mutates a shared frame.
df = df.sort_values('hour')

# Configure the theme and the CJK-capable font BEFORE creating the figure.
# sns.set() rewrites matplotlib rcParams (including font.sans-serif), so the
# font has to be assigned after it and before anything is drawn.
sns.set(style='darkgrid', context='notebook', font_scale=1.2)
mpl.rcParams['font.sans-serif'] = ['SimHei']

plt.figure(figsize=(10, 5))
sns.lineplot(data=df, x='hour', y='聊天')
plt.title('群聊每时段消息数', pad=15, fontdict={'fontsize': 20})
# One tick per hour present in the data.
plt.xticks(df['hour'], rotation=70)
plt.show()

In [None]:
# Message count for every (date, hour) pair that has at least one message ...
per_day_hour = data.groupby(['date', 'hour'])['聊天'].count().reset_index()
# ... then averaged per hour of day.
# NOTE: a day with no messages in a given hour contributes no row above, so the
# mean is taken over the days that were active in that hour, not over all days.
df = per_day_hour.groupby('hour')['聊天'].mean().reset_index()

# Configure the theme and the CJK-capable font BEFORE creating the figure.
# sns.set() rewrites matplotlib rcParams (including font.sans-serif), so the
# font has to be assigned after it and before anything is drawn.
sns.set(style='darkgrid', context='notebook', font_scale=1.2)
mpl.rcParams['font.sans-serif'] = ['SimHei']

plt.figure(figsize=(10, 5))
sns.lineplot(data=df, x='hour', y='聊天')
plt.title('群聊日均每时段消息数', pad=15, fontdict={'fontsize': 20})
# One tick per hour present in the data.
plt.xticks(df['hour'], rotation=70)
plt.show()


In [None]:
# Message count per 用户ID, most active first.
df = (
    data.groupby('用户ID')
    .size()
    .sort_values(ascending=False)
)
# Show the ten most active users.
df.head(n=10)

In [None]:
# Choose the user to plot. The lookup below is on the '用户ID' index level, so
# this must be a 用户ID value (not a nickname), e.g. one of the values listed
# by the previous cell.
user = '用户ID，可从本单元格上方选其一'
if user not in set(data['用户ID']):
    # Fail with an explicit message instead of an opaque MultiIndex KeyError.
    raise KeyError(f'未找到用户ID {user!r}，请从上一个单元格的输出中选择一个用户ID')

# Messages per day for the selected user (Series indexed by date).
df = data.groupby(['date', '用户ID']).size().loc[:, user]

# Select the CJK-capable font before any text is created or drawn.
plt.rcParams['font.sans-serif'] = ['SimHei']

df.plot(kind='line')
plt.title(f'{user} 的每日消息数')
plt.xlabel('date')
plt.ylabel('消息数')
plt.xticks(rotation=70)
plt.show()

In [None]:
# --- Stop words -------------------------------------------------------------
# Every user ID and nickname is treated as a stop word, collected into a set.
text_stop_set = set(data['用户ID'].astype(str).unique()).union(set(data['用户名'].astype(str).unique()))

# Add the common stop words, one per line (more can be appended to the file).
with open('stopwords.txt', 'r', encoding='utf-8') as f1:
    text_stop_set.update(line.strip() for line in f1)

# --- Tokenise and filter ----------------------------------------------------
# lcut_for_search already returns a list, so no extra list() wrapper is needed.
text_chat = jieba.lcut_for_search(' '.join(data['聊天'].astype(str).tolist()))
# Stop words are tokenised the same way so they are compared token-to-token.
text_stop_list = jieba.lcut_for_search(' '.join(text_stop_set))
# Membership is tested for every chat token: use a set so each test is O(1)
# instead of a linear scan over the whole stop-token list.
text_stop_tokens = set(text_stop_list)
text_gen = ' '.join([x for x in text_chat if x not in text_stop_tokens])

# --- Render the word cloud --------------------------------------------------
# NOTE(review): font_path is a Windows-specific location; point it at any
# CJK-capable font file when running on another platform.
wordcloud = WordCloud(font_path="C:\\Windows\\Fonts\\simhei.ttf",
                      scale=4,
                      max_words=200,
                      background_color='white',
                      max_font_size=200,
                      min_font_size=1,
                      collocations=False,
                      width=1600,
                      height=1200
                      ).generate(text_gen)

plt.imshow(wordcloud, interpolation='catrom')
plt.axis("off")
plt.show()
