-
Notifications
You must be signed in to change notification settings - Fork 0
/
whatsapp-wordcloud.py
162 lines (127 loc) · 6.02 KB
/
whatsapp-wordcloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import sys, os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import re
from collections import Counter
import json
# Let users know that it's working in the background
print("Generating WordClouds...\n")

# Load the run configuration from ./config.json into `config`.
# Later steps read the keys 'chat_filepath', 'my_name' and 'partner_name'.
try:
    # The context manager closes the file even if json.load() raises.
    with open('./config.json', 'r') as cf:
        config = json.load(cf)
except FileNotFoundError:
    msg = "Sorry, the file ./config.json does not exist."
    print(msg)
    # Nothing below can run without the config; stop here instead of
    # crashing later with a NameError on `config`.
    sys.exit(1)
# Read the whole chat export (the '_chat.txt' file exported by WhatsApp)
# into the string `chat`.
try:
    # The context manager closes the file even if read() raises.
    with open(config['chat_filepath'], encoding="utf8") as chat_file:
        chat = chat_file.read()
except FileNotFoundError:
    msg = "Sorry, the file " + config['chat_filepath'] + ' does not exist.'
    print(msg)
    # The rest of the script only processes `chat`; stop here instead of
    # crashing later with a NameError.
    sys.exit(1)
# Split the export into two parallel lists: `dates` (the text inside each
# bracketed timestamp) and `messages` (everything that follows a timestamp,
# up to the next one).
#
# The pattern only accepts a '[' that opens a line (optionally preceded by
# invisible direction marks / a BOM) and whose content starts with a digit,
# and it stops at the first ']'. That keeps bracketed text inside a message
# from being mistaken for a timestamp.
# NOTE(review): assumes every timestamp starts a line and begins with a
# digit -- confirm against the export format you use.
timestamp_pattern = re.compile(
    r'^[\u200e\u200f\ufeff]*\[(\d[^\]\n]*)\]', re.MULTILINE
)

# With one capture group, split() returns
#   [preamble, date_1, text_1, date_2, text_2, ...]
# so a single pass yields both lists, aligned by construction (no repeated
# str.replace over the whole chat, no removal of empty entries).
parts = timestamp_pattern.split(chat)
dates = parts[1::2]

# Flatten multi-line messages onto one line. A space (rather than nothing)
# replaces each newline so the last word of one line is not fused with the
# first word of the next.
messages = [text.replace('\n', ' ') for text in parts[2::2]]
# Sender prefixes as they appear in front of each message.
# The trailing colon and space are kept so that a plain mention of a name
# inside a message is not mistaken for the sender prefix.
my_name = config['my_name'] + ': '
partner_name = config['partner_name'] + ': '

# (prefix to look for, author name to record)
sender_prefixes = (
    (my_name, config['my_name']),
    (partner_name, config['partner_name']),
)

# Parallel lists: author[i] wrote clean_messages[i] at dates[i].
author = []
clean_messages = []
attributed_dates = []

# zip() stops at the shorter list, so a length mismatch between `dates` and
# `messages` cannot raise IndexError.
for date, message in zip(dates, messages):
    # Ignore leading blanks / direction marks in front of the sender name.
    text = message.lstrip(' \t\u200e\u200f')
    for prefix, name in sender_prefixes:
        # Only a prefix at the very start identifies the sender; each message
        # is attributed at most once.
        if text.startswith(prefix):
            attributed_dates.append(date)
            author.append(name)
            # Remove just the leading prefix, not later occurrences in the text.
            clean_messages.append(text[len(prefix):].strip())
            break

# Entries that belong to neither person are dropped from `dates` as well, so
# the three lists stay the same length when the DataFrame is built.
dates = attributed_dates
# Combine the three parallel lists into one table (one row per message).
CSVdict = {'Date': dates, 'Author': author, 'Message': clean_messages}
df = pd.DataFrame(CSVdict)

# Preview dataframe
# display(df) # Uncomment this line to preview the dataframe before it is saved.

# Save the table as CSV. The target directory is created first so a fresh
# checkout without a ./data folder does not make to_csv() fail.
csv_filename = './data/all_messages.csv'
os.makedirs(os.path.dirname(csv_filename), exist_ok=True)
df.to_csv(csv_filename)
print(f'Successfully saved {csv_filename}!\n')
# Rows whose message contains any of these markers are not real text
# messages and are removed before the word clouds are built:
#   - attachments
#   - messages deleted by either person
#   - missed video / voice calls
non_message_markers = (
    "<attached:",
    "deleted this message",
    "This message was deleted",
    "Missed video call",
    "Missed voice call",
)
for marker in non_message_markers:
    contains_marker = df["Message"].str.contains(marker)
    df = df[~contains_marker]
# Split the table by author. Exact equality is used instead of str.match():
# str.match() interprets the name as a regular expression anchored only at
# the start, so regex metacharacters in a name would misbehave and "Ann"
# would also select rows written by "Anna".
partner_df = df[df['Author'] == config['partner_name']]
my_df = df[df['Author'] == config['my_name']]

# Each person's messages as a plain list. Used for graph generation.
my_messages = my_df['Message'].tolist()
partner_messages = partner_df['Message'].tolist()

# Each person's messages joined into one string (can be used to generate a
# non-frequency based wordcloud using WordCloud.generate() if needed).
my_messages_string = '. '.join(my_messages)
partner_messages_string = '. '.join(partner_messages)
# Save each person's joined message string to
# ./data/<name with spaces replaced by underscores>_message_string.txt
# for easy access later.
os.makedirs('./data', exist_ok=True)  # make sure the target folder exists

for person, text in (
    (config['partner_name'], partner_messages_string),
    (config['my_name'], my_messages_string),
):
    filename = './data/' + person.replace(' ', '_') + '_message_string.txt'
    # The context manager closes the file even if the write fails.
    with open(filename, 'w', encoding='utf8') as out_file:
        out_file.write(text)
    print(f'Successfully saved {filename}!')

# Blank line after the two status messages.
print()
# Count how many times each distinct message occurs, per person.
my_word_count_dict = Counter(my_messages)
partner_word_count_dict = Counter(partner_messages)

# Build one 1000x500 word cloud per person from those counts.
cloud_size = {'width': 1000, 'height': 500}
my_wordcloud = WordCloud(**cloud_size).generate_from_frequencies(
    my_word_count_dict
)
partner_wordcloud = WordCloud(**cloud_size).generate_from_frequencies(
    partner_word_count_dict
)
# Render each word cloud and save it to
# ./output/<name with spaces replaced by underscores>_wordcloud.png
os.makedirs('./output', exist_ok=True)  # savefig() does not create folders

for person, cloud in (
    (config['my_name'], my_wordcloud),
    (config['partner_name'], partner_wordcloud),
):
    plt.figure(figsize=(15, 8))
    plt.imshow(cloud)
    plt.axis("off")
    #plt.show() # Can be uncommented to preview graph before saving to file.
    filename = './output/' + person.replace(' ', '_') + '_wordcloud.png'
    plt.savefig(filename, bbox_inches='tight')
    print(f'Successfully saved {filename}!')
    # Close the figure so it does not stay open while the next one is drawn.
    plt.close()

# Say thanks for using my project!
print("\nWordClouds Successfully Saved!\nGoodbye!")