-
Notifications
You must be signed in to change notification settings - Fork 0
/
generator.py
122 lines (92 loc) · 4.26 KB
/
generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import collections
import json
import re
from typing import List, Tuple

import matplotlib.pyplot as plot
import numpy
# Common stopwords, mostly taken from the library NLTK, plus a few filler
# words that kept showing up in the graph.
# Stored as a frozenset because the collection is only used for membership
# tests, which are constant-time on a set instead of a linear scan.
STOP_WORDS = frozenset((
    'ourselves', 'd', 'yourself', 'but', 'second', 'go', 'around',
    'hey', 'girl', 'someone', 'again', 'one', 'two', 'man', 'woman',
    'get', 'back', 'figure', 'there', 'about', 'once', 'during', 'out',
    'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for',
    'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself',
    'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him',
    'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these',
    'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more',
    'himself', 'this', 'down', 'should', 'our', 'their', 'while',
    'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no',
    'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been',
    'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that',
    'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
    'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where',
    'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few',
    'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by',
    'doing', 'it', 'how', 'further', 'was', 'here', 'than', 'll',
    've', 're', 'm', 'next', 'guy', 'person', 'like', 'first', 'us',
    'another', 'character', 'still',
))
def process_text(s: str) -> str:
    """Lower-cases text and strips markup, special characters and stopwords

    Args:
        s (str): The string to process

    Returns:
        str: The remaining words joined by single spaces (an empty string
            when nothing survives the filtering)
    """
    # The pattern is a raw string so the regex escapes reach ``re`` untouched;
    # in a normal string literal \[ and \] are invalid escape sequences that
    # newer interpreters warn about. The three alternatives are:
    #   \[\[.+?\]\]  matches [[ ]] and any text between
    #   {{.+?}}      matches {{ }} and any text between
    #   [^a-z \n]    matches anything that is not a-z, a space or a linebreak
    # '.' does not match a linebreak here, so a bracketed span is only removed
    # as a whole when it sits on a single line.
    pattern = re.compile(r'(\[\[.+?\]\])|({{.+?}})|([^a-z \n])')
    lowered = s.lower()
    stripped = pattern.sub(' ', lowered)
    return ' '.join(w for w in stripped.split() if w not in STOP_WORDS)
def generate_json():
    """Generates a file (generated.json) of processed comic text

    Reads ``xkcd_comics.json``, runs each comic's transcript through
    :func:`process_text` and stores the result under the comic's key. When the
    processed transcript is empty, the processed alt text is stored instead.
    Entries are written in the order they appear in the input file; no
    sorting is performed here.
    """
    # Read and write as UTF-8 explicitly instead of relying on the platform's
    # default encoding.
    with open('xkcd_comics.json', encoding='utf-8') as f:
        comics = json.load(f)

    processed_dict = {}
    for comic, entry in comics.items():
        # Fall back to the alt text only when the transcript processes to ''.
        processed_dict[comic] = (process_text(entry['transcript'])
                                 or process_text(entry['alt']))
        # The carriage return keeps the progress report on one console line.
        print(f'Processed comic #{comic}.', end='\r')

    with open('generated.json', 'w', encoding='utf-8') as f:
        json.dump(processed_dict, f, indent=2)
    print(f'\nSuccessfully wrote {len(processed_dict)} comics to file.')
def _count(limit: int = 10,
           path: str = 'generated.json') -> List[Tuple[str, int]]:
    """Counts the occurrences of "significant" words

    Args:
        limit (int): How many of the most frequent words to return
        path (str): JSON file mapping comic keys to processed text

    Returns:
        List[Tuple[str, int]]: ``(word, count)`` pairs, most frequent first
    """
    with open(path, encoding='utf-8') as f:
        d = json.load(f)

    # Count value by value rather than joining every text into one big string
    # first; the words are seen in the same order, so the result is the same.
    counter = collections.Counter()
    for text in d.values():
        counter.update(text.split())
    return counter.most_common(limit)
def make_graph(top_words: tuple):
    """Creates and displays a horizontal bar chart using matplotlib

    Note:
        This was mostly taken from here: https://pythonspot.com/matplotlib-bar-chart/

    Args:
        top_words (tuple): Sequence of ``(word, count)`` pairs, such as the
            one returned by :func:`_count()`
    """
    # Everything is drawn inside the xkcd() context so the sketch style
    # applies to the whole figure.
    with plot.xkcd():
        words = [pair[0] for pair in top_words]
        counts = [pair[1] for pair in top_words]
        # One bar per word, placed at consecutive integer positions.
        y_pos = numpy.arange(len(words))

        plot.barh(y_pos, counts, align='center', alpha=1)
        plot.yticks(y_pos, words)
        plot.xlabel('Occurrences')
        plot.title('Word frequency in XKCD comics')

        plot.show()
if __name__ == '__main__':
    # Script entry point: rebuild generated.json first, then chart the most
    # frequent words read back from it.
    generate_json()
    make_graph(_count())