-
Notifications
You must be signed in to change notification settings - Fork 0
/
program.py
394 lines (308 loc) · 13.2 KB
/
program.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
from nltk import *
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
import matplotlib.pyplot as plt
import numpy as np
import string
import math
'''
Part I - Formatting => Dividing the book into arrays of chapter titles, locations, and contents, as well as some data for easy access.
'''
# --- Data containers filled in by the formatting passes below ---
chapter_names = [] # List of strings
chapter_indices = [] # List of integers
chapters = [] # List of strings
complete_text = ""
words_complete = [] # List of strings
sentences_complete = [] # List of strings
sentence_words = [] # List of lists of strings
sentence_length_in_words = [] # List of integers
chapter_words = [] # List of lists of strings
chapter_sentences = [] # List of lists of strings
chapter_avg_word_length = [] # List of floats (sum of lengths / count)
chapter_lengths_words = [] # List of integers
chapter_lengths_sentences = [] # List of integers
norm_chapter_lengths_words = [] # List of floats
norm_chapter_lengths_sentences = [] # List of floats
chapter_word_length_cumulative = [] # List of integers
chapter_sentence_length_cumulative = [] # List of integers
positive_sentences = [] # List of integers
negative_sentences = [] # List of integers
keywords = ['death', 'science', 'space', 'rationality', 'human', 'bias', 'fallacy', 'error', 'plot', 'game', 'battle', 'Dark']
# Read the book with a context manager so the handle is always closed
# (the original left the file open for the lifetime of the program).
with open("hpmor.txt") as f:
    # Extracting chapter names: the first 122 lines are "<number> <title>";
    # drop everything up to the first space and the trailing newline.
    for _ in range(122):
        line = f.readline()
        chapter_names.append(line[line.find(' ') + 1:len(line) - 1])
    # The remainder of the file is the full text of the book.
    text = f.read()
# Extracting starting indexes of each chapter: a title followed by a
# newline marks where that chapter's text begins.
for name in chapter_names:
    chapter_indices.append(text.find(name + "\n"))
# Separating barely formatted chapter texts.
# complete_text is assembled via join instead of repeated string
# concatenation, which was quadratic in the size of the book.
_chapter_parts = []
for i in range(len(chapter_indices)):
    first_index = chapter_indices[i]
    if i != len(chapter_indices) - 1:
        chapter_text = text[first_index:chapter_indices[i + 1]]
    else:
        chapter_text = text[first_index:]
    # Drop the title line, flatten line breaks, strip decoration.
    chapter_text = chapter_text[chapter_text.find('\n'):].replace('\n', ' ').replace('\r', '').replace('*', '')
    chapters.append(chapter_text.strip())
    _chapter_parts.append(chapter_text)
complete_text = "".join(_chapter_parts)
# Tokenizing text into significant words and sentences (stored in words_complete and sentences_complete),
# storing significant words in each sentence in sentence_words and their number in sentence_length_in_words
words_complete = word_tokenize(complete_text)
sentences_complete = sent_tokenize(complete_text)
for sentence in sentences_complete:
    words_in_sentence = word_tokenize(sentence)
    sentence_words.append(words_in_sentence)
    sentence_length_in_words.append(len(words_in_sentence))
# Sets give O(1) membership tests; the original used a list of stop words
# (O(n) per lookup) and rebuilt words_complete once per punctuation
# character (32 full passes over the corpus).
stop_words = set(corpus.stopwords.words('english'))
punctuations = string.punctuation
_punctuation_set = set(punctuations)
# A word is "significant" when it is longer than 2 characters, is not a
# stop word, and contains no punctuation character — all in a single pass.
words_complete = [word for word in words_complete
                  if len(word) > 2
                  and word.lower() not in stop_words
                  and not _punctuation_set.intersection(word)]
# Tokenizing each chapter's text into significant words and sentences (stored in chapter_words and chapter_sentences)
# storing number of words and sentences in chapter_lengths_words and chapter_lengths_sentences,
# and normalized number of words and sentences in norm_chapter_lengths_words and norm_chapter_lengths_sentences
# NOTE: the original reused the loop variable `i` both for the chapter text
# and for the punctuation characters of the inner filter loop; distinct
# names are used here to remove that shadowing, and the punctuation filter
# is collapsed into the single significance test below.
for chapter_text in chapters:
    words = word_tokenize(chapter_text)
    sentences = sent_tokenize(chapter_text)
    # Same "significant word" rule as for the whole book, in one pass.
    words = [word for word in words
             if len(word) > 2
             and word.lower() not in stop_words
             and not any(ch in punctuations for ch in word)]
    # Guard against a chapter with no significant words: the original
    # would raise ZeroDivisionError here.
    chapter_avg_word_length.append(sum(map(len, words)) / len(words) if words else 0.0)
    chapter_words.append(words)
    chapter_sentences.append(sentences)
# Per-chapter word counts, their running total, and max-normalized values.
cumulative = 0
for words in chapter_words:
    chapter_lengths_words.append(len(words))
    cumulative += len(words)
    chapter_word_length_cumulative.append(cumulative)
largest = max(chapter_lengths_words)
for count in chapter_lengths_words:
    norm_chapter_lengths_words.append(count / largest)
# Per-chapter sentence counts, their running total, and max-normalized values.
cumulative = 0
for sentences in chapter_sentences:
    chapter_lengths_sentences.append(len(sentences))
    cumulative += len(sentences)
    chapter_sentence_length_cumulative.append(cumulative)
largest = max(chapter_lengths_sentences)
for count in chapter_lengths_sentences:
    norm_chapter_lengths_sentences.append(count / largest)
'''
Part II - Final Formatting
'''
# Numpy array for chapter numbers (1..number of chapters, derived from the
# data instead of the hard-coded 122 so the script adapts to other texts)
chapter_numbers_axis = np.arange(1, len(chapters) + 1)
# Numpy arrays for number of words in each chapter, and number of sentences in each chapter
chapter_words_axis = np.array(chapter_lengths_words)
chapter_sentences_axis = np.array(chapter_lengths_sentences)
# Numpy arrays for normalized number of words in each chapter, and normalized number of sentences in each chapter (Method of Normalization: Each number is divided by the max in list)
norm_chapter_words_axis = np.array(norm_chapter_lengths_words)
norm_chapter_sentences_axis = np.array(norm_chapter_lengths_sentences)
# Numpy arrays for number of sentences in text, and number of words in each sentence
sentence_numbers_axis = np.arange(1, len(sentence_words) + 1)
sentence_length_in_words_axis = np.array(sentence_length_in_words)
# Numpy arrays for most frequent words in the book, and their frequency count
fdist = FreqDist(words_complete).most_common(50)
freq_words_axis = np.array([pair[0] for pair in fdist])
freq_values_axis = np.array([pair[1] for pair in fdist])
# Numpy arrays for lengths of words in the book, and their frequencies
word_length_fdist = FreqDist(len(word) for word in words_complete).most_common(19)
word_length_axis = np.array([pair[0] for pair in word_length_fdist])
freq_word_length_axis = np.array([pair[1] for pair in word_length_fdist])
# Numpy array for average word length in each chapter
chapter_avg_word_length_axis = np.array(chapter_avg_word_length)
'''
Part III - Plotting
'''
def plot_words_sents_per_chapter():
    """Line plots of raw word and sentence counts per chapter (figure 1)."""
    plt.figure(1)
    for axis, label in ((chapter_words_axis, "Word Count"),
                        (chapter_sentences_axis, "Sentence Count")):
        plt.plot(chapter_numbers_axis, axis, label=label)
    plt.xlabel('Chapters')
    plt.ylabel('Number')
    plt.title("Count of Words and Sentences in Each Chapter")
    plt.legend()
def plot_norm_words_sents_per_chapter():
    """Line plots of max-normalized word and sentence counts per chapter (figure 2)."""
    plt.figure(2)
    for axis, label in ((norm_chapter_words_axis, "Normalized Word Count"),
                        (norm_chapter_sentences_axis, "Normalized Sentence Count")):
        plt.plot(chapter_numbers_axis, axis, label=label)
    plt.xlabel('Chapters')
    plt.ylabel('Number')
    plt.title("Normalized Count of Words and Sentences in Each Chapter")
    plt.legend()
def plot_words_per_sentence():
    """Line plot of the word count of every sentence in the book (figure 3)."""
    figure = plt.figure(3)
    axes = figure.gca()
    axes.plot(sentence_numbers_axis, sentence_length_in_words_axis)
    axes.set_xlabel('Sentences')
    axes.set_ylabel('No. of Words')
    axes.set_title("Count of Words Per Sentence")
def plot_word_frequencies():
    """Horizontal bar chart of the 50 most common words in the book (figure 4)."""
    figure = plt.figure(4)
    axes = figure.gca()
    axes.barh(freq_words_axis, freq_values_axis)
    axes.set_xlabel('Frequency')
    axes.set_ylabel('Word')
    axes.set_title("Most Common Words")
def plot_chapter_word_frequency(no):
    """Horizontal bar chart of the 20 most common significant words in
    chapter *no* (1-based), drawn in a fresh auto-numbered figure."""
    top_words = FreqDist(chapter_words[no - 1]).most_common(20)
    labels = np.array([pair[0] for pair in top_words])
    counts = np.array([pair[1] for pair in top_words])
    figure = plt.figure()
    axes = figure.gca()
    axes.barh(labels, counts)
    axes.set_xlabel('Frequency')
    axes.set_ylabel('Word')
    axes.set_title("Most Common Words in Chapter {}: {}".format(no, chapter_names[no - 1]))
def plot_word_length_frequency():
    """Bar chart of how often each word length occurs in the book (figure 5)."""
    figure = plt.figure(5)
    axes = figure.gca()
    axes.bar(word_length_axis, freq_word_length_axis)
    axes.set_xlabel('Word Length')
    axes.set_ylabel('Frequency')
    axes.set_title("Frequency Distribution of Word Lengths")
def plot_avg_word_length():
    """Line plot of the average significant-word length per chapter (figure 6)."""
    figure = plt.figure(6)
    axes = figure.gca()
    axes.plot(chapter_numbers_axis, chapter_avg_word_length_axis)
    axes.set_xlabel('Chapters')
    axes.set_ylabel('Average Word Length')
    axes.set_title('Distribution of Average Word Length across Chapters')
def plot_dispersion_keywords(words):
    """Dispersion plot (figure 7): one '|' marker per occurrence of each
    keyword in *words*, at its position in the significant-word stream.
    Dotted vertical lines mark every 5th chapter boundary for orientation.
    """
    # Build the word -> positions index in ONE pass over the corpus.
    # The original rescanned the entire words_complete list once per
    # keyword, i.e. O(keywords * corpus) instead of O(corpus).
    positions = {word: [] for word in words}
    for index, token in enumerate(words_complete):
        if token in positions:
            positions[token].append(index)
    plt.figure(7)
    for word in words:
        plt.scatter(positions[word], [word] * len(positions[word]), marker = '|', color='#1f77b4')
    # Chapter boundary markers, derived from the data rather than a
    # hard-coded chapter count.
    for chapter in range(0, len(chapter_word_length_cumulative), 5):
        plt.axvline(chapter_word_length_cumulative[chapter], color='#E56967', linestyle='dotted', linewidth=1)
    plt.xlabel('Position')
    plt.ylabel('Word')
    plt.title('Dispersion Plot of Keyword Positions in the Book')
def plot_dispersion_sentiments(chapter):
    """Classify every sentence of the given chapter (0-based index) with
    TextBlob's NaiveBayesAnalyzer, draw a dispersion plot of positive vs
    negative sentence positions, and save it as a PNG named after the
    chapter. Slow: one Bayes analysis per sentence."""
    # Sentence range of this chapter within sentences_complete.
    begin = 0 if chapter == 0 else chapter_sentence_length_cumulative[chapter - 1]
    end = chapter_sentence_length_cumulative[chapter]
    positives = []
    negatives = []
    for position in range(begin, end):
        blob = TextBlob(sentences_complete[position], analyzer=NaiveBayesAnalyzer())
        # p_pos > 0.5 means the analyzer leans positive.
        (positives if blob.sentiment.p_pos > 0.5 else negatives).append(position)
    plt.figure(figsize=(15.0, 5.0))
    plt.scatter(np.array(positives), ["Positive Sentences"] * len(positives), marker = '|', color='#1f77b4')
    plt.scatter(np.array(negatives), ["Negative Sentences"] * len(negatives), marker = '|', color='#1f77b4')
    plt.xlabel('Position')
    plt.ylabel('Sentiment')
    plt.title('Dispersion Plot of Positive and Negative Sentences in the Book')
    name = "Chapter " + str(chapter+1) + " - " + chapter_names[chapter] + ".png"
    plt.savefig(name)
    plt.close()
'''
Part IV - Usage
'''
# Menu of plotting functions: choice N runs functions[N - 1].
# Hoisted out of the loop — the list never changes between iterations.
functions = [plot_words_sents_per_chapter, plot_norm_words_sents_per_chapter, plot_words_per_sentence, plot_word_frequencies, plot_chapter_word_frequency,
             plot_word_length_frequency, plot_avg_word_length, plot_dispersion_keywords, plot_dispersion_sentiments]
keep_going = True
while keep_going:
    print("Which of the following plots would you like to see?")
    print("1. Distribution of Words and Sentences per Chapter")
    print("2. Normalized Distribution of Words and Sentences per Chapter")
    print("3. Distribution of Words per Sentence")
    print("4. Distribution of the Most Common Words and their Frequencies")
    print("5. Distribution of the Most Common Words and their Frequencies in Each Chapter")
    print("6. Distribution of Word Lengths and their Frequencies")
    print("7. Distribution of Average Word Length across Chapters")
    print("8. Dispersion of Keyword Positions")
    print("9. Sentiment Analysis of Each Chapter (The graph will be saved to the folder)")
    print("10. Exit the program")
    print("If you have multiple choices, enter them separated by comma, or if a range, the first number and the last number separated by a hyphen.")
    print("Example: To choose 1 and 4, give the input '1,4' and to choose all of them, choose '1-9'. To exit, give just '10'. All without the quotes, obviously.")
    while True:
        choice = input()
        chosen = []
        if choice == "10":
            keep_going = False
            break
        # Parse by splitting instead of per-character index arithmetic.
        # The original appended BOTH endpoints of every adjacent comma
        # pair (so "1,2,3" ran plot 2 twice) and called int() on characters
        # before validating them (ValueError on e.g. "1,x,3").
        parts = choice.split(',')
        if len(parts) == 1 and '-' in choice:
            # Range form "a-b": run every plot from a through b inclusive.
            bounds = choice.split('-')
            if (len(bounds) == 2 and bounds[0].isnumeric() and bounds[1].isnumeric()
                    and 1 <= int(bounds[0]) <= int(bounds[1]) <= len(functions)):
                chosen = [functions[i] for i in range(int(bounds[0]) - 1, int(bounds[1]))]
            else:
                print("Enter valid input, you blithering moron.")
                continue
        elif all(part.isnumeric() and 1 <= int(part) <= len(functions) for part in parts):
            # Comma form "a,b,c" (a single number is the one-element case);
            # keep only the first occurrence of each selection.
            for part in parts:
                function = functions[int(part) - 1]
                if function not in chosen:
                    chosen.append(function)
        else:
            print("Enter valid input, you blithering moron.")
            continue
        break
    # Plot 5 needs a chapter number before it can run.
    if plot_chapter_word_frequency in chosen:
        print("On which chapter would you like the distribution of most common words and their frequencies?")
        while True:
            chapter = input()
            if not chapter.isnumeric() or int(chapter) < 1 or int(chapter) > len(chapters):
                print("Try again, you gibbering dullard.")
                continue
            break
        plot_chapter_word_frequency(int(chapter))
        chosen.remove(plot_chapter_word_frequency)
    # Plot 9 also needs a chapter number (and is expensive, hence one per go).
    if plot_dispersion_sentiments in chosen:
        print("On which chapter would you like the sentiment analysis dispersion? Because it takes a while to generate it for a given chapter, you're limited to one per go.")
        while True:
            chapter = input()
            if not chapter.isnumeric() or int(chapter) < 1 or int(chapter) > len(chapters):
                print("Try again, you gibbering dullard.")
                continue
            break
        plot_dispersion_sentiments(int(chapter) - 1)
        chosen.remove(plot_dispersion_sentiments)
    for function in chosen:
        function()
    plt.show()
'''
Part V - Testing Section (This part's just to make it easier to debug the plotting functions)
'''
'''
plot_words_sents_per_chapter()
plot_norm_words_sents_per_chapter()
plot_words_per_sentence()
plot_word_frequencies()
plot_chapter_word_frequency(45)
plot_word_length_frequency()
plot_avg_word_length()
plot_dispersion_keywords(keywords)
for i in range(122):
plot_dispersion_sentiments(i)
plt.show()
'''