In [45]:
##Basic Frequency Analysis of a Text File
##
##Fill in the missing statements so that this script
##reads in an input text file, builds a frequency dict,
##displays the top 10 most frequent words with how many
##times they occur, and writes a tag cloud to an
##output html file

import html #used to escape words before embedding them in html output
import os #useful for exiting the program if a file can't be opened
import string #can use the constant string.punctuation to clean up words
from collections import OrderedDict



def count_words(line, freq_dict):
    """Add the words found in one line of text to a frequency dictionary.

    The line is split on whitespace.  Each token is lower-cased and has all
    leading and trailing punctuation characters (``string.punctuation``)
    removed, so ``"(Hello,"`` and ``"hello!!"`` are both counted as
    ``"hello"``.  Punctuation inside a word (``"don't"``, ``"e-mail"``) is
    kept.  Tokens made up only of punctuation are ignored.

    Args:
        line: the string of text to process.
        freq_dict: dict mapping word -> number of occurrences; it is
            updated in place.

    Returns:
        None.
    """
    for token in line.split():
        # strip() removes every punctuation character at either end, not
        # just a single trailing one.
        word = token.lower().strip(string.punctuation)
        if not word:
            # The token was pure punctuation (e.g. "-" or "..."), so there
            # is no word left to count.
            continue
        freq_dict[word] = freq_dict.get(word, 0) + 1
            
def display_top_words(freq_dict, n):
    """Print the n most frequent words of a frequency dictionary.

    One line is printed per word, in the form ``word : count``, from the
    most frequent word to the least frequent.  If the dictionary holds
    fewer than n words, all of them are printed; if n is zero or negative,
    nothing is printed.

    Args:
        freq_dict: dict mapping word -> number of occurrences.
        n: maximum number of words to display.

    Returns:
        None.
    """
    # Sort once instead of once per printed line.  sorted() is stable, so
    # words with equal counts keep the dictionary's iteration order.
    ranked = sorted(freq_dict, key=freq_dict.__getitem__, reverse=True)
    # max(n, 0) keeps a negative n from turning into a "drop the last
    # items" slice.
    for word in ranked[:max(n, 0)]:
        print(word + " :", freq_dict[word])


def add_html_headers(body_text):
    """Wrap html body text in a header and footer to make a complete page.

    The body is placed inside a fixed-width (680px), bordered, centred
    ``<div>`` which is itself wrapped in ``<html>`` tags.

    Args:
        body_text: html markup to place inside the div.

    Returns:
        The complete html document as a string.
    """
    # CSS declarations are "property: value"; the former
    # "text-align = center" was invalid and ignored by browsers.
    htm = """<html>
    <div style = "width: 680px; background-color: #aaaacc; border: 1px gray solid;
    text-align: center;">""" + body_text + "</div></html>"
    return htm

def make_html_word(word, font_size):
    """Return an html ``<span>`` that shows a word at a given font size.

    The word is html-escaped so that text containing ``<``, ``>``, ``&`` or
    quotes is displayed literally instead of being read as markup.

    Args:
        word: the text to display.
        font_size: font size in pixels; it is converted with ``str()``.

    Returns:
        The span element as a string.
    """
    htm = ('<span style="font-size:' + str(font_size) + 'px;">'
           + html.escape(word) + '</span>')
    return htm

min_font = 5 # set the minimum font size for the output html file

def make_tag_cloud(freq_dict):
    """Create an html tag cloud from a frequency dictionary.

    Every word becomes an html word (see make_html_word) whose font size
    grows with its count: a word seen once is drawn at ``min_font`` pixels
    and each additional occurrence adds one pixel.  The words are separated
    by single spaces and wrapped with add_html_headers.

    Args:
        freq_dict: dict mapping word -> number of occurrences.

    Returns:
        The html page as a string.
    """
    # Offset the count by min_font so that rare words stay legible; using
    # the raw count would render a word seen once at 1px.
    spans = [make_html_word(word, min_font + count - 1)
             for word, count in freq_dict.items()]

    # The spaces give the browser places to wrap the cloud inside the
    # fixed-width container.
    return add_html_headers(" ".join(spans))

####Example of using filedialog to prompt the user for a file
##from tkinter import filedialog
##filename = filedialog.askopenfilename()
##print(filename)  # test


## Load input file
try:
    text = open("MongoDB.txt", 'r')
except IOError:
    print("Can't open input file")
    # SystemExit lets the interpreter (or notebook kernel) shut down
    # cleanly, unlike os._exit which kills the process immediately.
    raise SystemExit(1)

## build frequency dictionary here
freq_dict = {}
# The with block closes the input file even if reading or counting fails.
with text:
    article = text.read().splitlines()
    for line in article:
        count_words(line, freq_dict)

## output top words
print("Number of distinct words:", len(freq_dict))
display_top_words(freq_dict, 10)


## create output file
# The with block flushes and closes the output file even if building the
# tag cloud raises.
with open("output.htm", "w") as out_file:
    print(make_tag_cloud(freq_dict), file=out_file)



Number of distinct words: 175
the : 15
and : 9
to : 7
has : 6
of : 6
mongodb : 5
ipo : 5
its : 5
a : 5
for : 4
