编写大型程序时，先得明白要解决的问题。这看似简单，但未能正确认识要解决的问题是极其常见的编程错误。(不要上去就做，先弄清楚要解决的问题是什么)

问题描述：计算并打印有关文本文件内容的统计数据。我们想知道给定文本文件包含多少个字符、行和单词。除单词数外，还想知道文件中出现次数最多的前10个单词，并按出现次数排列它们。

In [1]:
book = open('Are the Planets Inhabited.txt').read()
len(book)

253167

In [2]:
book.count('\n')

4671

In [3]:
len(book.split())

43345

最终代码：

In [77]:
keep = {'a','b','c','d','e','f','g',
        'h','i','j','k','l','m','n',
        'o','p','q','r','s','t','u',
        'v','w','x','y','z',' ','-',"'"}
stop_word = {'the','and','i','to','of','a','you','my',
             'that','in','at','if','which','where'}

def normalize(s):
    '''Convert s to a normalized string.'''
    # 返回保留需要的字符后的结果
    return ''.join(c for c in s.lower() if c in keep)

def make_freq_dict(s):
    '''Return a dictionary whose keys are the words of s, and whose values are the counts of those word.'''
    s = normalize(s)
    words = s.split()
    d = {}    # 单词频率字典
    for w in words:
        if w in d:
            d[w] += 1
        elif w not in stop_word: # 添加了‘忽略排除词’的功能
            d[w] = 1
    return d

def print_file_stats(fname):
    '''Print statistics for the given file.'''
    s = open(fname).read()
    num_chars = len(s)
    num_lines = s.count('\n')
    
    d1 = make_freq_dict(s)
    num_words = sum(d1[w] for w in d1) # 统计单词总数
    
    # 创建一个列表，其中元素为由出现的单词和其出现次数组成的元组
    # 并按出现次数排序
    lst = [(d1[w],w) for w in d1]
    lst.sort()
    lst.reverse()
    
    # 打印输出结果
    print("The file '%s' has: " %fname)
    print('    %s characters' % num_chars)
    print('    %s lines' % num_lines)
    print('    %s words' % num_words)
    print('    %s types of words' % len(d1))
    
    sum_char = 0
    for word in d1:
        sum_char += d1[word]*len(word)
    print('    %s average len of words' % (sum_char / num_words))
    sum_hapax = sum(d1[w] for w in d1 if d1[w] == 1)
    print('    %s hapax legomenon' % sum_hapax)
    
    print('\n The top 10 frequent words are:')
    i = 1
    for count, word in lst[:5]:
        print('%2s. %4s %s' %(i,count,word))
        i += 1
        
def print_file_stats_lines(fname):
    num_chars = 0
    num_lines = 0
    d = {}    # 单词频率字典
    
    with open(fname) as f:
        for line in f:
            num_chars += len(line)
            num_lines += 1
            line = ''.join(c for c in line.lower() if c in keep)
            words = line.split()
            for w in words:
                if w in d:
                    d[w] += 1
                elif w not in stop_word: # 添加了‘忽略排除词’的功能
                    d[w] = 1
    num_words = sum(d[w] for w in d)
    lst = [(d[w],w) for w in d]
    lst.sort()
    lst.reverse()
    
    print("The file '%s' has: " %fname)
    print('    %s characters' % num_chars)
    print('    %s lines' % num_lines)
    print('    %s words' % num_words)
    print('    %s types of words' % len(d))
    
    sum_char = 0
    for word in d:
        sum_char += d[word]*len(word)
    print('    %s average len of words' % (sum_char / num_words))
    num_hapax = sum(d[w] for w in d if d[w] == 1)
    print('    %s hapax legomenon' % num_hapax)
    
    print('\n The top 10 frequent words are:')
    i = 1
    for count, word in lst[:5]:
        print('%2s. %4s %s' %(i,count,word))
        i += 1
    
def main():
    print_file_stats('Are the Planets Inhabited.txt')
    print('#######')
    print('#######')
    print_file_stats_lines('Are the Planets Inhabited.txt')

if __name__ == '__main__':
    main()

The file 'Are we.txt' has: 
    166 characters
    7 lines
    23 words
    18 types of words
    4.6521739130434785 average len of words
    15 hapax legomenon

 The top 10 frequent words are:
 1.    4 is
 2.    2 said
 3.    2 are
 4.    1 six
 5.    1 say
#######
#######
The file 'Are we.txt' has: 
    166 characters
    8 lines
    27 words
    20 types of words
    3.7777777777777777 average len of words
    15 hapax legomenon

 The top 10 frequent words are:
 1.    4 is
 2.    2 there
 3.    2 said
 4.    2 dog
 5.    2 are
