In [1]:
import os, re, string, nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Collect every file in the current directory whose name ends with ".txt".
# str.endswith is used instead of a substring test so that names such as
# "notes.txt.bak" are not picked up by mistake.
textFiles = [f for f in os.listdir('.') if f.endswith('.txt')]
rawCorpus = []
for f in textFiles:
    try:
        # The context manager closes the file even if read() raises.
        with open(f, 'rt') as file:
            print(f"     Reading from: '{f}' . . .")
            # Flatten each document onto one line: newlines become spaces.
            rawCorpus.append(file.read().replace('\n', ' '))
    except (OSError, UnicodeDecodeError) as err:
        # Say which file failed and why, then carry on with the other files.
        print(f"Error reading '{f}': {err}")

# Join the documents once, after the loop, rather than rebuilding the whole
# string on every iteration.  `corpus` is intentionally left unassigned when
# no file could be read, so code that checks for its existence still works.
if rawCorpus:
    corpus = ' '.join(rawCorpus)
    
# Show a short preview of the corpus: both ends when it is longer than
# 100 characters, otherwise the whole text.
try:
    if len(corpus) > 100:
        print(f"\n   First & last 50 characters of Corpus:\n'{corpus[:50]}' . . . '{corpus[-50:]}'")
    else:
        print(f"\n   Corpus is under 100 characters:\n'{corpus}'")
except NameError:
    # `corpus` is only assigned once at least one file has been read, so a
    # NameError here means there is no corpus.  Catching just NameError keeps
    # unrelated failures (and Ctrl-C) from being reported as a missing corpus.
    print("The corpus does not exist.")

# Split the corpus into sentences and into lower-cased alphabetic words.
try:
    sentences = sent_tokenize(corpus)
    print(f"\n   NLTK identified {len(sentences)} sentences:")
    print(f"      1st sentence: {sentences[0]}")
    # Index with [-1] rather than slicing with [-1:], so the last sentence is
    # printed as text like the first one instead of as a one-element list.
    print(f"\n      Last sentence: {sentences[-1]}")

    tokens = [w.lower() for w in word_tokenize(corpus)]
    # Strip every ASCII punctuation character from each token ...
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    punct_free_words = [re_punc.sub('', w) for w in tokens]
    # ... then keep only tokens that are purely alphabetic, which also drops
    # the empty strings left behind by tokens that were all punctuation.
    words = [word for word in punct_free_words if word.isalpha()]
    print(f"\n   NLTK identified {len(words)} words. Here are the first 10 words: {words[:10]}")

except (NameError, IndexError):
    # NameError: `corpus` was never created.
    # IndexError: the corpus produced no sentences, so sentences[0] failed.
    print("There is nothing to tokenize.")
except LookupError as err:
    # NLTK signals missing tokenizer data with LookupError; report that
    # instead of wrongly claiming the corpus is empty.  This handler must
    # come after the one above because IndexError is a LookupError subclass.
    print(f"NLTK tokenizer data is unavailable: {err}")
    
def ngrams(input, n):
    """Count every run of ``n`` consecutive items in ``input``.

    Each run is joined with single spaces to form its key, so the items must
    be strings.  The result is a plain dict mapping each joined n-gram to the
    number of times it occurs, with keys in order of first appearance.  When
    ``input`` holds fewer than ``n`` items the dict is empty.
    """
    counts = {}
    last_start = len(input) - n
    for start in range(last_start + 1):
        gram = ' '.join(input[start:start + n])
        counts[gram] = counts.get(gram, 0) + 1
    return counts

from collections import Counter
# Report the ten most frequent n-grams for n = 1..4.
try:
    # One loop replaces four copy-pasted stanzas; the printed text is the
    # same, and COUNTS still ends up holding the 4-gram counts afterwards.
    for size, label in ((1, "unigrams"), (2, "bigrams"), (3, "trigrams"), (4, "quadgrams")):
        COUNTS = Counter(ngrams(words, size))
        print(f"\n   The most common {label} are: {COUNTS.most_common(10)}")
except NameError:
    # `words` does not exist when the tokenising step did not complete.
    # Only NameError is caught so that genuine bugs are not hidden.
    print("There is an error with ngrams.")

     Reading from: 'austen-emma.txt' . . .
     Reading from: 'big.txt' . . .
     Reading from: 'tragedy-of-macbeth.txt' . . .

   First & last 50 characters of Corpus:
'[Emma by Jane Austen 1816]  VOLUME I  CHAPTER I   ' . . . '. Exeunt Omnes.   FINIS. THE TRAGEDIE OF MACBETH. '

   NLTK identified 66729 sentences:
      1st sentence: [Emma by Jane Austen 1816]  VOLUME I  CHAPTER I   Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and had lived nearly twenty-one years in the world with very little to distress or vex her.

      Last sentence: ['THE TRAGEDIE OF MACBETH.']

   NLTK identified 1281513 words. Here are the first 10 words: ['emma', 'by', 'jane', 'austen', 'volume', 'i', 'chapter', 'i', 'emma', 'woodhouse']

   The most common unigrams are: [('the', 85990), ('of', 44737), ('and', 43762), ('to', 34332), ('a', 24513), ('in', 24361), ('that', 14584), ('he', 14324), ('was', 13880