In [41]:
import os
import nltk
import re
import numpy
import matplotlib
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import FreqDist
import pandas as pd

In [43]:
def get_text_info(text):
    """
    Uses NLTK to calculate: tokens, types, lexical diversity
    
    Args:
        text (str): a string containing the file or text
        
    Returns: 
        dict: a dictionary containing tokens, types, and lexical diversity
    """
    tokens = nltk.word_tokenize(text)
    n_tokens = len(tokens)
    n_types = len(set(tokens))
    return {
            'tokens': n_tokens,
            'types': n_types,
        }
def process_dir(path):
    """
    Reads all the files in a directory. Processes them using the 'get_text_info' function
    
    Args: 
        path (str): path to the directory where the files are
        
    Returns:
        dict: a dictionary with file names as keys and the tokens, types, lexical diversity, as values
    
    """
    file_info = {}

    for filename in os.listdir(path):
        if filename.endswith(".txt"):    
            file_path = os.path.join(path, filename)      
            with open(file_path, 'r', encoding="utf-8") as f:
                text = f.read()
                file_info[filename] = get_text_info(text)
    return file_info

In [45]:
path = './data'

filesInfo = process_dir(path)


In [47]:
df = pd.DataFrame.from_dict(filesInfo, orient='index')
df

Unnamed: 0,tokens,types
Meditations_Marcus_Aurelius.txt,81803,6602
StarTrekII.txt,22065,3673
Winnie_the_Pooh_AA_Milne.txt,30602,2650


In [49]:
df['lex_div'] = df['types']/df['tokens']
df

Unnamed: 0,tokens,types,lex_div
Meditations_Marcus_Aurelius.txt,81803,6602,0.080706
StarTrekII.txt,22065,3673,0.166463
Winnie_the_Pooh_AA_Milne.txt,30602,2650,0.086596


In [63]:
def text_cleaner(filename):
    path = './data'
    file_path = os.path.join(path, filename)

    # open one file at a time, to read it, and with utf encoding
    with open(file_path, 'r', encoding="utf-8") as f:   
        text = f.read()
                
    text_clean = re.sub(r'[\,\.\"\”\“\*]', '', text,)
    return text_clean

In [65]:
text_clean = text_cleaner(filename)
text_tokens=word_tokenize(text_clean)
text_tokens

['THE',
 'PROJECT',
 'GUTENBERG',
 'EBOOK',
 'WINNIE-THE-POOH',
 'WINNIE-THE-POOH',
 '_BY',
 'A',
 'A',
 'MILNE_',
 '_JUVENILES_',
 'When',
 'We',
 'Were',
 'Very',
 'Young',
 '_The',
 'best',
 'book',
 'of',
 'verses',
 'for',
 'children_',
 '_ever',
 'written_',
 '--',
 'A',
 'EDWARD',
 'NEWTON',
 'in',
 '_The',
 'Atlantic',
 'Monthly_',
 'Fourteen',
 'Songs',
 'from',
 'When',
 'We',
 'Were',
 'Very',
 'Young',
 'Words',
 'by',
 'A',
 'A',
 'Milne',
 'Music',
 'by',
 'H',
 'Fraser-Simson',
 'Decorations',
 'by',
 'E',
 'H',
 'Shepard',
 'The',
 'King',
 "'s",
 'Breakfast',
 'Words',
 'by',
 'A',
 'A',
 'Milne',
 'Music',
 'by',
 'H',
 'Fraser-Simson',
 'Decorations',
 'by',
 'E',
 'H',
 'Shepard',
 '_ESSAYS_',
 'Not',
 'That',
 'It',
 'Matters',
 'The',
 'Sunny',
 'Side',
 'If',
 'I',
 'May',
 '_MYSTERY',
 'STORY_',
 'The',
 'Red',
 'House',
 'Mystery',
 'WINNIE-THE-POOH',
 'BY',
 'A',
 'A',
 'MILNE',
 'McCLELLAND',
 '&',
 'STEWART',
 'LTD',
 'PUBLISHERS',
 '-',
 '-',
 'TORONTO',
 '

In [67]:
freq_dist = FreqDist(text_tokens)
freq_dist.most_common(10)

[('and', 829),
 ('the', 678),
 ('he', 578),
 ('to', 555),
 ('said', 540),
 ('a', 517),
 ('it', 483),
 ('I', 459),
 ('of', 396),
 ('Pooh', 351)]