|  Name | Student ID | Section Contributed | Section Edited | Other Contributions |
| --- | --- | --- | --- | --- |
| Dueck, Ellie | 301462367 | 
| Flett, Iain | 301581520 | 

In [94]:
import os
import nltk
import numpy
import re
import matplotlib
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import FreqDist
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")

In [23]:
def get_text_info(text):
    """
    Counts the tokens and types of a text using NLTK's word tokenizer.

    Lexical diversity is not computed here; only the two counts it is
    derived from (types and tokens) are returned.

    Args:
        text (str): a string containing the file or text

    Returns:
        dict: 'tokens' (total number of tokens) and 'types' (number of
        distinct tokens; the comparison is case-sensitive)
    """
    words = nltk.word_tokenize(text)
    # a set keeps one copy of each distinct token, so its size is the type count
    return dict(tokens=len(words), types=len(set(words)))
def process_dir(path):
    """
    Reads every '.txt' file in a directory and processes it with 'get_text_info'.

    Files are visited in sorted name order so that the returned dictionary
    (and anything built from it) has the same ordering on every run;
    os.listdir on its own returns names in arbitrary order.

    Args:
        path (str): path to the directory where the files are

    Returns:
        dict: file names as keys and the dictionary returned by
        'get_text_info' for that file as values. Empty if the directory
        contains no '.txt' files.
    """
    file_info = {}

    for filename in sorted(os.listdir(path)):
        # anything that is not a .txt file is ignored
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(path, filename)
        with open(file_path, 'r', encoding="utf-8") as f:
            text = f.read()
        # the file is already closed here; processing does not need the handle
        file_info[filename] = get_text_info(text)
    return file_info

In [25]:
# Directory holding the .txt files to analyse (relative to the current working directory)
path = './data'

# Per-file statistics as returned by process_dir, keyed by file name
filesInfo = process_dir(path)


In [27]:
# One row per file: orient='index' makes the outer dict keys (file names) the row index
# and the inner dict keys the columns
df = pd.DataFrame.from_dict(filesInfo, orient='index')
df

Unnamed: 0,tokens,types
Meditations_Marcus_Aurelius.txt,81803,6602
StarTrekII.txt,22065,3673
Winnie_the_Pooh_AA_Milne.txt,30602,2650


In [29]:
# Lexical diversity: number of distinct tokens (types) divided by total tokens, per file
df['lex_div'] = df['types'].div(df['tokens'])
df

Unnamed: 0,tokens,types,lex_div
Meditations_Marcus_Aurelius.txt,81803,6602,0.080706
StarTrekII.txt,22065,3673,0.166463
Winnie_the_Pooh_AA_Milne.txt,30602,2650,0.086596


The top 10 most frequent words and their counts:

In [84]:
def text_cleaner(text):
    """
    Strips selected punctuation characters from a string.

    Removed: comma, period, straight and curly double quotes, asterisk,
    parentheses, hyphen, exclamation mark and question mark. Apostrophes
    and every other character are left untouched.

    Args:
        text (str): the text to clean

    Returns:
        str: the text with those characters deleted
    """
    unwanted = re.compile(r'[\,\.\"\”\“\*\)\(\-\!\?]')
    return unwanted.sub('', text)

path = './data'
# Ten most common (word, count) pairs for each file, keyed by file name
word_freq = {}
# Reads and processes files to be cleaned and counted.
# sorted() is used because os.listdir returns names in arbitrary order; sorting
# keeps the results in the same order on every run.
for filename in sorted(os.listdir(path)):
    if filename.endswith(".txt"):
        file_path = os.path.join(path, filename)
        with open(file_path, 'r', encoding="utf-8") as f:
            text = text_cleaner(f.read())
        # the file is closed before the slower tokenising and counting steps
        tokens = word_tokenize(text)
        word_freq[filename] = FreqDist(tokens).most_common(10)

In [92]:
# One entry per file, each preceded by a blank line: "<file name>: [(word, count), ...]"
print(*(f"\n{filename}: {words}" for filename, words in word_freq.items()), sep="\n")


Meditations_Marcus_Aurelius.txt: [('and', 3049), ('the', 2400), ('of', 2281), ('to', 1916), ('that', 1882), ('is', 1394), ('it', 1118), ('in', 1060), ('a', 993), ('be', 963)]

StarTrekII.txt: [('the', 564), ('to', 354), ('KIRK', 271), ('and', 263), ('a', 233), ('of', 227), ('I', 206), ('is', 203), ('you', 191), ("'s", 172)]

Winnie_the_Pooh_AA_Milne.txt: [('and', 815), ('the', 678), ('he', 577), ('to', 555), ('said', 539), ('a', 509), ('it', 480), ('I', 453), ('of', 393), ('Pooh', 351)]


Named Entities:

In [99]:
# Parse every .txt file in `path` with spaCy and keep each parsed document,
# keyed by file name. Rebinding a single `doc` variable on every iteration
# would discard all but the last file's parse after paying for each one.
docs = {}
for filename in sorted(os.listdir(path)):  # sorted: os.listdir order is arbitrary
    if filename.endswith(".txt"):
        file_path = os.path.join(path, filename)
        with open(file_path, 'r', encoding="utf-8") as f:
            text = f.read()
        docs[filename] = nlp(text)
        # `doc` stays bound to the most recently parsed file so that the
        # entity-extraction cell, which reads `doc`, keeps working unchanged.
        doc = docs[filename]

In [107]:
# Collect every entity spaCy found in `doc` as an (entity text, entity label) pair.
# Note: `doc` is whichever document the preceding cell assigned last.
named_ents = [(ent.text, ent.label_) for ent in doc.ents]

print(named_ents)

[('A. A. MILNE', 'PERSON'), ('JUVENILES', 'PERSON'), ('EDWARD\n\n    NEWTON', 'ORG'), ('Fourteen', 'CARDINAL'), ('A. A. Milne', 'PERSON'), ('H. Fraser-Simson', 'PERSON'), ('A. A. Milne', 'PERSON'), ('H. Fraser-Simson', 'PERSON'), ('The Red House Mystery\n\n\n\n\n\n\n\n\n\n                            WINNIE-THE-POOH\n\n                            BY A. A. MILNE\n\n\n\n                      McCLELLAND & STEWART', 'ORG'), ('LTD', 'ORG'), ('TORONTO', 'GPE'), ('Copyright', 'GPE'), ('Canada', 'GPE'), ('1926', 'DATE'), ('McClelland & Stewart', 'ORG'), ('Limited\n\n                          Publishers', 'ORG'), ('Toronto', 'GPE'), ('October,', 'DATE'), ('1926', 'DATE'), ('Second', 'ORDINAL'), ('July, 1927', 'DATE'), ('Third', 'ORDINAL'), ('December, 1928', 'DATE'), ('Fourth', 'ORDINAL'), ('December, 1929\n\n                     ', 'DATE'), ('Fifth', 'ORDINAL'), ('March', 'DATE'), ('1931', 'DATE'), ('Canada', 'GPE'), ("YOU'RE SURPRISED", 'PERSON'), ('Christopher Robin', 'PERSON'), ('Christopher

In [111]:
# create a df for the entities, from the list above 
df_ents = pd.DataFrame(named_ents)
# name the columns
df_ents.columns = ['Entity', 'Label']
# print
df_ents

Unnamed: 0,Entity,Label
0,A. A. MILNE,PERSON
1,JUVENILES,PERSON
2,EDWARD\n\n NEWTON,ORG
3,Fourteen,CARDINAL
4,A. A. Milne,PERSON
...,...,...
1648,eBooks,ORG
1649,Project Gutenberg,ORG
1650,the Project Gutenberg Literary\n\nArchive Foun...,ORG
1651,eBooks,ORG
