|  Name | Student ID | Section Contributed | Section Edited | Other Contributions |
| --- | --- | --- | --- | --- |
| Dueck, Ellie | 301462367 | | | |
| Flett, Iain | 301581520 | | | |

**References**
<br>
Meditations: https://www.gutenberg.org/cache/epub/2680/pg2680.txt
<br>
Star Trek: https://www.scifiscripts.com/scripts/startrek2_wrathofkhan.txt
<br>
Winnie the Pooh: https://www.gutenberg.org/cache/epub/67098/pg67098.txt

In [49]:
import os
import nltk
import numpy
import re
import matplotlib
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import FreqDist
import pandas as pd
import spacy
# Load the spaCy `en_core_web_sm` pipeline once; the named-entity cells call nlp(text).
nlp = spacy.load("en_core_web_sm")

### **Length and Lexical Diversity:**

In [51]:
def get_text_info(text):
    """
    Count the tokens and the distinct token types in a text using NLTK.

    Types are compared exactly as the tokenizer returns them (no case
    folding). Lexical diversity is not computed here; it can be derived
    from the two counts as types / tokens.

    Args:
        text (str): a string containing the file or text

    Returns:
        dict: {'tokens': total number of tokens,
               'types': number of distinct tokens}
    """
    words = nltk.word_tokenize(text)
    return {'tokens': len(words), 'types': len(set(words))}
def process_dir(path):
    """
    Read every '.txt' file in a directory and summarise each one with
    'get_text_info'.

    Files are visited in sorted name order so that the returned dictionary
    (and any DataFrame built from it) has the same row order on every
    platform; os.listdir on its own returns names in arbitrary order.

    Args:
        path (str): path to the directory where the files are

    Returns:
        dict: file names as keys and, for each file, the result of
            'get_text_info' on its contents as the value. Empty if the
            directory holds no '.txt' files.

    Raises:
        OSError: if the directory cannot be listed or a file cannot be
            opened (e.g. FileNotFoundError for a missing directory)
        UnicodeDecodeError: if a file is not valid UTF-8
    """
    file_info = {}

    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(path, filename)
        with open(file_path, 'r', encoding="utf-8") as f:
            text = f.read()
        # The file is already closed here; only the text is needed to tokenise.
        file_info[filename] = get_text_info(text)
    return file_info

In [54]:
# Directory containing the .txt files to analyse.
path = './data'

# Per-file token/type counts, keyed by file name.
filesInfo = process_dir(path)


In [56]:
# One row per file (file name as the index); columns come from the keys of each per-file dict.
df = pd.DataFrame.from_dict(filesInfo, orient='index')
df

Unnamed: 0,tokens,types
Meditations_Marcus_Aurelius.txt,81803,6602
StarTrekII.txt,22065,3673
Winnie_the_Pooh_AA_Milne.txt,29936,2459


In [58]:
# Lexical diversity per file: number of distinct types divided by total tokens.
df['lex_div'] = df['types']/df['tokens']
df

Unnamed: 0,tokens,types,lex_div
Meditations_Marcus_Aurelius.txt,81803,6602,0.080706
StarTrekII.txt,22065,3673,0.166463
Winnie_the_Pooh_AA_Milne.txt,29936,2459,0.082142


### **The top 10 most frequent words and their counts:**

In [61]:
def text_cleaner(text):
    """Return `text` with unwanted punctuation characters deleted.

    The characters removed are: , . " ” “ * ) ( - ! ?
    Each one is dropped outright (not replaced by a space); every other
    character, including apostrophes and whitespace, is left untouched.

    Args:
        text (str): the text to clean

    Returns:
        str: the text without the characters listed above
    """
    unwanted_punct = re.compile(r'[\,\.\"\”\“\*\)\(\-\!\?]')
    return unwanted_punct.sub('', text)

path = './data'
word_freq = {}
# Read each .txt file, strip punctuation, tokenise, and keep the 10 most
# common tokens per file. sorted() makes the order of `word_freq` (and of
# anything printed from it) reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(path)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(path, filename)
    with open(file_path, 'r', encoding="utf-8") as f:
        text = text_cleaner(f.read())
    # Counting is case-sensitive: tokens are used exactly as word_tokenize returns them.
    tokens = word_tokenize(text)
    word_freq[filename] = FreqDist(tokens).most_common(10)

In [77]:
# Print one "<file name>: <top-10 list>" line per file, each preceded by a blank line.
print("\n".join(f"\n{filename}: {words}" for filename, words in word_freq.items()))


Meditations_Marcus_Aurelius.txt: [('and', 3049), ('the', 2400), ('of', 2281), ('to', 1916), ('that', 1882), ('is', 1394), ('it', 1118), ('in', 1060), ('a', 993), ('be', 963)]

StarTrekII.txt: [('the', 564), ('to', 354), ('KIRK', 271), ('and', 263), ('a', 233), ('of', 227), ('I', 206), ('is', 203), ('you', 191), ("'s", 172)]

Winnie_the_Pooh_AA_Milne.txt: [('and', 792), ('the', 652), ('he', 576), ('said', 539), ('to', 537), ('a', 502), ('it', 479), ('I', 453), ('of', 377), ('Pooh', 351)]


### **Named Entities:**

In [66]:
# Parse every .txt file in `path` with spaCy and keep one Doc per file in
# `docs`, keyed by file name. Keeping every Doc (rather than only the most
# recent one) makes the entities of each file reachable after the loop.
docs = {}
for filename in sorted(os.listdir(path)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(path, filename)
    with open(file_path, 'r', encoding="utf-8") as f:
        text = f.read()
    # `doc` ends up bound to the last file processed (the alphabetically
    # last .txt, since the listing is sorted), so code that reads `doc`
    # keeps working and always sees the same file.
    doc = nlp(text)
    docs[filename] = doc

In [67]:
# Collect every entity found in `doc` as an (entity text, entity label) pair.
named_ents = [(ent.text, ent.label_) for ent in doc.ents]

print(named_ents)

[('A. A. MILNE', 'PERSON'), ('JUVENILES', 'PERSON'), ('EDWARD\n    NEWTON', 'ORG'), ('Fourteen', 'CARDINAL'), ('A. A. Milne', 'PERSON'), ('H. Fraser-Simson', 'PERSON'), ('E. H. Shepard', 'PERSON'), ('A. A. Milne', 'PERSON'), ('H. Fraser-Simson', 'PERSON'), ('E. H. Shepard', 'PERSON'), ('The Sunny Side', 'WORK_OF_ART'), ('The Red House Mystery', 'ORG'), ('A. A. MILNE\n\n                      ', 'PERSON'), ('McCLELLAND & STEWART', 'ORG'), ('LTD', 'ORG'), ('TORONTO', 'GPE'), ('Copyright', 'GPE'), ('Canada', 'GPE'), ('1926', 'DATE'), ('McClelland & Stewart', 'ORG'), ('Limited\n                          Publishers', 'ORG'), ('Toronto', 'GPE'), ('October,', 'DATE'), ('1926', 'DATE'), ('Second', 'ORDINAL'), ('July, 1927\n                     ', 'DATE'), ('Third', 'ORDINAL'), ('December', 'DATE'), ('1928', 'DATE'), ('Fourth', 'ORDINAL'), ('December, 1929\n                     ', 'DATE'), ('Fifth', 'ORDINAL'), ('March', 'DATE'), ('1931', 'DATE'), ('Canada', 'GPE'), ("YOU'RE SURPRISED", 'PERSON'

In [68]:
# Build a two-column table from the (entity, label) pairs in `named_ents`.
# Passing the column names to the constructor, instead of assigning
# df_ents.columns afterwards, also works when `named_ents` is empty; the
# assignment would raise a length-mismatch ValueError in that case.
df_ents = pd.DataFrame(named_ents, columns=['Entity', 'Label'])
# display the table
df_ents

Unnamed: 0,Entity,Label
0,A. A. MILNE,PERSON
1,JUVENILES,PERSON
2,EDWARD\n NEWTON,ORG
3,Fourteen,CARDINAL
4,A. A. Milne,PERSON
...,...,...
1594,Kanga,GPE
1595,Roo,PERSON
1596,Eeyore,PERSON
1597,Pooh,PERSON
