In [2]:
#Import Spacy
import spacy
import pandas as pd
from collections import Counter

In [3]:
# Load the small English spaCy pipeline (only the small model is loaded here).
nlp = spacy.load("en_core_web_sm")

In [87]:
# Paragraph from "The Ugly Duckling" in English (txt) and in Danish (txt_da),
# plus a one-word text (txt_dog) used as a minimal input.
txt = "It was so beautiful out on the country, it was summer- the wheat fields were golden, the oats were green, and down among the green meadows the hay was stacked. There the stork minced about on his red legs, clacking away in Egyptian, which was the language his mother had taught him. Round about the field and meadow lands rose vast forests, in which deep lakes lay hidden. Yes, it was indeed lovely out there in the country."
txt_da = "Der var så dejligt ude på landet; det var sommer, kornet stod gult, havren grøn, høet var rejst i stakke nede i de grønne enge, og der gik storken på sine lange, røde ben og snakkede ægyptisk, for det sprog havde han lært af sin moder. Rundt om ager og eng var der store skove, og midt i skovene dybe søer; jo, der var rigtignok dejligt derude på landet!"
txt_dog = "dog"

In [27]:
#with open('ugly_duckling.txt') as f:
 #   lines = f.readlines()

In [88]:
# Run the spaCy pipeline on each text to get one parsed document per text.
# NOTE(review): txt_da is Danish but is parsed with the English pipeline held in
# `nlp`, so its POS tags and lemmas are probably unreliable — consider a Danish pipeline.
doc = nlp(txt)
doc_da = nlp(txt_da)
doc_dog = nlp(txt_dog)

In [33]:
def filter_pos(doc):
    """
    Return the lemmas of the tokens in `doc` that belong to selected word classes.

    Only tokens whose `pos_` is AUX, ADJ, NOUN or VERB are kept; for each of them
    the `lemma_` is collected, in document order (duplicates are preserved).
    """
    # Word classes to keep
    wanted_classes = ("AUX", "ADJ", "NOUN", "VERB")

    return [tok.lemma_ for tok in doc if tok.pos_ in wanted_classes]

In [34]:
# Lemmas of the AUX/ADJ/NOUN/VERB tokens in the English paragraph
filtred = filter_pos(doc)
print(filtred)

['be', 'beautiful', 'country', 'be', 'wheat', 'field', 'be', 'golden', 'oat', 'be', 'green', 'green', 'meadow', 'hay', 'be', 'stack', 'stork', 'mince', 'red', 'leg', 'clack', 'be', 'language', 'mother', 'have', 'teach', 'field', 'meadow', 'land', 'rise', 'vast', 'forest', 'deep', 'lake', 'lie', 'hidden', 'be', 'lovely', 'country']


In [36]:
# Frida's alternative to the approach further below:
# count how many tokens in doc carry each part-of-speech (POS) tag.
pos_counts = Counter([token.pos_ for token in doc])
# Counter maps each distinct element to the number of times it occurs.

# Relative frequency of each POS tag: its count divided by the number of tokens in doc.
pos_list = [(pos, count/len(doc)) for pos, count in pos_counts.items()]
# pos_list is a list of (tag, frequency) tuples; dict(pos_list) converts it to a dictionary.


<class 'dict'>
{'PRON': 0.06741573033707865, 'AUX': 0.06741573033707865, 'ADV': 0.0898876404494382, 'ADJ': 0.10112359550561797, 'ADP': 0.10112359550561797, 'DET': 0.12359550561797752, 'NOUN': 0.1797752808988764, 'PUNCT': 0.12359550561797752, 'X': 0.011235955056179775, 'VERB': 0.0898876404494382, 'CCONJ': 0.02247191011235955, 'PROPN': 0.011235955056179775, 'INTJ': 0.011235955056179775}


In [40]:
#preparing a function for class 4 in the same format
def term_freq(tokens) -> dict:
    """
    Return the relative frequency of each part-of-speech tag among `tokens`.

    Parameters
    ----------
    tokens : a sized iterable (e.g. a spaCy Doc or a list) of token objects
        exposing a `pos_` attribute.

    Returns
    -------
    dict mapping each POS tag to (number of tokens with that tag) / len(tokens).
    An empty input yields an empty dictionary.
    """
    # Use the argument itself; the previous version read the notebook-global
    # `doc`, so the result never depended on what was passed in.
    n_tokens = len(tokens)

    # Count how many tokens carry each POS tag
    pos_counts = Counter(token.pos_ for token in tokens)

    # Divide every count by the total number of tokens to get a frequency
    return {pos: count / n_tokens for pos, count in pos_counts.items()}

In [41]:
# Test term_freq on the parsed English paragraph
term_freq(doc)

{'PRON': 0.06741573033707865,
 'AUX': 0.06741573033707865,
 'ADV': 0.0898876404494382,
 'ADJ': 0.10112359550561797,
 'ADP': 0.10112359550561797,
 'DET': 0.12359550561797752,
 'NOUN': 0.1797752808988764,
 'PUNCT': 0.12359550561797752,
 'X': 0.011235955056179775,
 'VERB': 0.0898876404494382,
 'CCONJ': 0.02247191011235955,
 'PROPN': 0.011235955056179775,
 'INTJ': 0.011235955056179775}

In [14]:

def calc_ratio(doc):
    """
    Compute the percentage of verbs, nouns and adjectives in a document.

    Parameters
    ----------
    doc : a sized iterable (e.g. a spaCy Doc or a list) of token objects
        exposing a `pos_` attribute.

    Returns
    -------
    pandas.DataFrame with a single row and the columns "Ratio of Verbs",
    "Ratio of Nouns" and "Ratio of Adjectives". Each value is the share of
    tokens in that word class as a percentage of all tokens, rounded to two
    decimals. Verbs cover both AUX and VERB tags. For an empty document all
    three ratios are 0.0.
    """
    # Total number of tokens in doc
    n_tokens = len(doc)

    # Tally every POS tag in a single pass instead of one loop per word class
    pos_counts = Counter(token.pos_ for token in doc)

    def _percentage(*tags):
        # Share (in percent) of tokens whose tag is one of `tags`
        if n_tokens == 0:
            # Nothing to divide by: report 0.0 rather than raising ZeroDivisionError
            return 0.0
        n_matching = sum(pos_counts[tag] for tag in tags)
        return round(n_matching / n_tokens * 100, 2)

    # One-row dataframe with the three ratios
    data = {"Ratio of Verbs": [_percentage("AUX", "VERB")],
            "Ratio of Nouns": [_percentage("NOUN")],
            "Ratio of Adjectives": [_percentage("ADJ")]}

    return pd.DataFrame(data)

In [15]:
# Verb / noun / adjective percentages for the parsed English paragraph
calc_ratio(doc)

Unnamed: 0,Ratio of Verbs,Ratio of Nouns,Ratio of Adjectives
0,15.73,17.98,10.11


In [89]:
# Input for doc_freq below: a list holding the three parsed documents
# (English paragraph, Danish paragraph, and the one-word text).
doc_EN_DA_dog = [doc, doc_da, doc_dog]
print(type(doc_EN_DA_dog))
print(doc_EN_DA_dog)

<class 'list'>
[It was so beautiful out on the country, it was summer- the wheat fields were golden, the oats were green, and down among the green meadows the hay was stacked. There the stork minced about on his red legs, clacking away in Egyptian, which was the language his mother had taught him. Round about the field and meadow lands rose vast forests, in which deep lakes lay hidden. Yes, it was indeed lovely out there in the country., Der var så dejligt ude på landet; det var sommer, kornet stod gult, havren grøn, høet var rejst i stakke nede i de grønne enge, og der gik storken på sine lange, røde ben og snakkede ægyptisk, for det sprog havde han lært af sin moder. Rundt om ager og eng var der store skove, og midt i skovene dybe søer; jo, der var rigtignok dejligt derude på landet!, dog]


In [84]:
# Preparing another function for class 4 in line with this

#doc_freq(t) = occurrence of t in document

def doc_freq(doc_lst) -> dict:
    """
    Sum part-of-speech tag counts over several documents.

    Takes a list of documents, each an iterable of token objects exposing a
    `pos_` attribute, and returns a Counter mapping every POS tag to the total
    number of tokens carrying it across all documents, e.g. {"NOUN": 39, "VERB": 13, ...}.
    """
    # Running totals over all documents; Counter.update adds counts to existing ones
    totals = Counter()

    for document in doc_lst:
        # Add this document's tag occurrences to the totals
        totals.update(token.pos_ for token in document)

    return totals

In [91]:
# Test doc_freq on the list of three parsed documents
doc_freq(doc_lst = doc_EN_DA_dog)

Counter({'PRON': 8,
         'AUX': 6,
         'ADV': 9,
         'ADJ': 9,
         'ADP': 13,
         'DET': 11,
         'NOUN': 39,
         'PUNCT': 24,
         'X': 3,
         'VERB': 13,
         'CCONJ': 2,
         'PROPN': 33,
         'INTJ': 1})

In [None]:
#Last

#Tokens have the attribute .head (corresponding to the parsing dependencies)
#Indices for token = token.i (subtract token.i for two words)
#Get the ABSOLUTE value (to avoid negative values)
#Use either 8 or 9 (including all words or all relations) - whether you include the word "admitted" or not