# Assignment 1 - Extracting linguistic features using spaCy

First we install and import the relevant packages:

In [1]:
import spacy
import pandas as pd
import os

Then we download a particular model from spaCy to work with:

In [2]:
nlp = spacy.load("en_core_web_md")

### Solving the tasks

To work with all the texts in the folder called "in", then we have to create a "for loop".

In [23]:
# 1. Loop over each text file in the folder called in

main_folder_path = ("../in/USEcorpus") #the folder that we will be working in
sorted_dir = sorted(os.listdir(main_folder_path)) #sorting all the subfolders

for folder in sorted_dir: #creating a "for loop" to reach all the subfolders
    folder_path = os.path.join(main_folder_path, folder) 
    filenames = sorted(os.listdir(folder_path)) #sorting all the files in the different subfolders
    folder_info = [] #Define a empty list for later use
    
    for filename in filenames: #creating a new "for loop" to reach all the files in the subfolders
        filepath = folder_path + "/" + filename
        
        with open(filepath, encoding="latin-1") as f: 
            text = f.read() #opening all the files in one text
            
        doc = nlp(text) #creating a "doc" of all the files
        
        #2.1 Extract relative frequency of Nouns, Verbs, Adjective, and Adverbs per 10,000 words
        
        #First we define four variables, where the count of the different POS are 0        
        noun_count = 0
        verb_count = 0
        adj_count = 0
        adv_count = 0
        
        #Then we create a "for loop", where for each token of the specific POS, then it'll and one to the counter.        
        for token in doc:
            if token.pos_ == "NOUN":
                noun_count += 1
            elif token.pos_ == "VERB":
                verb_count += 1
            elif token.pos_ == "ADJ":
                adj_count += 1
            elif token.pos_ == "ADV":
                adv_count += 1
                
        #Then we try to find the relative frequency of each of the POS
        relative_freq_noun = (noun_count/len(doc)) * 10000
        relative_freq_verb = (verb_count/len(doc)) * 10000
        relative_freq_adj = (adj_count/len(doc)) * 10000
        relative_freq_adv = (adv_count/len(doc)) * 10000
        
        #2.2 Extract total number of unique PER, LOC, ORGS
        
        persons = set() #First create a new set
        for ent in doc.ents:
            if ent.label_ == 'PERSON': #Find all the entities in the doc that has the label PERSON
                persons.add(ent.text) #Then add them to the set cwe created
        num_persons = len(persons) #This is how we find out how many unique PERSON the previous code has found
        
        organisations = set()
        for ent in doc.ents:
            if ent.label_ == 'ORG':
                organisations.add(ent.text)
        num_organisations = len(organisations)
        
        locations = set()
        for ent in doc.ents:
            if ent.label_ == 'LOC':
                locations.add(ent.text)
        num_locations = len(locations)
        
        # 3. For each sub-folder (a1, a2, a3, ...) save a table which shows the information                
        file_info = [filename, relative_freq_noun, relative_freq_verb, relative_freq_adj, relative_freq_adv, 
                     num_persons, num_organisations, num_locations] #
        
        folder_info.append(file_info)
        
        df = pd.DataFrame(folder_info, 
                         columns=["Filename", "RelFreq NOUN", "RelFreq VERB", "RelFreq ADJ", "RelFreq ADV", 
                                  "Unique PER", "Unique LOC", "Unique ORG"])
        

            
        

In [24]:
df

Unnamed: 0,Filename,RelFreq NOUN,RelFreq VERB,RelFreq ADJ,RelFreq ADV,Unique PER,Unique LOC,Unique ORG
0,0140.c1.txt,1582.150101,920.892495,474.64503,434.077079,39,7,0
1,0165.c1.txt,1721.212121,783.838384,589.89899,307.070707,29,7,1
2,0200.c1.txt,1110.555278,1010.505253,615.307654,600.30015,23,3,1
3,0219.c1.txt,1355.263158,1013.157895,532.894737,605.263158,28,6,0
4,0238.c1.txt,1105.955143,1067.285383,386.697602,409.899459,19,3,0
5,0501.c1.txt,1208.191126,1010.238908,457.337884,525.59727,19,5,2
6,0502.c1.txt,1318.124208,1248.415716,386.565272,570.342205,17,3,3


### 2. Extract the following information:
- Relative frequency of Nouns, Verbs, Adjective, and Adverbs per 10,000 words
- Total number of unique PER, LOC, ORGS

First we define four variables, where the count of the different POS are 0

In [56]:
noun_count = 0
verb_count = 0
adj_count = 0
adv_count = 0

Then we create a "for loop", where for each token of the specific POS, then it'll and one to the counter.

In [None]:
for token in doc:
    if token.pos_ == "NOUN":
        noun_count += 1
    elif token.pos_ == "VERB":
        verb_count += 1
    elif token.pos_ == "ADJ":
        adj_count += 1
    elif token.pos_ == "ADV":
        adv_count += 1

Then we can run the different count variables, and find out how many there are in the text of each POS

In [48]:
noun_count

207

In [49]:
verb_count

191

In [50]:
adj_count

68

In [51]:
adv_count

64

Then we try to find the relative frequency of each of the POS

In [64]:
relative_freq_noun = (noun_count/len(doc)) * 10000

round(relative_freq_noun, 2)

1311.79

In [65]:
relative_freq_verb = (verb_count/len(doc)) * 10000

round(relative_freq_verb, 2)

1210.39

In [66]:
relative_freq_adj = (adj_count/len(doc)) * 10000

round(relative_freq_adj, 2)

430.93

In [67]:
relative_freq_adv = (adv_count/len(doc)) * 10000

round(relative_freq_adv, 2)

405.58

Now we try to find out how many unique PERSON, ORG and LOC there is in the texts.

We'll start with unique number of PERSONs.

In [74]:
persons = set() #First create a new set
for ent in doc.ents:
    if ent.label_ == 'PERSON': #Find all the entities in the doc that has the label PERSON
        persons.add(ent.text) #Then add them to the set cwe created

num_persons = len(persons) #This is how we find out how many unique PERSON the previous code has found

print(num_persons)

15


Then repeat for ORG and LOC.

In [70]:
organisations = set()
for ent in doc.ents:
    if ent.label_ == 'ORG':
        organisations.add(ent.text)

num_organisations = len(organisations)

print(num_organisations)

5


In [71]:
locations = set()
for ent in doc.ents:
    if ent.label_ == 'LOC':
        locations.add(ent.text)

num_locations = len(locations)

print(num_locations)

0


### 3. For each sub-folder (a1, a2, a3, ...) save a table which shows the following information:

In [72]:
annotations = []
for token in doc:
    annotations.append((token.text, token.pos_))