# Assignment 1 - Extracting linguistic features using spaCy

First we install and import the relevant packages:

In [6]:
import spacy
import pandas as pd
import os
import re

Then we download a particular model from spaCy to work with:

In [7]:
nlp = spacy.load("en_core_web_md")

### Solving the tasks

To work with all the texts in the folder called "in", then we have to create a "for loop".

In [8]:
# 1. Loop over each text file in the folder called in

main_folder_path = ("../in/USEcorpus") #the folder that we will be working in
sorted_dir = sorted(os.listdir(main_folder_path)) #sorting all the subfolders

for folder in sorted_dir: #creating a "for loop" to reach all the subfolders
    folder_path = os.path.join(main_folder_path, folder) 
    filenames = sorted(os.listdir(folder_path)) #sorting all the files in the different subfolders
    folder_info = [] #Define a empty list for later use
    
    for filename in filenames: #creating a new "for loop" to reach all the files in the subfolders
        filepath = folder_path + "/" + filename
        
        with open(filepath, encoding="latin-1") as f: 
            text = f.read() #opening all the files in one text
            
        pattern = r"<.*?>"
        cleaned_text = re.sub(pattern, "", text)

        doc = nlp(cleaned_text)
        
        #2.1 Extract relative frequency of Nouns, Verbs, Adjective, and Adverbs per 10,000 words
        
        #First we define four variables, where the count of the different POS are 0        
        noun_count = 0
        verb_count = 0
        adj_count = 0
        adv_count = 0
        
        #Then we create a "for loop", where for each token of the specific POS, then it'll and one to the counter.        
        for token in doc:
            if token.pos_ == "NOUN":
                noun_count += 1
            elif token.pos_ == "VERB":
                verb_count += 1
            elif token.pos_ == "ADJ":
                adj_count += 1
            elif token.pos_ == "ADV":
                adv_count += 1
                
        #Then we try to find the relative frequency of each of the POS
        relative_freq_noun = round((noun_count/len(doc)) * 10000, 2)
        relative_freq_verb = round((verb_count/len(doc)) * 10000, 2)
        relative_freq_adj = round((adj_count/len(doc)) * 10000, 2)
        relative_freq_adv = round((adv_count/len(doc)) * 10000, 2)
        
        #2.2 Extract total number of unique PER, LOC, ORGS
        
        persons = set() #First create a new set
        for ent in doc.ents:
            if ent.label_ == 'PERSON': #Find all the entities in the doc that has the label PERSON
                persons.add(ent.text) #Then add them to the set cwe created
        num_persons = len(persons) #This is how we find out how many unique PERSON the previous code has found
        
        organisations = set()
        for ent in doc.ents:
            if ent.label_ == 'ORG':
                organisations.add(ent.text)
        num_organisations = len(organisations)
        
        locations = set()
        for ent in doc.ents:
            if ent.label_ == 'LOC':
                locations.add(ent.text)
        num_locations = len(locations)
        
        # 3. For each sub-folder (a1, a2, a3, ...) save a table which shows the information                
        file_info = [filename, relative_freq_noun, relative_freq_verb, relative_freq_adj, relative_freq_adv, 
                     num_persons, num_organisations, num_locations] #
        
        folder_info.append(file_info)
        
        df = pd.DataFrame(folder_info, 
                         columns=["Filename", "RelFreq NOUN", "RelFreq VERB", "RelFreq ADJ", "RelFreq ADV", 
                                  "Unique PER", "Unique LOC", "Unique ORG"])


        outpath = os.path.join("..", "out", folder + ".csv")
        df.to_csv(outpath)
        

            
        

In [9]:
df

Unnamed: 0,Filename,RelFreq NOUN,RelFreq VERB,RelFreq ADJ,RelFreq ADV,Unique PER,Unique LOC,Unique ORG
0,0140.c1.txt,1573.58,933.55,472.89,403.59,38,5,0
1,0165.c1.txt,1742.49,816.41,580.83,284.32,27,3,0
2,0200.c1.txt,1177.65,1021.64,649.22,508.3,17,8,0
3,0219.c1.txt,1379.31,974.8,563.66,484.08,26,6,0
4,0238.c1.txt,1092.9,1163.15,398.13,288.84,18,3,0
5,0501.c1.txt,1231.93,1025.46,461.11,426.7,14,5,0
6,0502.c1.txt,1321.84,1219.67,434.23,408.68,15,5,0
