# Assignment 1
#### Extracting linguistic features using ```spaCy```

In [1]:

# import modules 
import spacy
import pandas as pd 
import os
import re
import string


# Load the spaCy pipeline "en_core_web_md" once; the resulting `nlp` object
# is what processing() calls to turn the cleaned text into a doc.
nlp = spacy.load("en_core_web_md")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


##### ```Preprocessing``` function

In [2]:
# Looking at the dataframe, it seems the text has some spaces encoded as "\n". We want to replace these with spaces.
# We also want to get rid of the doc id and title between the angle brackets (< and >).

# make function to clean the text
def clean_text(text):
    r"""
    Normalise a raw text so that only lowercase word characters separated by single spaces remain.

    Steps, in order:
      1. Everything between "<" and ">" on one line (e.g. the doc id and title header) is replaced by a space.
      2. Every run of non-word characters (\W+: punctuation, "\n" and other whitespace) is replaced by ONE space.
         Underscores count as word characters for \W and are therefore kept.
      3. Leading/trailing whitespace is removed (.strip) and all characters are made lowercase.

    The tag pass has to run on its own first: in a single combined pattern, \W+ starts matching at any
    whitespace or punctuation in front of a tag and swallows the "<", which leaves the tag contents in the text.

    Args:
        text (str): the raw text.

    Returns:
        str: the cleaned, lower-cased text.
    """
    # 1) remove <...> tags before anything else can consume their opening bracket
    without_tags = re.sub(r'<.*?>', ' ', text)

    # 2) + 3) collapse every run of non-word characters (including "\n") into one space, then trim and lowercase
    cleaned_text = re.sub(r'\W+', ' ', without_tags).strip().lower()

    return cleaned_text



### Get relative frequency of ```Nouns, Verbs, Adjectives, and Adverbs``` per 10,000 words

In [3]:
  
def rel_freq(doc):
    """
    Calculate the relative frequencies of nouns, adverbs, adjectives and verbs in one text.

    Each frequency is (number of tokens with that POS tag / total number of tokens in doc) * 10,000.

    Args:
        doc: a sized iterable of tokens that expose a `pos_` attribute (e.g. the doc returned by nlp()).

    Returns:
        dict: the keys "NOUN", "ADV", "ADJ" and "VERB" mapped to their relative frequency per 10,000 tokens.
        All values are 0.0 when doc contains no tokens.
    """
    # Total token count in the document (the denominator of every frequency)
    total_word_count = len(doc)

    # One counter per POS tag of interest; the key order is also the order of the returned dictionary
    counts_per_pos = {"NOUN": 0, "ADV": 0, "ADJ": 0, "VERB": 0}

    # Count occurrences of each POS
    for token in doc:
        if token.pos_ in counts_per_pos:  # only take NOUN, ADV, ADJ and VERB
            counts_per_pos[token.pos_] += 1

    # An empty document has no frequencies to scale; return zeros instead of dividing by zero
    if total_word_count == 0:
        return {pos: 0.0 for pos in counts_per_pos}

    # Scale the raw counts once, after counting has finished
    relative_frequencies_per_10000 = {
        pos: (count / total_word_count) * 10000
        for pos, count in counts_per_pos.items()
    }

    return relative_frequencies_per_10000


### Get total number of *unique* PER, LOC, and ORG entities

In [4]:

def unique_ents(doc):
    """
    Count how many different people, locations and organizations are mentioned in one text.

    Uniqueness is tracked per label: an entity text is counted once for every label it was tagged with,
    so the result does not depend on the order in which the entities appear.

    Args:
        doc: an object whose `ents` attribute is an iterable of entities exposing `label_` and `text`
            (e.g. the doc returned by nlp()).

    Returns:
        dict: the keys "PERSON", "LOC" and "ORG" mapped to the number of distinct entity texts with that label.
    """
    # One set per label of interest; a set stores every distinct entity text only once
    entities_per_label = {"PERSON": set(), "LOC": set(), "ORG": set()}

    for entity in doc.ents:
        if entity.label_ in entities_per_label:  # ignore all other entity labels
            entities_per_label[entity.label_].add(entity.text)

    # After going through all entities of the document, convert the sets to counts
    counts_per_label = {label: len(entities) for label, entities in entities_per_label.items()}

    return counts_per_label

### Loop over each text file in the folder called ```in```

In [5]:

def processing(path):
    """
    Read one text file and return its linguistic features as a single dictionary.

    The file at `path` is read with latin-1 encoding, passed through clean_text() and converted
    into a doc with nlp(). The results of rel_freq() and unique_ents() on that doc are then merged
    into one dictionary; if both contained the same key, the unique_ents() value would be kept.
    """
    # read the whole file into memory
    with open(path, encoding="latin-1") as handle:
        raw = handle.read()

    # clean first, then let the pipeline parse the cleaned text
    parsed = nlp(clean_text(raw))

    pos_features = rel_freq(parsed)
    entity_features = unique_ents(parsed)

    # merge the two feature dictionaries into one
    return {**pos_features, **entity_features}



### For each sub-folder (a1, a2, a3, ...) save a table which shows the following information:
| Filename  | RelFreq NOUN | RelFreq VERB | RelFreq ADJ | RelFreq ADV | Unique PER | Unique LOC | Unique ORG |
|-----------|--------------|--------------|-------------|-------------|------------|------------|------------|
| file1.txt | ---          | ---          | ---         | ---         | ---        | ---        | ---        |
| file2.txt | ---          | ---          | ---         | ---         | ---        | ---        | ---        |
| etc       | ---          | ---          | ---         | ---         | ---        | ---        | ---        |


In [6]:
# Show the current working directory; the relative paths defined below ("..", "in", "out") are resolved against it.
os.getcwd()

'/work/CDS-language/CDS-language/Assignments/Assignment1_lang/src'

In [7]:
# define paths (relative to the current working directory)
file_path = os.path.join(
    "..", 
    "in",
    "USEcorpus")

output_path = os.path.join(
    "..", 
    "out")

# to_csv() does not create missing folders, so make sure the output folder exists before the loop
os.makedirs(output_path, exist_ok=True)

# mapping from the raw feature keys to the column names of the saved tables (the same for every sub-folder)
column_names = {'PERSON': 'Unique PER', 
                'LOC': 'Unique LOC', 
                'ORG': 'Unique ORG', 
                'NOUN': 'RelFreq NOUN', 
                'ADV': 'RelFreq ADV', 
                'ADJ': 'RelFreq ADJ', 
                'VERB': 'RelFreq VERB'}

# only keep real sub-folders: hidden entries and stray files would break os.listdir(subfolder) below
dirs = sorted(
    entry for entry in os.listdir(file_path)
    if not entry.startswith(".") and os.path.isdir(os.path.join(file_path, entry))
)


# loop through the directories 
for directory in dirs: 
    subfolder = os.path.join(file_path, directory) # os.path.join instead of manual string concatenation with "/"

    # only keep regular, non-hidden files, so nested folders or system files are not read as texts
    filenames = sorted(
        name for name in os.listdir(subfolder)
        if not name.startswith(".") and os.path.isfile(os.path.join(subfolder, name))
    )

    corpus_texts = [] # one dictionary (= one table row) per text file

    for text_file in filenames: # loop through the files
        path = os.path.join(subfolder, text_file)
        data = processing(path)

        corpus_texts.append({'File': text_file, 'Folder': directory, **data}) # use dictionary, so it's easier to convert to df with the folder and file name

    # Convert the list of dictionaries to a pandas DataFrame
    corpus_df = pd.DataFrame(corpus_texts)
    # rename the columns
    corpus_df = corpus_df.rename(column_names, axis='columns')
    # save one table per sub-folder in the output folder
    corpus_df.to_csv(os.path.join(output_path, f"{directory}_spacy.csv"), index = False)



