# Assignment 1
#### Extracting linguistic features using ```spaCy```

In [3]:
# import modules 
import spacy
import pandas as pd 
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Loop over each text file in the folder called ```in```

In [4]:
# get current working directory 
os.getcwd()

'/work/CDS-language/CDS-language/Assignments/Assignment1_lang/src'

In [5]:
datapath = os.path.join(
    "..", 
    "in",
    "USEcorpus")

dirs = sorted(os.listdir(datapath))

corpus_texts = [] # make empty list to append the texts 

# loop through the paths 
for directory in dirs: 
    subfolder = os.path.join(datapath,directory) # path.join instead of "datapath + "/" + directory"
    filenames = sorted(os.listdir(subfolder))
    for text_file in filenames:
        path = os.path.join(subfolder,text_file)
        print(path)
        # open it and read it 
        with open(path, encoding="latin-1") as f:
            text = f.read()
            corpus_texts.append({'text': text, 'folder': directory, 'filename': text_file}) # use dictionary, so it's easier to convert to df with the folder and file name


# Convert the list of dictionaries to a pandas DataFrame
corpus_df = pd.DataFrame(corpus_texts)

../in/USEcorpus/a1/0100.a1.txt
../in/USEcorpus/a1/0101.a1.txt
../in/USEcorpus/a1/0102.a1.txt
../in/USEcorpus/a1/0103.a1.txt
../in/USEcorpus/a1/0104.a1.txt
../in/USEcorpus/a1/0105.a1.txt
../in/USEcorpus/a1/0106.a1.txt
../in/USEcorpus/a1/0107.a1.txt
../in/USEcorpus/a1/0108.a1.txt
../in/USEcorpus/a1/0109.a1.txt
../in/USEcorpus/a1/0110.a1.txt
../in/USEcorpus/a1/0111.a1.txt
../in/USEcorpus/a1/0112.a1.txt
../in/USEcorpus/a1/0113.a1.txt
../in/USEcorpus/a1/0114.a1.txt
../in/USEcorpus/a1/0115.a1.txt
../in/USEcorpus/a1/0116.a1.txt
../in/USEcorpus/a1/0118.a1.txt
../in/USEcorpus/a1/0119.a1.txt
../in/USEcorpus/a1/0120.a1.txt
../in/USEcorpus/a1/0121.a1.txt
../in/USEcorpus/a1/0122.a1.txt
../in/USEcorpus/a1/0123.a1.txt
../in/USEcorpus/a1/0124.a1.txt
../in/USEcorpus/a1/0125.a1.txt
../in/USEcorpus/a1/0127.a1.txt
../in/USEcorpus/a1/0128.a1.txt
../in/USEcorpus/a1/0130.a1.txt
../in/USEcorpus/a1/0131.a1.txt
../in/USEcorpus/a1/0133.a1.txt
../in/USEcorpus/a1/0134.a1.txt
../in/USEcorpus/a1/0135.a1.txt
../in/US

##### ```Preprocessing```
"The text files contain some extra information that such as document ID and other metadata that occurs between pointed brackets ```<>```. Make sure to remove these as part of your preprocessing steps!"

In [6]:
# looking at the dataframe it seems the text has some spaces enoced as "\n". We want to replace these with spaces. 
# we also want to get rid of the doc.id and title between the brackets 

# make function to clean the text
def clean_text_column(df, column_name):
    """
    This function removes extra white spaces "\n" and characters between < and >, and trailing whitespaces (str.strip) 
    """
    df[column_name] = df[column_name].str.replace('\s+', ' ', regex=True)\
                                    .str.replace('<.*?>', '', regex=True)\
                                    .str.strip() 
    return df

corpus_df = clean_text_column(corpus_df, 'text')
corpus_df

Unnamed: 0,text,folder,filename
0,Introduction In this essay I am going to evalu...,a1,0100.a1.txt
1,"Ever since I first started studying English, a...",a1,0101.a1.txt
2,1. Introduction This essay is going to give yo...,a1,0102.a1.txt
3,It all started in fourth grade. I was ten year...,a1,0103.a1.txt
4,"Hey dude, what's up! That's what it can sound ...",a1,0104.a1.txt
...,...,...,...
1492,HEATHCLIFF - A HEARTBROKEN SAVAGE It is certa...,c1,0200.c1.txt
1493,The Elements of Violence and Passion in Emily ...,c1,0219.c1.txt
1494,Heathcliff - Victim of Passion and Environment...,c1,0238.c1.txt
1495,Heathcliff- What or Who is He? Introduction I...,c1,0501.c1.txt


#### Get relative frequency of ```Nouns, Verbs, Adjective, and Adverbs``` per 10,000 words

In [7]:
# loading the spacy model
# define pipeline
nlp = spacy.load("en_core_web_md")

# make a list of docs 
doc_list = [nlp(text) for text in corpus_df['text']]

type(doc_list[0])


spacy.tokens.doc.Doc

In [8]:
# see some attributes 
import pandas as pd

def get_relative_frequencies(doc_list):
    tokens_label = []
    desired_pos = ["NOUN", "ADV", "ADJ", "VERB"]

    for docs in doc_list:
        for token in docs: 
            if token.pos_ in desired_pos: # only take NOUN, ADV, ADj and VERB
                # get text and label 
                tokens_label.append([token.text, token.pos_]) 

    # create a dataframe from the list of lists using pandas - with column names 
    df = pd.DataFrame(tokens_label, columns=["text", "pos"]) 

    # get relative frequencies per 10,000 words
    counts = df["pos"].value_counts()
    relative_freqs_per_10000 = (counts / len(df)) * 10000

    return relative_freqs_per_10000


get_relative_frequencies(doc_list)

pos
NOUN    4215.332938
VERB    2800.830170
ADJ     1746.038228
ADV     1237.798664
Name: count, dtype: float64

####   Get total number of *unique* PER, LOC, ORGS

In [9]:
# NER - named entity recognition

def unique_ents(doc_list):
# Initialize dictionaries to store counts for each label
    counts_per_label = {"PERSON": 0, "LOC": 0, "ORG": 0}

# docs is a list of spaCy Doc objects
    for doc in doc_list:
        for entity in doc.ents:
            if entity.label_ in counts_per_label:
            # Increment the count for each label
                counts_per_label[entity.label_] += 1

# Print the counts for each label
    for label, count in counts_per_label.items():
        print(f"Number of unique occurrences for {label}: {count}")


unique_ents(doc_list)

Number of unique occurrences for PERSON: 11761
Number of unique occurrences for LOC: 803
Number of unique occurrences for ORG: 4651


### For each sub-folder (a1, a2, a3, ...) save a table which shows the following information:
| Filename  | RelFreq NOUN | RelFreq VERB | RelFreq ADJ | RelFreq ADV | Unique PER | Unique LOC | Unique ORG |
|-----------|--------------|--------------|-------------|-------------|------------|------------|------------|
| file1.txt | ---          | ---          | ---         | ---         | ---        | ---        | ---        |
| file2.txt | ---          | ---          | ---         | ---         | ---        | ---        | ---        |
| etc       | ---          | ---          | ---         | ---         | ---        | ---        | ---        |


In [14]:
doc_list[0]

Introduction In this essay I am going to evaluate my ability to use the English language. I am going to assess my strengths and weaknesses in the four skills of listening, reading, speaking and writing. Eight years ago I moved to the US and I stayed there for two years. The evaluation of my English is based on how competent I feel today, at this point. I must honestly say that I have lost a lot of my confidence in the English language since my days in the US and that includes all four skills more or less. The four skills Listening, is the one of the four skills that I feel most confident in, as we are being exposed to it almost everyday, especially through television. I feel that I understand most of what's being said unless the vocabulary include to much technical terms, is too academic, or if it's spoken with a lot of dialect. For me it's easier to understand American English compere to Brittish English for obvious reasons. Some times it can be a little bit confusing when, the above,

      PERSON  LOC  ORG
0          0    0    0
1          1    0    0
2          1    0    0
3          1    0    1
4          0    1    2
...      ...  ...  ...
1492      80    0   12
1493      73    0    7
1494      63    0    4
1495      54    0    6
1496      38    0   26

[1497 rows x 3 columns]


In [11]:

def get_relative_frequencies(doc_list, per_doc=True):
    # Initialize a list to store the relative frequencies for each document
    relative_freqs_per_doc = []

    desired_pos = ["NOUN", "ADV", "ADJ", "VERB"]

    total_counts = pd.Series()  # To store total counts across all documents

    for docs in doc_list:
        tokens_label = []
        
        for token in docs: 
            if token.pos_ in desired_pos:
                # only take NOUN, ADV, ADJ, and VERB
                # get text and label 
                tokens_label.append([token.text, token.pos_]) 

        # create a dataframe from the list of lists using pandas - with column names 
        df = pd.DataFrame(tokens_label, columns=["text", "pos"]) 

        # get relative frequencies per 10,000 words for this document
        counts = df["pos"].value_counts()
        relative_freqs = (counts / len(df)) * 10000

        # Append the relative frequencies to the list
        relative_freqs_per_doc.append(relative_freqs)

        # Add counts for this document to the total counts
        total_counts = total_counts.add(counts, fill_value=0)

    # Calculate total relative frequencies
    total_relative_freqs = (total_counts / total_counts.sum()) * 10000

    if per_doc:
        # Print relative frequencies per document
        for doc_index, freqs in enumerate(relative_freqs_per_doc, start=1):
            print(f"Relative frequencies for Document {doc_index}:")
            print(freqs)
            print()
    else:
        # Print total relative frequencies
        print("Total Relative Frequencies:")
        print(total_relative_freqs)
        print()

# Example usage:
# Assuming you have a list of spaCy doc objects named doc_list
get_relative_frequencies(doc_list, per_doc=True)  # To print relative frequencies per document
#get_relative_frequencies(doc_list, per_doc=False)  # To print total relative frequencies



Relative frequencies for Document 1:
pos
NOUN    3676.975945
VERB    3024.054983
ADJ     1993.127148
ADV     1305.841924
Name: count, dtype: float64

Relative frequencies for Document 2:
pos
VERB    3235.294118
NOUN    3039.215686
ADV     2189.542484
ADJ     1535.947712
Name: count, dtype: float64

Relative frequencies for Document 3:
pos
NOUN    3850.931677
VERB    3136.645963
ADJ     1770.186335
ADV     1242.236025
Name: count, dtype: float64

Relative frequencies for Document 4:
pos
VERB    3750.000000
NOUN    3018.292683
ADJ     1646.341463
ADV     1585.365854
Name: count, dtype: float64

Relative frequencies for Document 5:
pos
NOUN    3508.196721
VERB    3180.327869
ADV     1803.278689
ADJ     1508.196721
Name: count, dtype: float64

Relative frequencies for Document 6:
pos
VERB    3575.949367
NOUN    3196.202532
ADV     1645.569620
ADJ     1582.278481
Name: count, dtype: float64

Relative frequencies for Document 7:
pos
NOUN    3387.533875
VERB    2845.528455
ADV     1924.119241

In [12]:
df

NameError: name 'df' is not defined

In [None]:
# slice data into individual columns and rows
# series = a single column
df["pos"].value_counts() # count how many times each value appears in this column