## LINGUISTIC AND COGNITIVE FEATURES EXTRACTION

This notebook shows how to apply a set of functions that extract linguistic and cognitive features from speech transcripts stored in a CSV file.

In [None]:
from lexicalrichness import LexicalRichness
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import spacy
from spacy.matcher import Matcher

In [None]:
#df: DataFrame read from a CSV file of speech transcripts. The columns that
#load_files() reads from it are:

#  1 - idx: speaker ID for each recorded subject.
#  2 - names: the sentence transcripts.
#  3 - label: the class of the speaker (e.g., control (CN), Alzheimer's Disease (AD), Parkinson's Disease (PD)), kept for later analysis.
#NOTE(review): an earlier version of this comment described `names` as the
#speaker ID; the code reads speaker IDs from `idx` and transcripts from `names` -- confirm against the actual CSV.

df = pd.read_csv("/export/b14/afavaro/LexicalRichness/transcripts_data.csv")

In [None]:
def compute_lexical_diversity(transcript):
    """Compute six lexical-diversity measures for one transcript.

    The text is wrapped in a ``LexicalRichness`` object and the measures are
    read from it. Other measures (word count, RTTR, MSTTR, MTLD, HD-D, Herdan,
    Maas) are deliberately not extracted.

    Parameters
    ----------
    transcript : str
        Text to analyse.

    Returns
    -------
    tuple
        ``(unique_word_count, type_token_ratio, corrected_type_token_ratio,
        moving_average_type_token_ratio, summer_lexical_diversity_measure,
        dugast_lexical_diversity_measure)``, in that order.
    """
    richness = LexicalRichness(transcript)
    # NOTE(review): the MATTR window is fixed at 13 words; presumably the
    # transcript must contain at least that many words -- confirm in the
    # lexicalrichness documentation.
    return (
        richness.terms,
        richness.ttr,
        richness.cttr,
        richness.mattr(window_size=13),
        richness.Summer,
        richness.Dugast,
    )

In [None]:
def load_files(data):
    """Build the feature table from the raw transcript DataFrame.

    Reads the ``idx``, ``names`` and ``label`` columns of *data*, applies
    ``compute_lexical_diversity`` to every entry of ``names`` and returns a
    new DataFrame whose columns are ``speakers``, ``labels``, ``sentences``
    followed by the six lexical-diversity measures.
    """
    speaker_ids = data['idx'].tolist()
    transcripts = data['names'].tolist()
    classes = data['label'].tolist()

    metric_names = ["unique_word_count", "type_token_ratio", "corrected_type_token_ratio", "moving_average_type_token_ratio", "summer_lexical_diversity_measure", "dugast_lexical_diversity_measure"]
    # One row per transcript, one column per measure.
    metrics = np.array([compute_lexical_diversity(entry) for entry in transcripts])

    columns = {"speakers": speaker_ids, "labels": classes, "sentences": transcripts}
    # Transposing yields one array per measure, paired with its name.
    columns.update(zip(metric_names, metrics.T))

    return pd.DataFrame(columns)

In [None]:
df = load_files(df)

In [None]:
#load the Spacy model for extracting data for English: "en_core_web_sm" 
nlp = spacy.load('en_core_web_sm')

In [None]:
df['sentence'] = df['sentences'].str.lower()

In [None]:
# Customized list of stopwords used by the preprocessing step.
# The NLTK corpus reader is imported under a private alias so that binding
# the name `stopwords` to a plain list does not clobber the reader; this keeps
# the cell safe to run more than once.
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = list(_stopwords_corpus.words('english'))


def preprocess(text):
    """Lemmatise *text* and drop stopwords and non-alphabetic lemmas.

    Parameters
    ----------
    text : str
        Text to process with the module-level spaCy pipeline ``nlp``
        (the 'ner' component is disabled for this call).

    Returns
    -------
    str
        The remaining lemmas joined by single spaces.
    """
    doc = nlp(text, disable=['ner'])
    lemmas = [token.lemma_ for token in doc]
    # Keep purely alphabetic lemmas that are not in the stopword list.
    content_lemmas = [lemma for lemma in lemmas
                      if lemma.isalpha() and lemma not in stopwords]

    return ' '.join(content_lemmas)

df['Item'] = df['sentence'].apply(preprocess)

In [None]:
def count_words(string):
    """Return the number of whitespace-separated tokens in *string*."""
    return len(string.split())

#Application to the raw data to get the full word count
df['Word_Count'] = df['sentence'].apply(count_words)

#Application to the preprocessed data to get the content-word count
df['Word_Count_No_stop_words'] = df['Item'].apply(count_words)

In [None]:
def word_length(string):
    """Return the mean word length of *string*, rounded to two decimals.

    Words are the whitespace-separated tokens of *string*. Only characters
    inside the words are counted; the separating whitespace is excluded so
    that it does not inflate the average.

    Returns
    -------
    float or None
        The average word length, or None when *string* contains no words.
    """
    words = string.split()
    if not words:
        return None
    total_chars = sum(len(word) for word in words)
    return round(total_chars / len(words), 2)

df['Avg_Word_Length'] = df['Item'].apply(word_length)

In [None]:
def sentence_counter(text):
    """Return the number of sentences the spaCy pipeline ``nlp`` finds in *text*."""
    # Count the items yielded by doc.sents, one per detected sentence.
    return sum(1 for _ in nlp(text).sents)
#Note that this function is applied to the raw text in order to identify sentence boundaries
df['Sentence_Count'] = df['sentence'].apply(sentence_counter)

In [None]:
def avg_sent_length(text):
    """Return the mean sentence length of *text* in words, rounded to two decimals.

    Sentences are taken from ``doc.sents`` of the module-level spaCy pipeline
    ``nlp``; words are the whitespace-separated tokens of the raw text.

    Returns
    -------
    float
        Words per sentence, or 0.0 when no sentence is found (e.g. an empty
        transcript) instead of raising ZeroDivisionError.
    """
    sentence_total = sum(1 for _ in nlp(text).sents)
    if sentence_total == 0:
        return 0.0

    return round(len(text.split()) / sentence_total, 2)

#Note that this function is applied to the raw text in order to identify sentence boundaries
df['Avg_Sent_Length_in_Words'] = df['sentence'].apply(avg_sent_length)

In [None]:
def nouns(text, model=nlp):
    """Return the number of tokens in *text* whose ``pos_`` tag is ``NOUN``.

    *model* is called on *text* and must return an iterable of tokens with a
    ``pos_`` attribute; it defaults to the module-level ``nlp`` pipeline.
    """
    return sum(1 for token in model(text) if token.pos_ == 'NOUN')

df['Noun_Count'] = df['Item'].apply(nouns)

In [None]:
def verbs(text, model=nlp):
    """Return the number of tokens in *text* whose ``pos_`` tag is ``VERB``.

    *model* is called on *text* and must return an iterable of tokens with a
    ``pos_`` attribute; it defaults to the module-level ``nlp`` pipeline.
    """
    return sum(1 for token in model(text) if token.pos_ == 'VERB')

df['Verb_Count'] = df['Item'].apply(verbs)

In [None]:
def adjectives(text, model=nlp):
    """Return the number of tokens in *text* whose ``pos_`` tag is ``ADJ``.

    *model* is called on *text* and must return an iterable of tokens with a
    ``pos_`` attribute; it defaults to the module-level ``nlp`` pipeline.
    """
    return sum(1 for token in model(text) if token.pos_ == 'ADJ')

df['Adjective_Count'] = df['Item'].apply(adjectives)

In [None]:
def adverbs(text, model=nlp):
    """Return the number of tokens in *text* whose ``pos_`` tag is ``ADV``.

    *model* is called on *text* and must return an iterable of tokens with a
    ``pos_`` attribute; it defaults to the module-level ``nlp`` pipeline.
    """
    return sum(1 for token in model(text) if token.pos_ == 'ADV')

df['Adverb_Count'] = df['Item'].apply(adverbs)

In [None]:
def numeral(text, model=nlp):
    """Return the number of tokens in *text* whose ``pos_`` tag is ``NUM``.

    *model* is called on *text* and must return an iterable of tokens with a
    ``pos_`` attribute; it defaults to the module-level ``nlp`` pipeline.
    """
    return sum(1 for token in model(text) if token.pos_ == 'NUM')

df['Numeral_Count'] = df['sentence'].apply(numeral) #better to extract it from the original (raw) text

In [None]:
def aux(text, model=nlp):
    """Return the number of tokens in *text* whose ``pos_`` tag is ``AUX``.

    *model* is called on *text* and must return an iterable of tokens with a
    ``pos_`` attribute; it defaults to the module-level ``nlp`` pipeline.
    """
    return sum(1 for token in model(text) if token.pos_ == 'AUX')

df['Auxiliary_Count'] = df['sentence'].apply(aux) #better to extract it from the original (raw) text

In [None]:
def get_nps(text):
    """Return the number of noun chunks the spaCy pipeline ``nlp`` finds in *text*."""
    parsed = nlp(text)
    # Materialise doc.noun_chunks once and take its length.
    return len(list(parsed.noun_chunks))

df['Number_of_NPs'] = df['Item'].apply(get_nps)

In [None]:
def get_pps(text):
    """Return the number of adpositions (``ADP`` tokens) in *text*.

    Each adposition is counted as one prepositional phrase. To obtain the
    phrase text itself, join ``tok.orth_`` over ``token.subtree`` for each
    matching token; the same approach works for other parts of speech.
    """
    return sum(1 for token in nlp(text) if token.pos_ == 'ADP')

df['Number_of_PPs'] = df['Item'].apply(get_pps)

In [None]:
# Token pattern for a verb phrase: an optional verb, any number of adverbs,
# any number of auxiliaries, then one or more verbs.
pattern = [{'POS': 'VERB', 'OP': '?'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'AUX', 'OP': '*'},
           {'POS': 'VERB', 'OP': '+'}]


def get_vps(text):
    """Return the number of matches of ``pattern`` in *text*.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and a
    ``Matcher`` holding ``pattern`` is run over the resulting doc.

    NOTE(review): the Matcher appears to report every match, including
    overlapping ones, so the count can exceed the number of distinct verb
    phrases -- confirm this is the intended feature definition.
    """
    doc = nlp(text)
    matcher = Matcher(nlp.vocab)
    # Patterns are passed as a list of patterns (spaCy v3 call signature).
    matcher.add("Verb phrase", [pattern], on_match=None)
    # Only the number of matches is needed, so no spans are built.
    return len(matcher(doc))
    
df['Number_of_VPs'] = df['Item'].apply(get_vps)

In [None]:
#Lists of lowercase connectives, grouped by discourse function.
#NOTE: the categories are not mutually exclusive -- some connectives appear in
#more than one list (e.g. 'then', 'since', 'so', 'however'), and some entries
#are contained in longer ones of the same list (e.g. 'so' and 'so as').

#Connectives to instruct, recount and sequence
temporal_connectives = ['afterwards', 'once', 'at this moment', 'at this point', 'before', 'finally', 
                        'here', 'in the end', 'lastly', 'later on', 'meanwhile', 'next', 'now', 
                        'on another occasion', 'previously','since', 'soon', 'straightaway', 'then', 
                        'when', 'whenever', 'while']

#Connectives to show cause or conditions
causal_connectives = ['accordingly', 'all the same', 'an effect of', 'an outcome of', 'an upshot of',
                      'as a consequence of', 'as a result of', 'because', 'caused by', 'consequently',
                      'despite this', 'even though', 'hence', 'however', 'in that case', 'moreover',
                      'nevertheless', 'otherwise', 'so', 'so as', 'stemmed from', 'still', 'then',
                      'therefore', 'though', 'under the circumstances', 'yet']

#Connectives for giving examples or showing results
exemplifying_connectives = ['accordingly', 'as a result', 'as exemplified by', 'consequently', 'for example',
                            'for instance', 'for one thing', 'including', 'provided that', 'since', 'so',
                            'such as', 'then', 'therefore', 'these include', 'through', 'unless', 'without']

#Connectives to show similarity or add a point
additive_connectives = ['and', 'additionally', 'also', 'as well', 'even', 'furthermore', 'in addition', 'indeed',
                        'let alone', 'moreover', 'not only']

#Connectives showing a difference or an opposite point of view
contrastive_connectives = ['alternatively', 'anyway', 'but', 'by contrast', 'differs from', 'elsewhere',
                           'even so', 'however', 'in contrast', 'in fact', 'in other respects', 'in spite of this',
                           'in that respect', 'instead', 'nevertheless', 'on the contrary', 'on the other hand',
                           'rather', 'though', 'whereas', 'yet']

In [None]:
def temporal_connectives_count(text, connectives=None):
    """Count whole-word occurrences of temporal connectives in *text*.

    Parameters
    ----------
    text : str
        Transcript to search. Matching is case-sensitive, so lowercase the
        text first when the connectives are lowercase.
    connectives : iterable of str, optional
        Connectives to look for; defaults to the module-level
        ``temporal_connectives`` list.

    Returns
    -------
    int
        Number of non-overlapping matches, summed over all connectives.
    """
    if connectives is None:
        connectives = temporal_connectives

    count = 0
    for phrase in connectives:
        # Anchor on word boundaries so that e.g. 'now' is not counted inside
        # 'know'; the words of a multi-word connective may be separated by any
        # run of whitespace. Each word is escaped to be matched literally.
        pattern = r'\b' + r'\s+'.join(re.escape(word) for word in phrase.split()) + r'\b'
        count += len(re.findall(pattern, text))

    return count

#Note that we apply the function to the raw text (and remember that it is important to lowercase all words)
df['Temporal_Connectives_Count'] = df['sentence'].apply(temporal_connectives_count)
#df.head()

In [None]:
def causal_connectives_count(text, connectives=None):
    """Count whole-word occurrences of causal connectives in *text*.

    Parameters
    ----------
    text : str
        Transcript to search. Matching is case-sensitive, so lowercase the
        text first when the connectives are lowercase.
    connectives : iterable of str, optional
        Connectives to look for; defaults to the module-level
        ``causal_connectives`` list.

    Returns
    -------
    int
        Number of non-overlapping matches, summed over all connectives.
    """
    if connectives is None:
        connectives = causal_connectives

    count = 0
    for phrase in connectives:
        # Anchor on word boundaries so that e.g. 'so' is not counted inside
        # 'also'; the words of a multi-word connective may be separated by any
        # run of whitespace. Each word is escaped to be matched literally.
        pattern = r'\b' + r'\s+'.join(re.escape(word) for word in phrase.split()) + r'\b'
        count += len(re.findall(pattern, text))
    return count

df['Causal_Connectives_Count'] = df['sentence'].apply(causal_connectives_count)

In [None]:
def exemplifying_connectives_count(text, connectives=None):
    """Count whole-word occurrences of exemplifying connectives in *text*.

    Parameters
    ----------
    text : str
        Transcript to search. Matching is case-sensitive, so lowercase the
        text first when the connectives are lowercase.
    connectives : iterable of str, optional
        Connectives to look for; defaults to the module-level
        ``exemplifying_connectives`` list.

    Returns
    -------
    int
        Number of non-overlapping matches, summed over all connectives.
    """
    if connectives is None:
        connectives = exemplifying_connectives

    count = 0
    for phrase in connectives:
        # Anchor on word boundaries so that e.g. 'through' is not counted
        # inside 'throughout'; the words of a multi-word connective may be
        # separated by any run of whitespace. Each word is escaped to be
        # matched literally.
        pattern = r'\b' + r'\s+'.join(re.escape(word) for word in phrase.split()) + r'\b'
        count += len(re.findall(pattern, text))
    return count

df['Exemplifying_Connectives_Count'] = df['sentence'].apply(exemplifying_connectives_count)

In [None]:
def additive_connectives_count(text, connectives=None):
    """Count whole-word occurrences of additive connectives in *text*.

    Parameters
    ----------
    text : str
        Transcript to search. Matching is case-sensitive, so lowercase the
        text first when the connectives are lowercase.
    connectives : iterable of str, optional
        Connectives to look for; defaults to the module-level
        ``additive_connectives`` list.

    Returns
    -------
    int
        Number of non-overlapping matches, summed over all connectives.
    """
    if connectives is None:
        connectives = additive_connectives

    count = 0
    for phrase in connectives:
        # Anchor on word boundaries so that e.g. 'and' is not counted inside
        # 'hand'; the words of a multi-word connective may be separated by any
        # run of whitespace. Each word is escaped to be matched literally.
        pattern = r'\b' + r'\s+'.join(re.escape(word) for word in phrase.split()) + r'\b'
        count += len(re.findall(pattern, text))
    return count

df['Additive_Connectives_Count'] = df['sentence'].apply(additive_connectives_count)

In [None]:
def contrastive_connectives_count(text, connectives=None):
    """Count whole-word occurrences of contrastive connectives in *text*.

    Every occurrence is counted (not just the presence of each connective),
    in line with the other ``*_connectives_count`` functions.

    Parameters
    ----------
    text : str
        Transcript to search. Matching is case-sensitive, so lowercase the
        text first when the connectives are lowercase.
    connectives : iterable of str, optional
        Connectives to look for; defaults to the module-level
        ``contrastive_connectives`` list.

    Returns
    -------
    int
        Number of non-overlapping matches, summed over all connectives.
    """
    if connectives is None:
        connectives = contrastive_connectives

    count = 0
    for phrase in connectives:
        # Anchor on word boundaries so that e.g. 'but' is not counted inside
        # 'button'; the words of a multi-word connective may be separated by
        # any run of whitespace. Each word is escaped to be matched literally.
        pattern = r'\b' + r'\s+'.join(re.escape(word) for word in phrase.split()) + r'\b'
        count += len(re.findall(pattern, text))
    return count

df['Contrastive_Connectives_Count'] = df['sentence'].apply(contrastive_connectives_count)

In [None]:
# Filled-pause markers searched for in the transcripts.
filled_pause = ["uhm"]


def filled_pauses(text):
    """Return the total number of filled-pause occurrences in *text*.

    Each entry of ``filled_pause`` is used as a regular expression and all of
    its non-overlapping matches in *text* are counted.
    """
    return sum(len(re.findall(marker, text)) for marker in filled_pause)

df['Filled_Pauses'] = df['sentence'].apply(filled_pauses)

In [None]:
def uncertainty(text):
    """Return how many distinct uncertainty cues are present in *text*.

    Each cue adds at most one to the score, however often it occurs. Cues are
    matched as plain, case-sensitive substrings.

    NOTE(review): substring matching also fires inside longer words (e.g.
    'ah' in 'yeah', 'may' in 'maybe') -- confirm whether that is intended.
    """
    # The pronoun "I" was considered as a cue but is not counted.
    cues = ("?", "why", "might", "can", "may", "sure",
            "uhm", "ah", "should", "looks like")

    return sum(1 for cue in cues if cue in text)

# Apply the uncertainty() function defined above to the lowercased transcripts.
df['uncertainty'] = df['sentence'].apply(uncertainty)

In [None]:
def repetitions(text, stop_words=None):
    """Return the number of distinct non-stopword tokens used more than once.

    Parameters
    ----------
    text : str
        Transcript to analyse. Tokens are the whitespace-separated pieces of
        the text, lowercased; punctuation attached to a token is kept.
    stop_words : iterable of str, optional
        Tokens to ignore; defaults to the module-level ``stopwords`` list.

    Returns
    -------
    int
        How many different tokens (outside *stop_words*) occur at least twice.
    """
    from collections import Counter

    if stop_words is None:
        stop_words = stopwords
    # A set makes each membership test constant-time.
    excluded = set(stop_words)

    frequencies = Counter(token.lower() for token in text.split())

    return sum(1 for token, occurrences in frequencies.items()
               if occurrences > 1 and token not in excluded)

df['repetition'] = df['sentence'].apply(repetitions)

In [None]:
def informational_verb(text):
    """Return how many of the picture-description action cues occur in *text*.

    Each cue adds at most one to the score and is matched as a plain,
    case-sensitive substring, so 'reaching up' also scores for 'reaching'.
    """
    action_cues = (
        "washing", "overflowing", "hanging", "trying to help", "falling",
        "wobbling", "drying", "ignoring", "reaching", "reaching up",
        "asking for cookie", "laughing", "standing",
    )

    return sum(1 for cue in action_cues if cue in text)

df['informational_verb'] = df['sentence'].apply(informational_verb)

In [None]:
def informational_content(text):
    """Return how many of the picture-description content cues occur in *text*.

    Each cue adds at most one to the score and is matched as a plain,
    case-sensitive substring, so 'cookie jar' also scores for 'cookie'.
    """
    content_cues = (
        "mother", "sister", "cookie", "cookie jar", "curtains", "cabinet",
        "brother", "kitchen", "sink", "garden", "fall", "dishes", "stool",
    )
    score = sum(1 for cue in content_cues if cue in text)

    # The original cue was spelled "poddle", which looks like a typo for
    # "puddle"; either spelling counts once so earlier matches are kept.
    if "puddle" in text or "poddle" in text:
        score += 1

    return score

df['informational'] = df['sentence'].apply(informational_content)