## Linguistic Feature Extraction with spaCy Using the ADReSSo 2021 Dataset

- Lexical features
- Syntactic features
- Features of cohesion




# Input locations for cross-validation fold cv_6 of the ADReSSo diagnosis data.
# `path` is the listing of transcript file paths (passed to data_to_csv as
# `path_trans`); `train`, `dev` and `test` are the three split files that
# data_to_csv concatenates.
path = '/export/b15/rpapagari/Tianzi_work/ADReSSo_NoVAD_IS2021_dataset/data_ADReSSo_diagnosis_cv10_text_v7_Longformer_TrainDevTest/cv_6/utt2csvpath'
train = '/export/b15/rpapagari/Tianzi_work/ADReSSo_NoVAD_IS2021_dataset/data_ADReSSo_diagnosis_cv10_text_v7_Longformer_TrainDevTest/cv_6/train.tsv'
dev = '/export/b15/rpapagari/Tianzi_work/ADReSSo_NoVAD_IS2021_dataset/data_ADReSSo_diagnosis_cv10_text_v7_Longformer_TrainDevTest/cv_6/dev.tsv'
test = '/export/b15/rpapagari/Tianzi_work/ADReSSo_NoVAD_IS2021_dataset/data_ADReSSo_diagnosis_cv10_text_v7_Longformer_TrainDevTest/cv_6/test.tsv'


def data_to_csv(path_trans, train, dev, test):
    '''Build one table holding speaker id, label and transcript text.

    Parameters
    ----------
    path_trans : path to a header-less CSV whose second column (index 1)
        holds transcript file paths; the file name up to '.csv' is used as
        the speaker id.
    train, dev, test : paths to header-less CSVs whose first column (index 0)
        is the speaker id and whose second column (index 1) is the label.
        The three files are concatenated in this order.

    Returns
    -------
    pandas.DataFrame with columns 'idx', 'label' and 'sentence', one row per
    row of the concatenated splits. 'sentence' is the first line of the
    speaker's transcript file (an empty string if the file is empty).

    Raises
    ------
    ValueError
        If a speaker has no transcript listed in `path_trans`, or if two
        different transcript paths resolve to the same speaker id. Failing
        here keeps labels and sentences from being silently misaligned.
    '''
    splits = [pd.read_csv(split, header=None) for split in (train, dev, test)]
    data = pd.concat(splits, ignore_index=True)
    patients = data[0].tolist()
    labels = data[1].tolist()

    read_trans = pd.read_csv(path_trans, header=None)

    # Index the transcripts by speaker id once instead of rescanning the
    # whole listing for every speaker.
    transcript_by_id = {}
    for transcript_path in read_trans[1].tolist():
        speaker_id = os.path.basename(transcript_path).split('.csv')[0]
        known_path = transcript_by_id.setdefault(speaker_id, transcript_path)
        if known_path != transcript_path:
            raise ValueError(
                'Speaker {!r} has more than one transcript: {!r} and {!r}'.format(
                    speaker_id, known_path, transcript_path))

    missing = [patient for patient in patients
               if patient not in transcript_by_id]
    if missing:
        raise ValueError(
            'No transcript listed in {!r} for speaker(s): {!r}'.format(
                path_trans, missing))

    sentences = []
    for patient in patients:
        with open(transcript_by_id[patient], 'r') as f:
            # Only the first line of the transcript file is used.
            sentences.append(f.readline())

    return pd.DataFrame(
        {'idx': patients, 'label': labels, 'sentence': sentences})
    
    


df = data_to_csv(path, train, dev, test)


## Pre-Processing 

We perform the following preprocessing steps:

- **Convert all words to lowercase**. This is important for matching certain phrases later on, where the matching is case-sensitive and the word or phrase of interest would not be matched to a capitalized version of the same word or phrase.

- **Tokenize the data**. This step refers to identifying the boundaries of individual words and sentences within each item.

- **Lemmatization**. This process converts each word to its base form (e.g. 'has' to 'have', 'are' to 'be') for the purpose of easier matching.

- **Removal of words that contain non-alphabetic characters** (e.g., numbers, '#', '@', etc.)

- **Stopword removal**. This step refers to the removal of words that may not be important for the analysis (e.g. 'a', 'an', 'the', etc.). One could use a predefined list of stopwords (usually these remove all forms of the verb 'to be', as well as pronouns) or create a customized list of stopwords depending on what is important for the specific application.



def compute_lexical_diversity(transcript):
    '''Return six lexical-richness measures for one transcript.

    The measures are read from a LexicalRichness object built on
    `transcript` and returned as a tuple in this order: terms (used as the
    unique word count), ttr, cttr, mattr with a 13-word window, Summer and
    Dugast.
    '''
    richness = LexicalRichness(transcript)
    # Other measures (words, rttr, msttr, mtld, hdd, Herdan, Maas) are
    # deliberately not extracted here.
    return (
        richness.terms,
        richness.ttr,
        richness.cttr,
        richness.mattr(window_size=13),
        richness.Summer,
        richness.Dugast,
    )
    


def load_files(data):
    '''Attach the lexical-diversity measures to the speaker table.

    `data` must provide the columns 'idx', 'sentence' and 'label'. The
    returned frame holds them as 'speakers', 'sentences' and 'labels',
    followed by one column per value returned by compute_lexical_diversity.
    '''
    metric_names = ["unique_word_count", "type_token_ratio", "corrected_type_token_ratio", "moving_average_type_token_ratio", "summer_lexical_diversity_measure", "dugast_lexical_diversity_measure"]

    speaker_ids = data['idx'].tolist()
    texts = data['sentence'].tolist()
    diagnoses = data['label'].tolist()

    # One row per transcript, one column per measure.
    metric_matrix = np.array([compute_lexical_diversity(text) for text in texts])

    columns = {"speakers": speaker_ids, "labels": diagnoses, "sentences": texts}
    # Transposing gives one array per measure, paired with its name.
    columns.update(zip(metric_names, metric_matrix.T))
    return pd.DataFrame(columns)


df = load_files(df)


#load the Spacy model for extracting data for English: "en_core_web_sm" 
nlp = spacy.load('en_core_web_sm')


# Lower-case the transcript text. load_files() stores the text under
# 'sentences' while every later step reads 'sentence', so take whichever
# column is present and publish the lower-cased text as 'sentence'.
_text_column = 'sentence' if 'sentence' in df.columns else 'sentences'
df['sentence'] = df[_text_column].str.lower()


# Create a function to preprocess the text

#Customized list of stopwords 
stopwords = ['a', 'an', 'the', 'with', 'to', 'be', 'have', 'for', 'has']

def preprocess(text):
    '''Lemmatize `text` and keep only alphabetic, non-stopword lemmas.

    The text is run through the spaCy pipeline with the 'ner' component
    disabled; the surviving lemmas are returned joined by single spaces.
    '''
    parsed = nlp(text, disable=['ner'])
    kept = []
    for token in parsed:
        lemma = token.lemma_
        # Drop anything containing non-alphabetic characters
        if not lemma.isalpha():
            continue
        # Drop entries of the customized stopword list
        if lemma in stopwords:
            continue
        kept.append(lemma)
    return ' '.join(kept)

In [None]:

df['Item'] = df['sentence'].apply(preprocess)


# Shallow Features

## Count Words

In [None]:
def count_words(string):
    '''Return how many whitespace-separated words `string` contains'''
    return len(string.split())

#Application to the raw data to get the full word count

df['Word_Count'] = df['sentence'].apply(count_words)

#Application to the preprocessed data to get the content-word count

df['Word_Count_No_stop_words'] = df['Item'].apply(count_words)
df.head()


### Word Length

In [None]:
def word_length(string):
    '''Return the average word length in characters for the words in an item.

    Words are the whitespace-separated chunks of `string`. Only the
    characters of the words themselves are counted, not the whitespace
    between them. The result is rounded to two decimals; an item without
    any word (empty or whitespace-only) yields 0.0.
    '''
    words = string.split()
    if not words:
        # e.g. an item whose tokens were all removed during preprocessing
        return 0.0
    total_chars = sum(len(word) for word in words)
    return round(total_chars / len(words), 2)

#Application to the preprocessed data

df['Avg_Word_Length'] = df['Item'].apply(word_length)
#df.head()

### Sentence Counter

In [None]:
def sentence_counter(text):
    '''Return how many sentences the spaCy model finds in an item'''
    # doc.sents yields one span per detected sentence
    return len(list(nlp(text).sents))

#Note that this function is applied to the raw text in order to identify sentence boundaries

df['Sentence_Count'] = df['sentence'].apply(sentence_counter)


### Average Sentence Length in Words

In [None]:
def avg_sent_length(text):
    '''Return the average sentence length of an item, in words.

    Sentences are those found by the spaCy model in `text`; words are the
    whitespace-separated chunks of `text`. The result is rounded to two
    decimals. When the model finds no sentence at all, 0.0 is returned
    instead of dividing by zero.
    '''
    sentence_total = sum(1 for _ in nlp(text).sents)
    if sentence_total == 0:
        return 0.0
    word_total = len(text.split())
    return round(word_total / sentence_total, 2)

#Note that this function is applied to the raw text in order to identify sentence boundaries
df['Avg_Sentence_Length_in_Words'] = df['sentence'].apply(avg_sent_length)
#df.head()

# Syntactic Features

https://spacy.io/usage/linguistic-features#pos-tagging

### Noun Count

In [None]:
def nouns(text, model=nlp):
    '''Return how many tokens of an item are tagged 'NOUN' by `model`'''
    # Walk the parsed tokens and count the matching coarse POS tags
    return sum(1 for token in model(text) if token.pos_ == 'NOUN')

df['Noun_Count'] = df['Item'].apply(nouns)
#df.head()

### Verb Count

In [None]:
def verbs(text, model=nlp):
    '''Return how many tokens of an item are tagged 'VERB' by `model`'''
    # Walk the parsed tokens and count the matching coarse POS tags
    return sum(1 for token in model(text) if token.pos_ == 'VERB')

df['Verb_Count'] = df['Item'].apply(verbs)

#df.head()


### Adjective Count 

In [None]:
def adjectives(text, model=nlp):
    '''Return how many tokens of an item are tagged 'ADJ' by `model`'''
    # Walk the parsed tokens and count the matching coarse POS tags
    return sum(1 for token in model(text) if token.pos_ == 'ADJ')

df['Adjective_Count'] = df['Item'].apply(adjectives)

#df.head()

### Adverb Count

In [None]:
def adverbs(text, model=nlp):
    '''Return how many tokens of an item are tagged 'ADV' by `model`'''
    # Walk the parsed tokens and count the matching coarse POS tags
    return sum(1 for token in model(text) if token.pos_ == 'ADV')

df['Adverb_Count'] = df['Item'].apply(adverbs)
#df.head()

### Numeral Count

In [None]:
def numeral(text, model=nlp):
    '''Return how many tokens of an item are tagged 'NUM' (numerals, e.g. billion) by `model`'''
    # Walk the parsed tokens and count the numeral tags
    return sum(1 for token in model(text) if token.pos_ == 'NUM')

df['Numeral Count'] = df['sentence'].apply(numeral) # better to extract it from the original text
#df.head()

### Auxiliary Count

In [None]:
def aux(text, model=nlp):
    '''Return how many tokens of an item are tagged 'AUX' (auxiliaries) by `model`'''
    # Walk the parsed tokens and count the auxiliary tags
    return sum(1 for token in model(text) if token.pos_ == 'AUX')

df['Auxiliary_Count'] = df['sentence'].apply(aux) # better to extract it from the original text
#df.head()

### Number Noun Phrases

In [None]:
def get_nps(text):
    '''Return the number of noun phrases (spaCy noun chunks) in an item'''
    parsed = nlp(text)
    # Each element of noun_chunks is one noun phrase
    return sum(1 for _ in parsed.noun_chunks)

df['Number_of_NPs'] = df['Item'].apply(get_nps)
#df.head()

### Number of Prepositional Phrases

In [None]:
def get_pps(text):
    '''Return the number of prepositional phrases in an item.

    Every token tagged 'ADP' is counted as the head of one prepositional
    phrase; the phrases themselves are not extracted.
    '''
    # To get the actual phrase for a token, join the tokens of its subtree:
    # ' '.join(tok.orth_ for tok in token.subtree)
    return sum(1 for token in nlp(text) if token.pos_ == 'ADP')

# NOTE(review): 'Item' is the preprocessed text, from which the customized
# stopword list has already removed 'with', 'to' and 'for'; those
# prepositions can therefore never be counted here. Confirm this is intended
# or apply get_pps to the raw 'sentence' column instead.
df['Number_of_PPs'] = df['Item'].apply(get_pps)
#df.head()

### Number Verb Phrases

In [None]:
pattern = [{'POS': 'VERB', 'OP': '?'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'AUX', 'OP': '*'},
           {'POS': 'VERB', 'OP': '+'}]


def get_vps(text):
    '''Return the number of verb-phrase matches found in an item.

    A token Matcher is built from the module-level `pattern` and run over
    the parsed text; every match it reports is counted.
    '''
    parsed = nlp(text)
    vp_matcher = Matcher(nlp.vocab)
    vp_matcher.add("Verb phrase", [pattern], on_match=None) #new syntax of the command
    # NOTE(review): every reported match is counted; presumably overlapping
    # matches of the same verb group are each counted too - confirm whether
    # they should be de-duplicated first.
    return len(vp_matcher(parsed))
    
df['Number_of_VPs'] = df['Item'].apply(get_vps)
#df.head()

### Features of Cohesion




#First, we create lists of different types of connectives that we will later match to the text
#Note that some entries belong to more than one list (e.g. 'then' is temporal, causal and
#exemplifying; 'since' is temporal and exemplifying; 'however' is causal and contrastive),
#so the per-category counts are not mutually exclusive.

#Connectives to instruct, recount and sequence
temporal_connectives = ['afterwards', 'once', 'at this moment', 'at this point', 'before', 'finally', 
                        'here', 'in the end', 'lastly', 'later on', 'meanwhile', 'next', 'now', 
                        'on another occasion', 'previously','since', 'soon', 'straightaway', 'then', 
                        'when', 'whenever', 'while']


#Connectives to show cause or conditions
causal_connectives = ['accordingly', 'all the same', 'an effect of', 'an outcome of', 'an upshot of',
                      'as a consequence of', 'as a result of', 'because', 'caused by', 'consequently',
                      'despite this', 'even though', 'hence', 'however', 'in that case', 'moreover',
                      'nevertheless', 'otherwise', 'so', 'so as', 'stemmed from', 'still', 'then',
                      'therefore', 'though', 'under the circumstances', 'yet']


#Connectives for giving examples or showing results
exemplifying_connectives = ['accordingly', 'as a result', 'as exemplified by', 'consequently', 'for example',
                            'for instance', 'for one thing', 'including', 'provided that', 'since', 'so',
                            'such as', 'then', 'therefore', 'these include', 'through', 'unless', 'without']


#Connectives to show similarity or add a point
additive_connectives = ['and', 'additionally', 'also', 'as well', 'even', 'furthermore', 'in addition', 'indeed',
                        'let alone', 'moreover', 'not only']

#Connectives showing a difference or an opposite point of view
contrastive_connectives = ['alternatively', 'anyway', 'but', 'by contrast', 'differs from', 'elsewhere',
                           'even so', 'however', 'in contrast', 'in fact', 'in other respects', 'in spite of this',
                           'in that respect', 'instead', 'nevertheless', 'on the contrary', 'on the other hand',
                           'rather', 'though', 'whereas', 'yet']

### Temporal Connectives Count

In [None]:
def temporal_connectives_count(text, connectives=None):
    '''Count the occurrences of temporal connectives in a text.

    Parameters
    ----------
    text : the text to search; matching is case-sensitive, so lower-case it
        first.
    connectives : optional iterable of connective strings; defaults to the
        module-level `temporal_connectives` list.

    Returns
    -------
    int : number of non-overlapping whole-word (or whole-phrase) matches.
    Connectives are matched literally and only when not embedded in a
    longer word, so 'now' is not counted inside 'know'. When one connective
    is a prefix of another ('when' / 'whenever'), the longer one wins and
    the stretch of text is counted once.
    '''
    if connectives is None:
        connectives = temporal_connectives
    # Longest first so the alternation prefers the most specific connective
    alternatives = sorted((c for c in connectives if c), key=len, reverse=True)
    if not alternatives:
        return 0
    pattern = r'(?<!\w)(?:' + '|'.join(re.escape(c) for c in alternatives) + r')(?!\w)'
    return len(re.findall(pattern, text))

#Note that we apply the function to the raw text (and remember that it is important to lowercase all words)
df['Temporal_Connectives_Count'] = df['sentence'].apply(temporal_connectives_count)



### Causal Connectives Count


def causal_connectives_count(text, connectives=None):
    '''Count the occurrences of causal connectives in a text.

    Parameters
    ----------
    text : the text to search; matching is case-sensitive, so lower-case it
        first.
    connectives : optional iterable of connective strings; defaults to the
        module-level `causal_connectives` list.

    Returns
    -------
    int : number of non-overlapping whole-word (or whole-phrase) matches.
    Connectives are matched literally and only when not embedded in a
    longer word, so 'so' is not counted inside 'also'. When one connective
    is a prefix of another ('so' / 'so as'), the longer one wins and the
    stretch of text is counted once.
    '''
    if connectives is None:
        connectives = causal_connectives
    # Longest first so the alternation prefers the most specific connective
    alternatives = sorted((c for c in connectives if c), key=len, reverse=True)
    if not alternatives:
        return 0
    pattern = r'(?<!\w)(?:' + '|'.join(re.escape(c) for c in alternatives) + r')(?!\w)'
    return len(re.findall(pattern, text))

df['Causal_Connectives_Count'] = df['sentence'].apply(causal_connectives_count)
#df.head()

### Exemplifying Connectives Count

In [None]:
def exemplifying_connectives_count(text, connectives=None):
    '''Count the occurrences of exemplifying connectives in a text.

    Parameters
    ----------
    text : the text to search; matching is case-sensitive, so lower-case it
        first.
    connectives : optional iterable of connective strings; defaults to the
        module-level `exemplifying_connectives` list.

    Returns
    -------
    int : number of non-overlapping whole-word (or whole-phrase) matches.
    Connectives are matched literally and only when not embedded in a
    longer word, so 'so' is not counted inside 'something'.
    '''
    if connectives is None:
        connectives = exemplifying_connectives
    # Longest first so the alternation prefers the most specific connective
    alternatives = sorted((c for c in connectives if c), key=len, reverse=True)
    if not alternatives:
        return 0
    pattern = r'(?<!\w)(?:' + '|'.join(re.escape(c) for c in alternatives) + r')(?!\w)'
    return len(re.findall(pattern, text))

df['Exemplifying_?Connectives_Count'] = df['sentence'].apply(exemplifying_connectives_count)
#df.head()

### Additive Connectives Count

In [None]:
def additive_connectives_count(text, connectives=None):
    '''Count the occurrences of additive connectives in a text.

    Parameters
    ----------
    text : the text to search; matching is case-sensitive, so lower-case it
        first.
    connectives : optional iterable of connective strings; defaults to the
        module-level `additive_connectives` list.

    Returns
    -------
    int : number of non-overlapping whole-word (or whole-phrase) matches.
    Connectives are matched literally and only when not embedded in a
    longer word, so 'and' is not counted inside 'hand' nor 'even' inside
    'seven'.
    '''
    if connectives is None:
        connectives = additive_connectives
    # Longest first so the alternation prefers the most specific connective
    alternatives = sorted((c for c in connectives if c), key=len, reverse=True)
    if not alternatives:
        return 0
    pattern = r'(?<!\w)(?:' + '|'.join(re.escape(c) for c in alternatives) + r')(?!\w)'
    return len(re.findall(pattern, text))

df['Additive_Connectives_Count'] = df['sentence'].apply(additive_connectives_count)
#df.head()

### Contrastive connectives Count

In [None]:
def contrastive_connectives_count(text, connectives=None):
    '''Count the occurrences of contrastive connectives in a text.

    Parameters
    ----------
    text : the text to search; matching is case-sensitive, so lower-case it
        first.
    connectives : optional iterable of connective strings; defaults to the
        module-level `contrastive_connectives` list.

    Returns
    -------
    int : number of non-overlapping whole-word (or whole-phrase) matches.
    Every occurrence is counted, as in the other connective counters, and
    connectives are matched literally and only when not embedded in a
    longer word, so 'but' is not counted inside 'button'.
    '''
    if connectives is None:
        connectives = contrastive_connectives
    # Longest first so the alternation prefers the most specific connective
    alternatives = sorted((c for c in connectives if c), key=len, reverse=True)
    if not alternatives:
        return 0
    pattern = r'(?<!\w)(?:' + '|'.join(re.escape(c) for c in alternatives) + r')(?!\w)'
    return len(re.findall(pattern, text))

df['Contrastive_Connectives_Count'] = df['sentence'].apply(contrastive_connectives_count)
#df.head()


In [None]:
# Markers treated as filled pauses in the transcripts
filled_pause = ["uhm"]


def filled_pauses(text):
    '''Return how many times the filled-pause markers occur in a text'''
    # Each marker is searched as a pattern; its non-overlapping hits are summed
    return sum(len(re.findall(marker, text)) for marker in filled_pause)

df['Filled_Pauses'] = df['sentence'].apply(filled_pauses)
#df[:40]

In [None]:
def certanty(text):
    '''Score how uncertain a speaker sounds while describing the image.

    Uncertainty is operationalized through a fixed set of cues: a question
    mark, 'why', the modals 'might', 'can', 'may' and 'should', the words
    'sure', 'uhm' and 'ah', and the phrase 'looks like'. Each cue that is
    present adds one point, however often it occurs, so the score ranges
    from 0 to 10.

    Word cues are matched case-sensitively (lower-case the text first) and
    only as whole words, so 'ah' is not found inside 'yeah', nor 'can'
    inside 'candy', nor 'should' inside 'shoulder'.
    '''
    import re  # local import keeps this block self-contained

    word_cues = ("why", "might", "can", "may", "sure", "uhm", "ah",
                 "should", "looks like")

    # The question mark is punctuation, so plain containment is enough
    score = 1 if "?" in text else 0
    for cue in word_cues:
        if re.search(r'(?<!\w)' + re.escape(cue) + r'(?!\w)', text):
            score += 1
    return score

df['certanty'] = df['sentence'].apply(certanty)

#%
