In [1]:
import pandas as pd
import numpy as np
import re
import os
import json
import matplotlib.pyplot as plt
import csv

import seaborn as sns
from scipy.stats import shapiro, levene, iqr
from scipy.stats import ttest_ind, mannwhitneyu, chisquare
from scipy.stats import ttest_rel, wilcoxon, chi2_contingency

# 1. Get transcriptions

## 1.1 Manual

In [2]:
# Reuse the previous processing output to build the metadata table
# (columns: user, label, partition).
df_all = pd.read_excel('0-publication/transcripts_ADReSS-IS2020_selected_v2.xlsx')
meta_columns = ['user', 'label', 'partition']
df_meta = df_all[meta_columns].copy()
# NOTE: the export below is disabled, but a later cell reads
# '0-publication/metainfo.xlsx'; enable it once if that file does not exist yet.
# df_meta.to_excel('0-publication/metainfo.xlsx',index=False)

df_meta.head(2)

Unnamed: 0,user,label,partition
0,S001,0,train
1,S002,0,train


In [3]:
# Read the manual transcript files processed by Simone's code.
# Each file is tab-separated and is loaded into two columns: user, manual.
path = '0-publication/data_manual/original_punc/'
manual_parts = []
for file_name in ['trainCC_manual-v2_punc.txt',
                  'trainCD_manual-v2_punc.txt',
                  'test_manual-v2_punc.txt']:
    with open(path + file_name) as f:
        rows = list(csv.reader(f, delimiter="\t"))
    manual_parts.append(pd.DataFrame(rows, columns=['user', 'manual']))
df_train_cc, df_train_cd, df_test = manual_parts

df = pd.concat(manual_parts)
df.reset_index(inplace=True, drop=True)

# Raw-string patterns: the previous non-raw '\s{2,}' is an invalid escape
# sequence (SyntaxWarning on Python 3.12+).
punct_re = re.compile(r'([.,!?()])')
spaces_re = re.compile(r'\s{2,}')

# Add a space on both sides of every punctuation mark, then collapse runs of
# whitespace into a single space.
df['manual_Punc'] = [spaces_re.sub(' ', punct_re.sub(r' \1 ', text))
                     for text in df['manual']]

# Create a version without punctuation (same whitespace collapsing).
df['manual_noPunc'] = [spaces_re.sub(' ', punct_re.sub('', text))
                       for text in df['manual_Punc']]
df.drop(columns=['manual'], inplace=True)

# ----------------------------------------------------------
# PUT EVERYTHING IN LOWERCASE
# ----------------------------------------------------------
df['manual_Punc'] = df['manual_Punc'].str.lower()
df['manual_noPunc'] = df['manual_noPunc'].str.lower()

# Add meta information (inner join on 'user') and save file.
# NOTE: this re-reads the metadata from disk, replacing any df_meta already in memory.
df_meta = pd.read_excel('0-publication/metainfo.xlsx')
df_final = pd.merge(df_meta, df, on='user')
# df_final.to_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx', index=False)

In [4]:
# Preview the first two rows of the metadata + manual-transcript merge.
df_final.head(2)

Unnamed: 0,user,label,partition,manual_Punc,manual_noPunc
0,S001,0,train,tell me everything that you see going on in th...,tell me everything that you see going on in th...
1,S002,0,train,just look at the picture and tell me everythin...,just look at the picture and tell me everythin...


## 1.2 ASR

In [5]:
df = pd.read_excel('data/ADReSS-IS2020/transcripts_ADReSS-IS2020_unk_whisper.xlsx')

# Keep the speaker id plus one transcript column per ASR system.
df_select2 = df.loc[:, ['user',
                        'ASR_wav2vec2xlsr_full_par_inv',
                        'ASR_whisperLargeX_full_par_inv']].copy()
# ====================
# wav2vec2xlsr
# ====================
# Only the column name changes; the text itself is used as-is.
df_select2.rename(columns={"ASR_wav2vec2xlsr_full_par_inv": "ASR_wav2vec2_noPunc"}, inplace=True)

# ====================
# whisper
# ====================
# Raw-string patterns: the previous non-raw '\s{2,}' is an invalid escape
# sequence (SyntaxWarning on Python 3.12+).
punct_re = re.compile(r'([.,!?()])')
spaces_re = re.compile(r'\s{2,}')

# Add a space on both sides of every punctuation mark, then collapse runs of
# whitespace. Iterating over the column (instead of .loc[i] with i in range)
# no longer depends on the frame having a default 0..n-1 index.
df_select2['ASR_whisper_Punc'] = [
    spaces_re.sub(' ', punct_re.sub(r' \1 ', text))
    for text in df_select2['ASR_whisperLargeX_full_par_inv']
]

# Create a version without punctuation (same whitespace collapsing).
df_select2['ASR_whisper_noPunc'] = [
    spaces_re.sub(' ', punct_re.sub('', text))
    for text in df_select2['ASR_whisper_Punc']
]
df_select2.drop(columns=['ASR_whisperLargeX_full_par_inv'], inplace=True)

# ----------------------------------------------------------
# PUT EVERYTHING IN LOWERCASE
# ----------------------------------------------------------
for column in ['ASR_wav2vec2_noPunc', 'ASR_whisper_Punc', 'ASR_whisper_noPunc']:
    df_select2[column] = df_select2[column].str.lower()

In [6]:
# Preview the first two rows of the cleaned ASR transcripts.
df_select2.head(2)

Unnamed: 0,user,ASR_wav2vec2_noPunc,ASR_whisper_Punc,ASR_whisper_noPunc
0,S001,tell me everything that you see going on in th...,tell me everything that you see going on in t...,tell me everything that you see going on in t...
1,S002,yes just look at the picture i wild ever telea...,"just look at the picture . drawing , paperwor...",just look at the picture drawing paperwork th...


In [7]:
# Join the manual transcriptions with the automatic (ASR) ones on 'user'
# (inner join: only users present in both tables are kept).
df_all = pd.read_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx')

df_final = df_all.merge(df_select2, on='user')
# NOTE: the export is disabled; when enabled it overwrites the file read above.
# df_final.to_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx', index=False)

In [8]:
# Preview the first two rows after adding the ASR transcript columns.
df_final.head(2)

Unnamed: 0,user,label,partition,manual_Punc,manual_noPunc,ASR_wav2vec2_noPunc,ASR_whisper_Punc,ASR_whisper_noPunc
0,S001,0,train,tell me everything that you see going on in th...,tell me everything that you see going on in th...,tell me everything that you see going on in th...,tell me everything that you see going on in t...,tell me everything that you see going on in t...
1,S002,0,train,just look at the picture and tell me everythin...,just look at the picture and tell me everythin...,yes just look at the picture i wild ever telea...,"just look at the picture . drawing , paperwor...",just look at the picture drawing paperwork th...


# 2. Encode pauses

* < 50 ms (0.05 s) => excluded
* < 0.5 s          => ,
* 0.5 s - 2 s      => .
* > 2 s            => ...

## 2.1 Manual

In [9]:
# Load the manual transcripts in which pauses are already encoded
# (files processed by Simone's code); each file is tab-separated and is
# loaded into two columns: user, manual_pauses.
path = '0-publication/data_manual/encodedPauses/'

pause_parts = []
for file_name in ('trainCC_manual-v2_pause.txt',
                  'trainCD_manual-v2_pause.txt',
                  'test_manual-v2_pause.txt'):
    with open(path + file_name) as f:
        d = list(csv.reader(f, delimiter="\t"))
    pause_parts.append(pd.DataFrame(d, columns=['user', 'manual_pauses']))
df_train_cc, df_train_cd, df_test = pause_parts

# Stack the three partitions under a fresh 0..n-1 index.
df = pd.concat(pause_parts).reset_index(drop=True)

# ----------------------------------------------------------
# PUT EVERYTHING IN LOWERCASE
# ----------------------------------------------------------
df['manual_pauses'] = df['manual_pauses'].str.lower()

# Attach the pause-encoded text to the transcript table (inner join on 'user').
df_all = pd.read_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx')
df_final = df_all.merge(df, on='user')
# df_final.to_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx', index=False)

In [10]:
# Preview the first two rows after adding the 'manual_pauses' column.
df_final.head(2)

Unnamed: 0,user,label,partition,manual_Punc,manual_noPunc,ASR_wav2vec2_noPunc,ASR_whisper_Punc,ASR_whisper_noPunc,manual_pauses
0,S001,0,train,tell me everything that you see going on in th...,tell me everything that you see going on in th...,tell me everything that you see going on in th...,tell me everything that you see going on in t...,tell me everything that you see going on in t...,"tell me , everything , that you see going on i..."
1,S002,0,train,just look at the picture and tell me everythin...,just look at the picture and tell me everythin...,yes just look at the picture i wild ever telea...,"just look at the picture . drawing , paperwor...",just look at the picture drawing paperwork th...,". just , look , at the , picture . and . tell ..."


## 2.2 ASR

In [11]:
# ================================================
# Calculate pauses
# ================================================
# Pause thresholds in the same unit as the 'start'/'end' word timestamps
# (seconds, per the legend of this section).
PAUSE_MIN = 0.05         # gaps up to 50 ms are not encoded
PAUSE_SHORT_MAX = 0.5    # gap <  0.5        => ","
PAUSE_MEDIUM_MAX = 2     # 0.5 <= gap <= 2   => "." ; gap > 2 => "..."


def encode_pauses(df_timestamps):
    """Return one transcript with the gaps between words encoded as punctuation.

    Parameters
    ----------
    df_timestamps : pandas.DataFrame
        Word-level alignment with columns 'text', 'start' and 'end' and a
        default 0..n-1 index (one row per word).

    Returns
    -------
    str
        The words, each preceded by a single space, with ",", "." or "..."
        inserted after a word when the gap to the next word exceeds
        PAUSE_MIN. Trailing '.', ',', '?' and '!' are removed from every word
        so that the only punctuation left is the encoded pauses.
    """
    n_words = df_timestamps.shape[0]
    tokens = []
    for i in range(n_words):
        # rstrip removes *all* trailing marks; the former one-mark-at-a-time
        # checks left e.g. "word.." for "word...", which clashes with the
        # "..." long-pause code.
        word = df_timestamps.loc[i, 'text'].rstrip('.,?!')
        if word:
            tokens.append(word)
        if i == n_words - 1:
            # The last word has no following gap, but it must still be kept
            # (it used to be dropped because the loop stopped one row early).
            break
        dur = df_timestamps.loc[i + 1, 'start'] - df_timestamps.loc[i, 'end']
        if dur > PAUSE_MIN:
            if dur < PAUSE_SHORT_MAX:
                tokens.append(",")
            elif dur <= PAUSE_MEDIUM_MAX:
                tokens.append(".")
            else:
                tokens.append("...")
    return ''.join(' ' + token for token in tokens)


# Training partitions (cc, cd) followed by the test partition.
alignment_dirs = [
    'data/ADReSS-IS2020/train/transcription_ASR_full_whisper_large_X/cc/',
    'data/ADReSS-IS2020/train/transcription_ASR_full_whisper_large_X/cd/',
    'data/ADReSS-IS2020/test/transcription_ASR_full_whisper_large_X/',
]

list_row = []
for path in alignment_dirs:
    # sorted(): os.listdir returns entries in arbitrary order, and the row
    # order of the right merge below follows the order of this list.
    for file in sorted(os.listdir(path)):
        if not file.endswith('_alignements.json'):
            continue
        user = file.split('_')[0]
        # Load data
        with open(path + file, 'r') as f:
            data = json.load(f)
        df_timestamps = pd.DataFrame.from_dict(data)
        list_row.append({'user': user,
                         'ASR_whisper_pauses': encode_pauses(df_timestamps)})

df = pd.DataFrame(list_row, columns=['user', 'ASR_whisper_pauses'])

# ----------------------------------------------------------
# PUT EVERYTHING IN LOWERCASE
# ----------------------------------------------------------
df['ASR_whisper_pauses'] = df['ASR_whisper_pauses'].str.lower()


df_all = pd.read_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx')
# Right join: every user with an alignment file is kept, even when it has no
# row in the transcript table (its other columns are then NaN).
df_final = pd.merge(df_all, df, on='user',how = 'right')
# df_final.to_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx',index=False)

In [12]:
# Preview the first two rows after adding the 'ASR_whisper_pauses' column.
df_final.head(2)

Unnamed: 0,user,label,partition,manual_Punc,manual_noPunc,ASR_wav2vec2_noPunc,ASR_whisper_Punc,ASR_whisper_noPunc,manual_pauses,ASR_whisper_pauses
0,S001,0,train,tell me everything that you see going on in th...,tell me everything that you see going on in th...,tell me everything that you see going on in th...,tell me everything that you see going on in t...,tell me everything that you see going on in t...,"tell me , everything , that you see going on i...","tell me everything that you see going , on in..."
1,S002,0,train,just look at the picture and tell me everythin...,just look at the picture and tell me everythin...,yes just look at the picture i wild ever telea...,"just look at the picture . drawing , paperwor...",just look at the picture drawing paperwork th...,". just , look , at the , picture . and . tell ...","just , look at the picture ... drawing paperw..."


# 3. Calculate features: LIWC, speech rate, pauses

In [46]:
def calculate_ratio(selected, df_transcripts, df_liwc, df_egemaps):
    """Add a characters-per-duration ratio to a copy of the LIWC feature table.

    FOR BOTH (manual and automatic transcriptions).

    Parameters
    ----------
    selected : str
        Name of the `df_transcripts` column that holds the transcript text.
    df_transcripts : pandas.DataFrame
        Must contain a 'user' column and the `selected` column.
    df_liwc : pandas.DataFrame
        Feature table with a 'user' column. It is copied, not modified.
    df_egemaps : pandas.DataFrame
        Must contain 'subject' and 'duration' columns.

    Returns
    -------
    pandas.DataFrame
        Copy of `df_liwc` with an extra float column 'caract/duration' equal to
        len(transcript) / duration for every subject of `df_egemaps`. Rows
        whose user is not a subject of `df_egemaps` keep 0.0.

    Raises
    ------
    IndexError
        If a subject of `df_egemaps` has no row in `df_transcripts`.
    """
    df_features = df_liwc.copy()
    # Start from a float column: writing float ratios into an integer column
    # relies on silent upcasting, which pandas deprecated (2.1) and bans (PDEP-6).
    df_features['caract/duration'] = 0.0

    for user in df_egemaps.subject.values:
        # When a user appears several times, the first matching row is used.
        duration = df_egemaps.loc[df_egemaps.subject == user, 'duration'].values[0]
        text = df_transcripts.loc[df_transcripts.user == user, selected].values[0]
        df_features.loc[df_features.user == user, 'caract/duration'] = len(text) / duration
    return df_features

def process_silences(df_silences):
    """Split the 'info' column into 'start', 'end' and 'dur' columns, in place.

    FOR MANUAL TRANSCRIPTIONS.

    'info' is split on single spaces into at most three parts, which are
    inserted (as strings) at column positions 2, 3 and 4; 'info' is then
    removed. The frame is modified in place and nothing is returned.
    """
    parts = df_silences["info"].str.split(" ", n=2, expand=True)
    for offset, column in enumerate(('start', 'end', 'dur')):
        df_silences.insert(2 + offset, column, parts[offset])
    del df_silences['info']
    
def calculate_pauses_ASR(user, df_timestamps, df_turns, row_list, min_pause=0.05):
    """Append the gaps between consecutive ASR words of one recording to `row_list`.

    FOR AUTOMATIC TRANSCRIPTIONS.

    Parameters
    ----------
    user : str
        Recording/speaker id, matched against `df_turns.fn`.
    df_timestamps : pandas.DataFrame
        Word-level alignment with 'start' and 'end' columns and a default
        0..n-1 index. An 'INV' column is added to it IN PLACE: '' for words
        kept, otherwise the 'time_stamp' of the investigator turn the word
        starts in.
    df_turns : pandas.DataFrame
        Turn table with columns 'fn', 'spk', 'ts', 'te' and 'time_stamp'.
        Assumes 'ts'/'te' use the same time unit as 'start'/'end' - TODO confirm.
    row_list : list
        Output list; one dict {'user', 'start', 'end', 'dur'} is appended per
        non-investigator word (except the last word of the recording).
    min_pause : float, optional
        Minimum gap counted as a pause (default 0.05, i.e. 50 ms when the
        timestamps are in seconds). Smaller gaps get dur = NaN.

    Returns
    -------
    None
    """
    # Select the words that correspond to the investigator and mark them.
    df_timestamps['INV'] = ''
    n_words = df_timestamps.shape[0]
    if n_words == 0:
        return

    inv_turns = df_turns.loc[(df_turns.fn == user) & (df_turns.spk == 'INV')]
    for turn in inv_turns.itertuples(index=False):
        # A word belongs to the turn when it starts strictly inside (ts, te).
        word_starts = df_timestamps['start']
        inside_turn = (word_starts > turn.ts) & (word_starts < turn.te)
        df_timestamps.loc[inside_turn, 'INV'] = turn.time_stamp

    # Calculate pauses: gap between the end of word i and the start of word i+1.
    for i in range(n_words - 1):
        # Skip the rows that correspond to the investigator.
        if df_timestamps.loc[i, 'INV'] != '':
            continue
        gap_start = df_timestamps.loc[i, 'end']
        gap_end = df_timestamps.loc[i + 1, 'start']
        gap = gap_end - gap_start
        row_list.append({'user': user,
                         'start': gap_start,
                         'end': gap_end,
                         'dur': gap if gap > min_pause else np.nan})
            
def calculate_features(df_analyze):
    """Summarise the pauses of every user (FOR BOTH transcription types).

    Parameters
    ----------
    df_analyze : pandas.DataFrame
        One row per candidate pause with columns 'user' and 'dur'; rows whose
        'dur' is NaN are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per user (in np.unique order) with columns 'user', 'n_pauses',
        'sum_pause_durations' and 'mean_pause_durations'.
    """
    feature_columns = ['user', 'n_pauses', 'sum_pause_durations', 'mean_pause_durations']
    records = []
    for speaker in np.unique(df_analyze.user.values):
        durations = df_analyze.loc[df_analyze.user == speaker, 'dur']
        valid = durations[durations.notna()].values
        records.append({
            'user': speaker,
            'n_pauses': valid.shape[0],
            'sum_pause_durations': np.sum(valid),
            'mean_pause_durations': np.mean(valid),
        })
    return pd.DataFrame(records, columns=feature_columns)

## 3.1 Manual

In [56]:
df_transcripts = pd.read_excel('0-publication/transcripts_ADReSS-IS2020_selected_v3.xlsx')
df_egemaps = pd.read_excel('features/features_audioFile_functionals.xlsx')

# ================================================
# Calculate pauses
# ================================================
# The pause features depend only on the silence annotations, not on the
# transcript variant, so they are computed once instead of once per variant.
silence_dir = '0-publication/data_manual/silences/'
silence_parts = []
for file_name in ['trainCC_sil-durs.txt',
                  'trainCD_pause-durs.txt',
                  'test_sil-durs.txt']:
    with open(silence_dir + file_name) as f:
        d = list(csv.reader(f, delimiter="\t"))
    df_part = pd.DataFrame(d, columns=['user', 'interval_no', 'info'])
    process_silences(df_part)  # in place: 'info' -> 'start', 'end', 'dur'
    silence_parts.append(df_part)
df_silences_cc, df_silences_cd, df_silences_test = silence_parts

df_silences = pd.concat(silence_parts)
df_silences.reset_index(inplace=True, drop=True)
df_silences["dur"] = pd.to_numeric(df_silences["dur"])

# Keep only intervals with a strictly positive duration.
df_analyze = df_silences.loc[df_silences.dur > 0].copy()
df_analyze.reset_index(inplace=True, drop=True)
df_features_pauses = calculate_features(df_analyze)

for selected in ['manual_Punc', 'manual_noPunc']:
    df_liwc = pd.read_excel('0-publication/features/' + selected + '_LIWC2015.xlsx')

    # ================================================
    # Calculate ratio characters / duration
    # ================================================
    df_features_liwc_ratio = calculate_ratio(selected, df_transcripts, df_liwc, df_egemaps)

    # ================================================
    # Merge everything
    # ================================================
    # Right join: one output row per user of the pause table; LIWC columns are
    # NaN for users that are missing from the LIWC file.
    df_features = pd.merge(df_features_liwc_ratio, df_features_pauses, on="user", how="right").copy()
    df_features.to_excel('0-publication/features/' + selected + '_LIWC2015_pause.xlsx', index=False)

## 3.2 ASR

In [28]:
df_transcripts = pd.read_excel('data/ADReSS-IS2020/transcripts_ADReSS-IS2020_unk_whisper.xlsx')
df_egemaps = pd.read_excel('features/features_audioFile_functionals.xlsx')

name_feature = 'ASR_whisper_Punc_LIWC2015'
df_liwc = pd.read_excel('0-publication/features/'+name_feature+'.xlsx')
# NOTE(review): the character count uses the raw Whisper column of
# df_transcripts, while the LIWC features come from 'ASR_whisper_Punc' -
# confirm this is intended.
selected = 'ASR_whisperLargeX_full_par_inv'

# ================================================
# Calculate ratio characters / duration
# ================================================
df_features_liwc_ratio = calculate_ratio(selected, df_transcripts, df_liwc, df_egemaps)

# ================================================
# Calculate pauses
# ================================================
# Speaker-turn tables (train cc, train cd, test) stacked under a fresh index.
df1, df2, df3 = (
    pd.read_csv(turn_file)
    for turn_file in ('data/ADReSS-IS2020/train/transcription_manual/cc/all_clean_train_cc.csv',
                      'data/ADReSS-IS2020/train/transcription_manual/cd/all_clean_train_cd.csv',
                      'data/ADReSS-IS2020/test/transcription_manual/all_clean_test_lower.csv')
)
df_turns = pd.concat([df1, df2, df3]).reset_index(drop=True)

# Word-alignment folders: training partitions (cc, cd) first, then test.
alignment_dirs = [
    'data/ADReSS-IS2020/train/transcription_ASR_full_whisper_large_X/' + group + '/'
    for group in ['cc', 'cd']
]
alignment_dirs.append('data/ADReSS-IS2020/test/transcription_ASR_full_whisper_large_X/')

row_list = []
for path in alignment_dirs:
    for file in os.listdir(path):
        if not file.endswith('_alignements.json'):
            continue
        user = file.split('_')[0]
        # Load the word timestamps of this recording.
        with open(path + file, 'r') as f:
            data = json.load(f)
        df_timestamps = pd.DataFrame.from_dict(data)
        # Appends one record per non-investigator word gap to row_list.
        calculate_pauses_ASR(user, df_timestamps, df_turns, row_list)
df_analyze = pd.DataFrame.from_records(row_list)
df_features_pauses = calculate_features(df_analyze)

# ================================================
# Merge everything
# ================================================
# Right join: one output row per user of the pause table.
df_features = pd.merge(df_features_liwc_ratio, df_features_pauses, on="user", how="right").copy()

# df_features.to_excel('0-publication/features/'+name_feature+'_pause.xlsx',index=False)

In [29]:
# Preview the first two rows of the merged LIWC + ratio + pause features.
df_features.head(2)

Unnamed: 0,user,label,partition,ASR_whisper_Punc,WC,Analytic,Clout,Authentic,Tone,WPS,...,Exclam,Dash,Quote,Apostro,Parenth,OtherP,caract/duration,n_pauses,sum_pause_durations,mean_pause_durations
0,S001,0,train,tell me everything that you see going on in t...,196,87.12,82.09,19.39,18.27,15.08,...,0.0,0.0,0,3.06,0,0.0,11.624726,97,41.450053,0.42732
1,S002,0,train,"just look at the picture . drawing , paperwor...",19,35.06,98.19,4.05,97.58,2.38,...,0.0,0.0,0,5.26,0,0.0,1.961639,7,20.366011,2.90943
