In [21]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import os
import re
import librosa
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F
import pandas as pd

# Load experiment data with stimuli: one DataFrame per workbook sheet.
xls = pd.ExcelFile('data/experiment_data.xlsx')
experiment_data = {}
# Key every sheet by its lower-cased name with a trailing "-ru" removed.
# str.rstrip('-ru') is not suitable for that: its argument is a *set of
# characters*, so it would also eat a trailing '-', 'r' or 'u' that belongs
# to the language code itself (e.g. "hr-ru" would become "h").
for sheet_name in xls.sheet_names:
    df = pd.read_excel(xls, sheet_name=sheet_name)
    sheet_name = sheet_name.lower()
    if sheet_name.endswith('-ru'):
        sheet_name = sheet_name[:-len('-ru')]
    experiment_data[sheet_name] = df

In [1]:
# The processor and the seq2seq model are both loaded from the same checkpoint id.
checkpoint = "mitchelldehaven/whisper-medium-ru"
processor = AutoProcessor.from_pretrained(checkpoint)
model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint)

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

In [19]:

# NOTE: `audio_data` is built by the loop that follows load_audio_files further
# down in this notebook, so that cell has to be executed before this one.
# Convert one clip into model input features; element [-2] of the stored
# tuple is the waveform array.
inputs = processor(
    audio_data['BE']['1'][-2],
    sampling_rate=16000,
    return_tensors="pt",
).input_features

# Placeholder decoder input: the decoder start token, repeated twice
decoder_input_ids = model.config.decoder_start_token_id * torch.tensor([[1, 1]])

# Forward pass with gradient tracking switched off
with torch.no_grad():
    logits = model(inputs, decoder_input_ids=decoder_input_ids).logits

# Show the shape of the logits tensor
print(list(logits.shape))

[1, 2, 51865]


In [3]:
def load_audio_files(language):
    """
    Load the audio clip belonging to every stimulus row of one language.

    Rows are taken from ``experiment_data[language.lower()]``. The clip for
    the row with DataFrame index ``i`` is the first ``.wav`` file (in sorted
    name order) inside ``data/audio_data/full_audio`` whose name contains
    ``"{language}_{i + 1}.wav"``.

    Parameters
    ----------
    language : str
        Language code exactly as it appears in the file names (e.g. "BE").

    Returns
    -------
    dict or None
        Maps the row number ``str(i + 1)`` to
        ``(phrase_ru, phrase_l2, sentence_l2, waveform, duration_seconds)``.
        Rows without a matching file are skipped and a warning is printed.
        ``None`` is returned, after printing an error, when the audio
        directory does not exist.
    """
    audio_directory = "data/audio_data/full_audio"

    if not os.path.exists(audio_directory):
        print(f"Error: Directory '{audio_directory}' not found.")
        return None

    # os.listdir gives no ordering guarantee; sorting makes "first match" reproducible.
    audio_files = sorted(f for f in os.listdir(audio_directory) if f.endswith(".wav"))

    all_data = {}

    for index, row in experiment_data[language.lower()].iterrows():
        row_number_str = str(index + 1)
        sentence_l2 = row['sentence l2']
        phrase_ru = row['phrase ru']
        phrase_l2 = row['phrase l2']

        # Escape the interpolated parts so they are matched literally rather
        # than being interpreted as regex syntax.
        file_pattern = re.compile(rf'{re.escape(language)}_{re.escape(row_number_str)}\.wav')
        audio_file_path = next(
            (os.path.join(audio_directory, f) for f in audio_files if file_pattern.search(f)),
            None,
        )

        if audio_file_path is None:
            print(f"Warning: Audio file not found for row {row_number_str}, {language}")
            continue

        # Load with librosa at a 16 kHz sampling rate. The local is called
        # `waveform` so it does not shadow the notebook-level `audio_data` dict.
        waveform, sr = librosa.load(audio_file_path, sr=16000)
        # Duration in seconds = number of samples / sampling rate
        duration = len(waveform) / sr
        all_data[row_number_str] = (phrase_ru, phrase_l2, sentence_l2, waveform, duration)

    return all_data

# Language codes of the study; the clips of each one are loaded below
languages = ["BE", "BG", "CS", "UK", "PL"]

# audio_data[language] holds whatever load_audio_files returns for that code
audio_data = {}
for language in languages:
    audio_data[language] = load_audio_files(language)

# Quick look at entry '1' of 'BE'
audio_data['BE']['1']

('не раз',
 'не раз',
 'не раз відаць яны пра гэта гаварылі…',
 array([-0.00072414, -0.00259487, -0.00278208, ...,  0.000322  ,
         0.00276465,  0.        ], dtype=float32),
 3.2298125)

In [4]:
# audio_duration[language][sentence] = (start, end), parsed from the
# 'audio_file_times' column. A side that is not given is stored as None.
audio_duration = dict()
for language in languages:
    audio_duration[language] = dict()
    # Iterate through each row in the DataFrame
    for index, row in experiment_data[language.lower()].iterrows():
        sentence_l2 = row['sentence l2']
        # Read the phrase up front: the fallback branches below print it.
        # It used to be assigned only inside the 3-field branch, so the final
        # `else` printed a stale phrase from an earlier row or raised NameError.
        phrase_l2 = row['phrase l2']
        audio_file_times = row['audio_file_times'].split(',')

        if len(audio_file_times) == 4:
            # Four comma-separated fields: the first is the start, and the
            # second is added to it to obtain the end.
            start = float(audio_file_times[0])
            end = start + float(audio_file_times[1])
        elif len(audio_file_times) == 3:
            # Three fields: only the first one is used; whether it is stored
            # as start or end depends on where the phrase sits in the sentence.
            # NOTE(review): str.strip takes a set of characters and trims both
            # ends, so strip('i ') removes any run of 'i'/space characters, not
            # just a leading "i " -- confirm this is the intended normalisation.

            # Check if phrase_l2 is at the beginning of sentence_l2
            if sentence_l2.strip('i ').strip('— ').startswith(phrase_l2):
                start = None
                end = float(audio_file_times[0])
            # Otherwise check if it is at the end (ignoring trailing '?' / '.')
            elif sentence_l2.strip('?').strip('.').endswith(phrase_l2):
                start = float(audio_file_times[0])
                end = None
            else:
                # Phrase is neither at the start nor at the end: report the row
                print(phrase_l2, sentence_l2, audio_file_times)
                start = None
                end = None
        else:
            # Unexpected number of fields: report the row and store no bounds
            print(phrase_l2, sentence_l2, audio_file_times)
            start = None
            end = None

        # Store the result in the dictionary
        audio_duration[language][sentence_l2] = (start, end)
audio_duration

{'BE': {'не раз відаць яны пра гэта гаварылі…': (None, 1.09),
  'і колькі б я не зарабляў, хоць мільён, хоць сто мільёнаў, табе ўсе роўна будзе мала.': (4.589,
   5.1000000000000005),
  'старшыня беларускага фонду культуры уладзімір гілеп прэзентаваў у ялце новае арыгінальнае выданне твораў максіма багдановіча, толькі што выпушчанае ў свет фондам культуры.': (8.357,
   8.963),
  '— у свярдлоўску сыйшла, — сказаў я першае, што прыйшло ў галаву, і тут жа падумаў: “а, можа, і праўда: у свярдлоўску сыйшла?..': (4.966,
   5.2620000000000005),
  'да гэтай пары яны мне любоўна кажуць: “якое гладкае ў цябе і мускулістае цела…”': (None,
   1.682),
  'ступак нешта мармытаў у адказ, што мала разумее сам.': (1.584, 2.167),
  'капітан між тым варухнуў упартым падбародкам, паказваючы кудысьці ўбок.': (4.348,
   None),
  'такім чынам, шматлікія версіі сышліся ў непрыемнай знаходцы шэрагу містыфікацый, якія больш за паўстагоддзя нацягвалі на сябе маску арыгінальных тэкстаў.': (7.682,
   8.14),
  'зрэш

In [5]:


def get_audio_duration_internal(file_path):
    """Return the duration, in seconds, of the audio file at ``file_path``.

    The file is opened with ``AudioSegment.from_file`` and the ``len()`` of
    the result is treated as a number of milliseconds.
    """
    milliseconds = len(AudioSegment.from_file(file_path))
    return milliseconds / 1000

def get_audio_duration(basedir, countryCode, index):
    """
    Return the durations of the audio parts that belong to one stimulus.

    The parts are the files ``{countryCode}_{index}_0``, ``{countryCode}_{index}_1``
    and, only if it exists on disk, ``{countryCode}_{index}_2`` (each followed
    by ``EXTENSION``) inside ``basedir``.

    Parameters
    ----------
    basedir : str
        Directory containing the part files, with or without a trailing
        path separator.
    countryCode : str
        Prefix of the file names.
    index : int or str
        Stimulus number used in the file names.

    Returns
    -------
    tuple
        ``(duration_0, duration_1)``, or
        ``(duration_0, duration_1, duration_2)`` when the third part exists.
        Each value is what ``get_audio_duration_internal`` returns for the file.
    """
    def part_path(part):
        # os.path.join inserts a separator only when basedir lacks one, so both
        # "dir" and "dir/" resolve to the same file. Plain string concatenation
        # silently produced "dirBE_1_0..." for the former.
        return os.path.join(basedir, f"{countryCode}_{index}_{part}" + EXTENSION)

    # The third part is optional; its presence decides how many durations are returned.
    parts = (0, 1, 2) if os.path.exists(part_path(2)) else (0, 1)
    return tuple(get_audio_duration_internal(part_path(part)) for part in parts)

def combine_audio_files(input_file_paths, output_file_path):
    """
    Concatenate audio files in the given order and write the result to disk.

    NOTE(review): ``AudioSegment`` is not imported in the visible cells --
    presumably it is pydub's ``AudioSegment``; confirm and import it.

    Parameters
    ----------
    input_file_paths : iterable of str
        Paths of the audio files to join, in playback order.
    output_file_path : str
        Destination file. Its extension selects the export format; "mp3" is
        used when the path has no extension.
    """
    # Start from an empty segment and append every input file in turn
    combined_audio = AudioSegment.empty()
    for file_path in input_file_paths:
        combined_audio += AudioSegment.from_file(file_path)

    # The combined audio used to be built and then discarded, and
    # output_file_path was never used. Write the result to the requested path.
    output_format = os.path.splitext(output_file_path)[1].lstrip('.').lower() or 'mp3'
    combined_audio.export(output_file_path, format=output_format)
# Audio directory; the same path that load_audio_files lists .wav files from.
base_dir = 'data/audio_data/full_audio'

In [25]:
def get_conditional_surprisal(audio_data, audio_context, language):
    """
    Score every clip of one language with the notebook-level ``processor`` and ``model``.

    Each clip is passed through ``processor`` in full and fed to ``model``
    together with a two-token decoder input made of the decoder start token.
    The value reported per clip is the mean of ``-log_softmax(logits)`` taken
    over all decoder positions and all vocabulary entries (natural log).

    NOTE(review): the phrase window from ``audio_context`` was previously
    converted to sample indices but never applied, so the whole clip has
    always been scored. That dead computation is removed here. Applying the
    window would change every reported value, so confirm the intended
    conditioning before wiring it in.

    Parameters
    ----------
    audio_data : dict
        Maps a file number to
        ``(phrase_ru, phrase_l2, sentence, audio_clip, duration)``.
    audio_context : dict
        Currently not used (see the note above); kept for interface
        compatibility.
    language : str
        Currently not used; kept for interface compatibility.

    Returns
    -------
    dict
        Maps each file number to
        ``(phrase_ru, phrase_l2, sentence, surprisal_value)``.
    """
    # The dummy decoder input is identical for every clip, so build it once.
    decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id

    surprisal_values = {}

    for file_number, (phrase_ru, phrase_l2, sentence, audio_clip, duration) in audio_data.items():
        # Preprocess the whole clip into model input features
        context_input_values = processor(audio_clip, return_tensors="pt", sampling_rate=16000).input_features

        # Generate logits from the model without tracking gradients
        with torch.no_grad():
            context_logits = model(context_input_values, decoder_input_ids=decoder_input_ids).logits

        # log_softmax is computed in log space. log(softmax(x)) turns any
        # probability that underflows to 0 into -inf, which makes the mean infinite.
        context_log_probs = F.log_softmax(context_logits, dim=-1)
        surprisal_value = -context_log_probs.mean().item()

        surprisal_values[file_number] = (phrase_ru, phrase_l2, sentence, surprisal_value)
        print(surprisal_value)
    return surprisal_values


# Compute the surprisal values and write one CSV file per language
for language in languages:
    print(language)
    audio_data_language = audio_data[language]
    surprisal_values_language = get_conditional_surprisal(audio_data_language, audio_duration, language)

    # Make sure the output folder is present
    save_directory = "results/surprisal_whisper_medium_ru"
    os.makedirs(save_directory, exist_ok=True)

    # One record per clip; the dict keys become the CSV column names
    data_list = [
        {
            'audio_number': audio_number,
            'Expression RU': expression_ru,
            'Expression L2': expression_l2,
            'Sentence': sentence,
            'Surprisal Data': surprisal_data,
        }
        for audio_number, (expression_ru, expression_l2, sentence, surprisal_data)
        in surprisal_values_language.items()
    ]

    # Build the table and store it without the index column
    df = pd.DataFrame(data_list)
    df.to_csv(
        os.path.join(save_directory, f'{language}_surprisal_data_whisper_medium_ru.csv'),
        index=False,
    )

BE
34.216163635253906
34.276607513427734
31.537004470825195
35.64483642578125
35.611106872558594
28.639984130859375
32.06452941894531
31.96065330505371
32.72020721435547
34.264888763427734
28.21198272705078
37.12773132324219
31.760637283325195
35.682682037353516
31.604717254638672
34.398563385009766
32.731143951416016
36.41058349609375
35.72095489501953
35.41029739379883
26.8190860748291
32.17329025268555
34.84971237182617
34.73724365234375
38.0227165222168
32.902828216552734
33.01447296142578
31.9245662689209
34.30569076538086
33.89590835571289
33.794944763183594
32.416500091552734
32.3490104675293
37.685707092285156
30.158742904663086
30.072235107421875
32.76182556152344
30.383220672607422
33.5283203125
29.880266189575195
30.38016700744629
31.136682510375977
26.666027069091797
34.73072052001953
35.864620208740234
35.92337417602539
36.22817611694336
32.810646057128906
34.078392028808594
32.25362777709961
33.09435272216797
36.970314025878906
32.926029205322266
32.43156814575195
31.6660