In [4]:
import pprint
import pandas as pd
import torch, whisper, glob
import torch.nn.functional as F
from whisper.model import Whisper
from typing import List, Optional, Dict
from scrape import get_entire_web_google_results
from whisper.tokenizer import Tokenizer, get_tokenizer
from whisper.audio import N_FRAMES, N_MELS, log_mel_spectrogram, pad_or_trim

# PART 01

## Whisper Speech 2 Text Hugging Face

In [None]:
@torch.no_grad()
def calculate_audio_features(audio_path: Optional[str], model: Whisper) -> torch.Tensor:
    """Run one mel segment through the model's audio encoder.

    Args:
        audio_path: Path of the audio file to encode. When ``None``, an all-zero
            float32 segment of shape ``(N_MELS, N_FRAMES)`` is encoded instead.
        model: Whisper model; the segment is moved to ``model.device`` and passed
            to ``model.embed_audio``.

    Returns:
        Whatever ``model.embed_audio`` returns for a batch holding the single segment.
    """
    if audio_path is not None:
        # Real audio: log-mel spectrogram, padded or trimmed to N_FRAMES frames.
        mel_segment = pad_or_trim(log_mel_spectrogram(audio_path), N_FRAMES)
    else:
        # No audio given: use an all-zero placeholder segment.
        mel_segment = torch.zeros((N_MELS, N_FRAMES), dtype=torch.float32)

    # Move to the model's device, then add the leading batch dimension.
    batch = mel_segment.to(model.device).unsqueeze(0)
    return model.embed_audio(batch)


@torch.no_grad()
def calculate_average_logprobs(
                                model: Whisper,
                                audio_features: torch.Tensor,
                                class_names: List[str],
                                tokenizer: Tokenizer,
                            ) -> torch.Tensor:
    """Score every class name by the mean log-probability of its tokens.

    For each class name the decoder input is built as
    ``sot_sequence_including_notimestamps + encode(" " + name) + eot`` and passed,
    together with ``audio_features``, to ``model.logits``.

    Args:
        model: Whisper model providing ``logits`` and ``device``.
        audio_features: Encoder features forwarded unchanged to ``model.logits``.
        class_names: Candidate strings to score.
        tokenizer: Tokenizer providing ``encode``, ``eot`` and
            ``sot_sequence_including_notimestamps``.

    Returns:
        A 1-D tensor created with ``torch.zeros(len(class_names))`` whose i-th entry
        is the mean gathered log-probability for ``class_names[i]``.
    """
    device = model.device
    sot_sequence = tokenizer.sot_sequence_including_notimestamps
    prompt_tokens = torch.tensor(sot_sequence).unsqueeze(0).to(device)
    eot_token = torch.tensor([tokenizer.eot]).unsqueeze(0).to(device)
    # Rows are kept from the last prompt position onward.
    # NOTE(review): presumably row k holds the prediction for token k + 1 — confirm.
    first_row = len(sot_sequence) - 1

    scores = torch.zeros(len(class_names))
    for position, name in enumerate(class_names):
        name_tokens = torch.tensor(tokenizer.encode(" " + name)).unsqueeze(0).to(device)
        decoder_input = torch.cat([prompt_tokens, name_tokens, eot_token], dim=1)

        all_logprobs = F.log_softmax(
            model.logits(decoder_input, audio_features), dim=-1  # (1, T, V)
        ).squeeze(0)  # (T, V)
        # Drop the rows before `first_row` and the final row.  # (T', V)
        candidate_rows = all_logprobs[first_row:-1]
        # The index has one row per class-name token, so gather reads only the
        # leading rows of `candidate_rows`; any extra trailing row is ignored.
        token_logprobs = torch.gather(
            candidate_rows, dim=-1, index=name_tokens.view(-1, 1)
        )  # (T', 1)
        scores[position] = token_logprobs.mean().item()

    return scores


def calculate_internal_lm_average_logprobs(
                                            model: Whisper,
                                            class_names: List[str],
                                            tokenizer: Tokenizer,
                                            verbose: bool = False,
                                        ) -> torch.Tensor:
    """Score the class names against the features of an empty (``None``) audio input.

    The features come from ``calculate_audio_features(None, model)`` and are scored
    with ``calculate_average_logprobs``.

    Args:
        model: Whisper model used for both the features and the scoring.
        class_names: Candidate strings to score.
        tokenizer: Tokenizer forwarded to ``calculate_average_logprobs``.
        verbose: When true, print one line per class with its score.

    Returns:
        The tensor returned by ``calculate_average_logprobs`` for the empty input.
    """
    empty_input_features = calculate_audio_features(None, model)
    baseline_logprobs = calculate_average_logprobs(
        model=model,
        audio_features=empty_input_features,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    if not verbose:
        return baseline_logprobs

    print("Internal LM average log probabilities for each class:")
    for class_name, logprob in zip(class_names, baseline_logprobs):
        print(f"  {class_name}: {logprob:.3f}")
    return baseline_logprobs

# Loaded Whisper models keyed by model name; zero_shot_classify fills this on first
# use of a name and reuses the stored model on later calls.
model_cache = {}

def generate_candidate_str(command_path = 'data/voice_command/commands.txt'):
    """Build the comma-separated candidate string from a commands file.

    Each line of the file is stripped of surrounding whitespace; lines that are
    empty after stripping are skipped. Every remaining command is wrapped in
    square brackets and the results are joined with ``,``.

    Args:
        command_path: Path of the text file holding one command per line.

    Returns:
        A string such as ``"[first],[second]"``; the empty string when the file
        contains no non-blank lines.
    """
    with open(command_path, 'r') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        bracketed = [f"[{command}]" for command in stripped_lines if command]
    return ','.join(bracketed)

# Internal-LM scores depend only on the model and the candidate list, never on the
# audio being classified, so they are memoized per (model name, class names) instead
# of being recomputed on every call.
_internal_lm_cache = {}


def zero_shot_classify(
                        audio_path: str,
                        model_name = 'small'
                        ) -> Dict[str, float]:
    """Classify one audio file against the candidate commands.

    The candidates come from ``generate_candidate_str()`` split on ``,``. Each
    candidate's average log-probability given the audio is reduced by its
    internal-LM average log-probability (the score obtained for an empty audio
    input), and the differences are turned into scores with a softmax.

    Args:
        audio_path: Path of the audio file to classify.
        model_name: Name passed to ``whisper.load_model``. A multilingual tokenizer
            is used unless the name contains ``".en"``.

    Returns:
        A dict mapping every candidate string (as produced by
        ``generate_candidate_str``, i.e. still wrapped in brackets) to its score.
    """
    class_names = generate_candidate_str().split(",")
    tokenizer = get_tokenizer(multilingual=".en" not in model_name)

    # Load each model once and reuse it on later calls.
    if model_name not in model_cache:
        model_cache[model_name] = whisper.load_model(model_name)
    model = model_cache[model_name]

    # The internal-LM baseline is independent of `audio_path`; compute it once per
    # model / candidate-list combination.
    cache_key = (model_name, tuple(class_names))
    if cache_key not in _internal_lm_cache:
        _internal_lm_cache[cache_key] = calculate_internal_lm_average_logprobs(
            model=model,
            class_names=class_names,
            tokenizer=tokenizer,
        )
    internal_lm_average_logprobs = _internal_lm_cache[cache_key]

    audio_features = calculate_audio_features(audio_path, model)
    average_logprobs = calculate_average_logprobs(
        model=model,
        audio_features=audio_features,
        class_names=class_names,
        tokenizer=tokenizer,
    )
    # Out-of-place subtraction: the cached baseline tensor must never be modified.
    scores = (average_logprobs - internal_lm_average_logprobs).softmax(-1).tolist()
    return dict(zip(class_names, scores))

### Analyze all audio files in the dataset

In [None]:
def extract_command(file_name):
    """Return the last path component of ``file_name``.

    Forward slashes and backslashes are both treated as separators, so the
    function also works on Windows-style paths that were not normalized by the
    caller beforehand.

    Args:
        file_name: Path string.

    Returns:
        The text after the final separator, or ``file_name`` itself when it
        contains no separator.
    """
    return file_name.replace('\\', '/').rsplit('/', 1)[-1]

def process_output(output):
    """Return the best-scoring key of ``output`` without its first and last character.

    Args:
        output: Mapping from candidate string to score, e.g. ``{"[stop]": 0.9}``.
            On ties the key that comes first in iteration order wins.

    Returns:
        The winning key with one character removed from each end (the enclosing
        brackets for keys shaped like ``"[stop]"``).
    """
    best_key, _best_score = max(output.items(), key=lambda pair: pair[1])
    return best_key[1:-1]

In [None]:
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting the
# normalized paths makes the row order of result.csv reproducible between runs.
audio_files = sorted(
    path.replace('\\', '/') for path in glob.glob('data/voice_command/*/*.wav')
)

# Column-oriented accumulator: one list per output column.
result_dict = {'file_name': [], 'command': []}

for audio_file in audio_files:
    # Highest-scoring command for this file, with the enclosing brackets removed.
    result = process_output(zero_shot_classify(audio_file))
    result_dict['file_name'].append(extract_command(audio_file))
    result_dict['command'].append(result)

df = pd.DataFrame(result_dict)
df.to_csv('data/voice_command/result.csv', index=False)

### Inference

In [None]:
def inference_voice_command(audio_file):
    """Classify a single audio file and return the winning command.

    Args:
        audio_file: Path of the audio file, forwarded to ``zero_shot_classify``.

    Returns:
        The best-scoring candidate as post-processed by ``process_output``.
    """
    class_scores = zero_shot_classify(audio_file)
    return process_output(class_scores)

# PART 02

In [3]:
def scraping_references(search_item):
    """Search the web for ``search_item`` and return the hits as a list of records.

    Args:
        search_item: Query passed to ``get_entire_web_google_results``.

    Returns:
        The result of ``.to_dict('records')`` on the ``title``, ``description`` and
        ``DOI`` columns of the search result.
        NOTE(review): presumably the search result is a pandas DataFrame, so this
        is one dict per row — confirm against ``scrape``.
    """
    wanted_columns = ['title', 'description', 'DOI']
    search_results = get_entire_web_google_results(search_item)
    return search_results[wanted_columns].to_dict('records')

In [5]:
# Sample input: a paragraph of text that is passed verbatim as the search item.
text = '''Economics is a social science that studies how societies allocate scarce resources among competing demands. It examines the production, distribution, and consumption of goods and services, as well as the behavior of individuals, firms, and governments in making economic decisions. “Economics focuses on the behaviour and interactions of economic agents and how economies work. Microeconomics analyzes what's viewed as basic elements in the economy, including individual agents and markets, their interactions, and the outcomes of interactions. Individual agents may include, for example, households, firms, buyers, and sellers.”  The two main branches of economics are microeconomics, which focuses on individual economic behavior and decision-making, and macroeconomics, which looks at the economy as a whole. '''
# Look up references for the paragraph and pretty-print the returned records.
response = scraping_references(text)
pprint.pprint(response)

[{'DOI': 'https://en.wikipedia.org/wiki/Economics',
  'description': 'Economics is a social science that studies the production, '
                 'distribution, and consumption of goods and services. ... '
                 'Economics focuses on the behaviour and interactions of '
                 'economic agents\xa0...',
  'title': 'Economics - Wikipedia'},
 {'DOI': 'https://www.investopedia.com/terms/e/economics.asp',
  'description': 'Economics is a social science that focuses on the '
                 'production, distribution, and consumption of goods and '
                 'services, and analyzes the choices that individuals\xa0...',
  'title': 'Economics Defined with Types, Indicators, and Systems'},
 {'DOI': 'https://www.bu.edu/eci/files/2019/05/MIC_4e_Ch7.pdf',
  'description': 'behavioral economics: a subfield of microeconomics that uses '
                 'insights from various social and biological sciences to '
                 'explore how people make actual economic\xa