### Dependency Setup

In [0]:
!pip install git+https://github.com/openai/whisper.git

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-75a4dp1b
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-75a4dp1b
  Resolved https://github.com/openai/whisper.git to commit e58f28804528831904c3b6f2c0e473f346223433
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting triton<3,>=2.0.0
  Downloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 89.2/89.2 MB 10.3 MB/s eta 0:00:

In [0]:
import re
import json
import openai
import pandas as pd
import soundfile as sf
import whisper
# Load the Whisper "large" checkpoint once; get_transcription() reads this
# module-level `model` for every file, so it must exist before that helper runs.
model = whisper.load_model("large")

  0%|                                              | 0.00/2.88G [00:00<?, ?iB/s]  0%|▏                                     | 9.97M/2.88G [00:00<00:29, 104MiB/s]  1%|▎                                     | 22.2M/2.88G [00:00<00:25, 118MiB/s]  1%|▍                                     | 33.5M/2.88G [00:00<00:28, 105MiB/s]  2%|▌                                     | 47.8M/2.88G [00:00<00:24, 122MiB/s]  2%|▊                                     | 59.6M/2.88G [00:00<00:26, 116MiB/s]  2%|▉                                     | 71.4M/2.88G [00:00<00:25, 117MiB/s]  3%|█                                     | 82.7M/2.88G [00:00<00:27, 109MiB/s]  3%|█▏                                    | 96.4M/2.88G [00:00<00:25, 119MiB/s]  4%|█▍                                     | 108M/2.88G [00:01<00:27, 107MiB/s]  4%|█▌                                     | 120M/2.88G [00:01<00:26, 113MiB/s]  4%|█▋                                     | 131M/2.88G [00:01<00:27, 105MiB/s]  5%|█▉                    

### Getting necessary parameters

In [0]:
# Runtime settings are read from `config.json` in the current working directory.
# get_llm_response() reads the keys 'api_key' and 'model_name' from this dict,
# so the file must exist and provide both before the driver cell is run.
# The encoding is given explicitly so parsing does not depend on the platform's
# default locale encoding.
with open('config.json', 'r', encoding = 'utf-8') as json_file:
    config = json.load(json_file)

[0;31m---------------------------------------------------------------------------[0m
[0;31mFileNotFoundError[0m                         Traceback (most recent call last)
File [0;32m<command-4503195805848441>, line 1[0m
[0;32m----> 1[0m [38;5;28;01mwith[39;00m [38;5;28;43mopen[39;49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mconfig.json[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[38;5;124;43m'[39;49m[38;5;124;43mr[39;49m[38;5;124;43m'[39;49m[43m)[49m [38;5;28;01mas[39;00m json_file:
[1;32m      2[0m     config [38;5;241m=[39m json[38;5;241m.[39mload(json_file)

File [0;32m/databricks/python/lib/python3.10/site-packages/IPython/core/interactiveshell.py:284[0m, in [0;36m_modified_open[0;34m(file, *args, **kwargs)[0m
[1;32m    277[0m [38;5;28;01mif[39;00m file [38;5;129;01min[39;00m {[38;5;241m0[39m, [38;5;241m1[39m, [38;5;241m2[39m}:
[1;32m    278[0m     [38;5;28;01mraise[39;00m [38;5;167;01mValueError[39;00m(
[1;32m    279

### Helper Functions

In [0]:
def get_transcription(audiofile):
    """Transcribe an audio file with the global Whisper ``model``.

    The plain transcription is always produced. Files whose *file name* starts
    with ``na`` or ``uk`` reuse that transcription as the English text
    (presumably these are English-language recordings -- confirm against the
    naming convention of the data set). Every other file gets a second Whisper
    pass with ``task='translate'``.

    Args:
        audiofile: File name or path of the audio file to transcribe.

    Returns:
        dict with two string entries: ``'original'`` (the transcription) and
        ``'english'`` (the transcription or its translation).
    """
    import os  # local import: only needed for the file-name prefix check

    # Both branches need the plain transcription, so run it exactly once.
    original = model.transcribe(audiofile, fp16 = False)["text"]

    # Check the prefix of the file name rather than of the whole argument, so
    # 'some/dir/uk_001.mp3' is classified the same way as 'uk_001.mp3'.
    prefix = os.path.basename(audiofile)[0:2]
    if prefix in ['na', 'uk']:
        english = original
    else:
        english = model.transcribe(audiofile, fp16 = False, task = 'translate')["text"]

    return {'original': original, 'english': english}


def get_audio_duration(audiofile):
    """Return the duration of an audio file in whole seconds (rounded down).

    Args:
        audiofile: File name or path readable by ``soundfile``.

    Returns:
        int: number of frames divided by the sample rate, floored.
    """
    # Use the file as a context manager so the handle is closed even if
    # reading the header attributes raises.
    with sf.SoundFile(audiofile) as f:
        audio_duration = int((f.frames / f.samplerate) // 1)
    return audio_duration


def get_llm_response(transcript):
    """Ask the configured LLM to extract product facts from a transcript.

    Builds a system + user chat prompt around ``transcript`` and sends it to
    the model named in ``config['model_name']`` using ``config['api_key']``.

    Args:
        transcript: English text of the audio transcript.

    Returns:
        str: the model's reply with surrounding whitespace stripped. The prompt
        asks for JSON, but the text is returned as-is and is not parsed here.
    """

    conversation = [
        {"role": "system", "content": "You are a market expert who can identify the pros and cons of a product from a text."},
        {"role": "user", "content": f"""Understand the following transcript and extract the following items 
                                        [brand, sub-brand, benefits, problems, product category].
                                        When looking for benefits and problems, consider only those arising out of use of the product.  
                                        Here is the text : {transcript}
                                        Present only the results in a neat JSON format. Always use lists when representing the values.
                                        Bargain detergents are competing products. So ignore the problems and benefits of competing products.
                                        Take a deep breath. Let us solve this rationally.
                                        """}
    ]

    openai.api_key = config['api_key']
    # The prompt is a list of role-tagged messages, so it must go through the
    # chat endpoint and actually be passed in; previously the request carried
    # no prompt at all and the transcript never reached the model.
    results = openai.ChatCompletion.create(
        engine = config['model_name'],
        messages = conversation,
        max_tokens = 200
    )

    # Chat responses carry the text under message.content; guard against a
    # missing/None content so .strip() cannot fail on it.
    response = (results['choices'][0]['message']['content'] or '').strip()

    return response


def type_correction(resp):
    """Normalise an LLM extraction result into a flat, predictable dict.

    Args:
        resp: Either a dict, or the raw text reply of the LLM containing a
            JSON object (optionally wrapped in extra text such as a Markdown
            code fence).

    Returns:
        dict: the (mutated) input dict, or the dict parsed from the text, with
            * keys lower-cased and spaces/hyphens replaced by underscores
              (so ``"sub-brand"`` becomes ``sub_brand``);
            * ``brand``, ``sub_brand``, ``product_category`` present as a list,
              or ``''`` when missing/empty;
            * ``benefits`` / ``problems`` joined into one ``', '``-separated
              string, with ``benefits_count`` / ``problems_count`` holding the
              number of items.

    Raises:
        ValueError: if ``resp`` is text that contains no parsable JSON object
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if isinstance(resp, str):
        # Keep only the outermost {...} so fences or chatter around the JSON
        # do not break parsing.
        start, end = resp.find('{'), resp.rfind('}')
        if start == -1 or end < start:
            raise ValueError('LLM response does not contain a JSON object')
        resp = json.loads(resp[start:end + 1])

    # The prompt names the fields "sub-brand" and "product category", while the
    # code below reads sub_brand / product_category; map one onto the other.
    for key in list(resp):
        canonical = re.sub(r'[\s\-]+', '_', str(key).strip().lower())
        if canonical != key and canonical not in resp:
            resp[canonical] = resp.pop(key)

    for value in ['brand', 'sub_brand', 'product_category', 'benefits', 'problems']:
        x = resp.get(value)  # a field the model left out counts as empty
        if x:
            if isinstance(x, str):
                resp[value] = [x]
        else:
            resp[value] = ''

    for value in ['benefits', 'problems']:
        items = resp[value]
        resp[f'{value}_count'] = len(items)
        # str() keeps the join from failing on non-string list items.
        resp[value] = ', '.join(str(item) for item in items)

    return resp


def wordcounter(text, word):
    """Count whole-word, case-sensitive occurrences of ``word`` in ``text``.

    Both arguments are tokenised the same way: every character outside
    ``A-Z``, ``a-z`` and ``0-9`` is treated as a separator. ``word`` may
    consist of several tokens (e.g. ``"Tide Pods"``); it is then counted
    wherever that token sequence appears contiguously in ``text``.

    Args:
        text: Text to search in.
        word: Word or multi-word phrase to count.

    Returns:
        int: number of occurrences; ``0`` when ``word`` has no tokens.
    """
    def _tokens(raw):
        # split() already collapses whitespace runs and drops empty strings.
        return re.sub(r'[^A-Za-z0-9]', ' ', raw).split()

    haystack = _tokens(text)
    needle = _tokens(word)
    if not needle:
        return 0

    width = len(needle)
    return sum(
        1
        for start in range(len(haystack) - width + 1)
        if haystack[start:start + width] == needle
    )


def brand_subbrand_correction(resp = None, transcript = None):
    """Record how often each brand and sub-brand is mentioned in a transcript.

    For each of the fields ``brand`` and ``sub_brand`` that holds at least one
    name, stores a list of counts (one per name, same order) under
    ``brand_count`` / ``sub_brand_count``. A field with no names leaves its
    ``*_count`` key untouched.

    Args:
        resp: Dict to read the names from and write the counts to. Defaults to
            the global ``corrected_llm_resp`` set by the driver loop.
        transcript: Text to count mentions in. Defaults to
            ``whisper_response['english']`` from the driver loop.

    Returns:
        None. ``resp`` is modified in place.
    """
    if resp is None:
        resp = corrected_llm_resp
    if transcript is None:
        transcript = whisper_response['english']

    for field in ('brand', 'sub_brand'):
        names = resp.get(field) or []
        if isinstance(names, str):
            # A bare string is one name, not a sequence of characters.
            names = [names]
        if names:
            # Build the whole list at once so every name keeps its count
            # (resetting the list per name kept only the last one).
            resp[f'{field}_count'] = [wordcounter(transcript, str(name)) for name in names]


### Driver

In [0]:
files_list = ['de_001.mp3', 'ph_001.mp3', 'ph_002.mp3', 'ph_003.mp3', 'de_002.mp3', 'de_003.mp3', 'ksa_002.mp3', 'ja_001.mp3', 'ksa_003.mp3', 
              'na_002.mp3', 'uk_001.mp3', 'na_001.mp3', 'uk_002.mp3', 'uk_003.mp3', 'ja_002.mp3', 'uk_004.mp3', 'ja_004.mp3', 'uk_005.mp3', 
              'ja_003.mp3', 'ksa_001.mp3']

all_responses = []

for filename in files_list:
    whisper_response = get_transcription(filename)

    llm_response = get_llm_response(whisper_response['english'])
    corrected_llm_resp = type_correction(llm_response)
    brand_subbrand_correction()

    corrected_llm_resp['original'] = whisper_response['original'].strip()
    corrected_llm_resp['english'] = whisper_response['english'].strip()
    corrected_llm_resp['duration'] = get_audio_duration(filename)
    corrected_llm_resp['filename'] = filename
    all_responses.append(corrected_llm_resp)
    print(filename)
    

de_001.mp3
ph_001.mp3
ph_002.mp3
ph_003.mp3
de_002.mp3
de_003.mp3
ksa_002.mp3
ja_001.mp3
ksa_003.mp3
na_002.mp3
uk_001.mp3
na_001.mp3
uk_002.mp3
uk_003.mp3
ja_002.mp3
uk_004.mp3
ja_004.mp3
uk_005.mp3
ja_003.mp3
ksa_001.mp3


### Export to CSV

In [0]:
def _to_cell(value):
    """Collapse a list-valued field into a single CSV-friendly cell value.

    A one-element list becomes that element, any other list becomes a
    ', '-joined string ('' when empty); non-list values pass through unchanged.
    """
    if isinstance(value, (list, tuple)):
        if len(value) == 1:
            return value[0]
        return ', '.join(str(item) for item in value)
    return value


# Flatten every record first so that list fields of any length (several brands,
# or none) fit into one row, then build the frame in a single step instead of
# growing it with one concat per record.
records = [
    {key: _to_cell(value) for key, value in resp.items()}
    for resp in all_responses
]
df = pd.DataFrame(records)

# Alphabetical column order keeps the CSV layout independent of the order in
# which keys were inserted into the record dicts.
df = df.reindex(columns = sorted(df.columns))
df.to_csv('transcript_extract.csv', index = False)