In [5]:
import requests
import os
from dotenv import load_dotenv

load_dotenv()

AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
API_VERSION = os.getenv("API_VERSION")

def transcribe_audio_to_text(audio_file_path, save_path=None):


    headers = {
        "api-key": AZURE_OPENAI_KEY
    }
    files = {
        "file": open(audio_file_path, "rb")
    }
    params = {
        "api-version": API_VERSION,
        "language": "es",
        "prompt": "You are going to be provided audio of interviews in Spanish, help transcribe them returning the text in Spanish",
        "temperature":0
    }

    response = requests.post(AZURE_OPENAI_ENDPOINT, headers=headers, files=files, params=params)

    files["file"].close()

    if response.status_code == 200:
        transcription = response.json().get('text')
        if transcription:
            save_path = save_path or os.path.splitext(audio_file_path)[0] + '.txt'
            with open(save_path, 'w') as text_file:
                text_file.write(transcription)
            print(f"Transcription saved to {save_path}")
        else:
            print("Transcription received but was empty.")
    else:
        print("Error:", response.status_code, response.text)


In [8]:
from pydub import AudioSegment
import os

def convert_media_to_mp3(directory):
    """
    Converts all .m4a and .mp4 files in the specified directory to .mp3 format and saves them in a subdirectory named 'mp3'.

    Parameters:
    directory (str): The path to the directory containing .m4a and .mp4 files.
    """
    mp3_directory = os.path.join(directory, 'mp3')

    #
    if not os.path.exists(mp3_directory):
        os.makedirs(mp3_directory)

    for filename in os.listdir(directory):
        if filename.endswith('.m4a') or filename.endswith('.mp4'):
            file_path = os.path.join(directory, filename)
            mp3_filename = os.path.splitext(filename)[0] + '.mp3'
            mp3_path = os.path.join(mp3_directory, mp3_filename)


            audio = AudioSegment.from_file(file_path, format="mp4" if filename.endswith('.mp4') else "m4a")
            audio.export(mp3_path, format="mp3")
            print(f"Converted {filename} to MP3 and saved in 'mp3' subdirectory")

In [9]:
convert_media_to_mp3("/Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio")

Converted 01.MFY (GIJÓN EDP COORDIS 17112023).mp4 to MP3 and saved in 'mp3' subdirectory
Converted 04.MFY (BCN Volotea coordis 01122023).m4a to MP3 and saved in 'mp3' subdirectory
Converted 05.MFY (EMU LEROY MERLIN AGENTES 21112023).m4a to MP3 and saved in 'mp3' subdirectory
Converted 02.MFY (BCN DANONE Agentes 01122023).m4a to MP3 and saved in 'mp3' subdirectory
Converted 06.MFY (OM IB SUPER+COORDI 22112023).m4a to MP3 and saved in 'mp3' subdirectory
Converted 14.MFY (VIGO IB, Ocaso, Naturgy, Mutua COORDIS 12122023).m4a to MP3 and saved in 'mp3' subdirectory
Converted 15.MFY (VIGO IbVentas + Naturgy Agentes 12122024).m4a to MP3 and saved in 'mp3' subdirectory
Converted 12.MFY (SANRO Carrefour agentes 23112023).m4a to MP3 and saved in 'mp3' subdirectory
Converted 03.MFY (BCN VOLOTEA AGENTES 01122023).m4a to MP3 and saved in 'mp3' subdirectory
Converted 11.MFY (OM ZARA AGENTES 22112023).m4a to MP3 and saved in 'mp3' subdirectory
Converted 09.MFY (OM SANTANDER AGENTES NUEVOS 21112023).m

In [4]:
from pydub import AudioSegment
import os

def cut_large_mp3_files(directory):
    """
    Cuts .mp3 files larger than 25 MB in the specified directory into two halves.

    Parameters:
    directory (str): The path to the directory containing .mp3 files.
    """
    for filename in os.listdir(directory):
        if filename.endswith('.mp3'):
            file_path = os.path.join(directory, filename)
            file_size = os.path.getsize(file_path) / (1024 * 1024)  # size in MB

            if file_size > 25:
                audio = AudioSegment.from_file(file_path)

                half_way_point = len(audio) // 2
                audio_part1 = audio[:half_way_point]
                audio_part2 = audio[half_way_point:]

                base_filename = filename.replace('.mp3', '')
                audio_part1.export(os.path.join(directory, f"{base_filename} 1.mp3"), format="mp3")
                audio_part2.export(os.path.join(directory, f"{base_filename} 2.mp3"), format="mp3")

                print(f"Processed {filename}")


In [5]:
cut_large_mp3_files("/Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio")

Processed 16.MFY (VIGO ING AGENTES 12122023).mp3
Processed 07.MFY (OM IBERDOLA AGENTES 22112023).mp3
Processed 10.MFY (OM Santander coordinadores 21122023).mp3
Processed 18.MFY (VIGO Ocaso, Mutua, Sabadell AGENTES 12122023).mp3
Processed 15.MFY (VIGO IbVentas + Naturgy Agentes 12122024).mp3
Processed 12.MFY (SANRO Carrefour agentes 23112023).mp3
Processed 02.MFY (BCN DANONE Agentes 01122023).mp3
Processed 14.MFY (VIGO IB, Ocaso, Naturgy, Mutua COORDIS 12122023).mp3
Processed 06.MFY (OM IB SUPER+COORDI 22112023).mp3
Processed 05.MFY (EMU LEROY MERLIN AGENTES 21112023).mp3


In [12]:
from pydub import AudioSegment
import os

def process_and_cleanup_mp3_files(directory):
    """
    Processes .mp3 files larger than 25 MB by cutting them into two halves and deletes the original file.

    Parameters:
    directory (str): The path to the directory containing .mp3 files.
    """
    for filename in os.listdir(directory):
        if filename.endswith('.mp3'):
            file_path = os.path.join(directory, filename)
            file_size = os.path.getsize(file_path) / (1024 * 1024)  # size in MB

            if file_size > 24:
                audio = AudioSegment.from_file(file_path)

                half_way_point = len(audio) // 2
                audio_part1 = audio[:half_way_point]
                audio_part2 = audio[half_way_point:]

                base_filename = filename.replace('.mp3', '')
                audio_part1.export(os.path.join(directory, f"{base_filename} 1.mp3"), format="mp3")
                audio_part2.export(os.path.join(directory, f"{base_filename} 2.mp3"), format="mp3")

                os.remove(file_path)

                print(f"Processed and deleted {filename}")


In [13]:
process_and_cleanup_mp3_files("/Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3")

Processed and deleted 07.MFY (OM IBERDOLA AGENTES 22112023).mp3
Processed and deleted 10.MFY (OM Santander coordinadores 21122023).mp3
Processed and deleted 18.MFY (VIGO Ocaso, Mutua, Sabadell AGENTES 12122023).mp3
Processed and deleted 15.MFY (VIGO IbVentas + Naturgy Agentes 12122024).mp3
Processed and deleted 12.MFY (SANRO Carrefour agentes 23112023).mp3
Processed and deleted 03.MFY (BCN VOLOTEA AGENTES 01122023).mp3
Processed and deleted 02.MFY (BCN DANONE Agentes 01122023).mp3
Processed and deleted 14.MFY (VIGO IB, Ocaso, Naturgy, Mutua COORDIS 12122023).mp3
Processed and deleted 06.MFY (OM IB SUPER+COORDI 22112023).mp3
Processed and deleted 05.MFY (EMU LEROY MERLIN AGENTES 21112023).mp3


In [6]:
import requests
import os
from dotenv import load_dotenv

load_dotenv()

AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
API_VERSION = os.getenv("API_VERSION")

def transcribe_audio_to_text(audio_file_path, save_path=None):



    headers = {
        "api-key": AZURE_OPENAI_KEY
    }
    files = {
        "file": open(audio_file_path, "rb")
    }
    params = {
        "api-version": API_VERSION,
        "language": "es",
        "prompt": "You are going to be provided audio of interviews in Spanish, help transcribe them returning the text in Spanish",
        "temperature":0
    }


    response = requests.post(AZURE_OPENAI_ENDPOINT, headers=headers, files=files, params=params)


    files["file"].close()


    if response.status_code == 200:
        transcription = response.json().get('text')
        if transcription:
            # Save the transcription
            save_path = save_path or os.path.splitext(audio_file_path)[0] + '.txt'
            with open(save_path, 'w') as text_file:
                text_file.write(transcription)
            print(f"Transcription saved to {save_path}")
        else:
            print("Transcription received but was empty.")
    else:
        print("Error:", response.status_code, response.text)

transcribe_audio_to_text("/Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3/01.MFY (GIJÓN EDP COORDIS 17112023).mp3")


Transcription saved to /Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3/01.MFY (GIJÓN EDP COORDIS 17112023).txt


In [3]:
import time

def transcribe_all_mp3_in_directory(directory, retry_delay=60, max_retries=3):
    """
    Processes all .mp3 files in the specified directory for transcription, skipping files already transcribed.

    Parameters:
    directory (str): The path to the directory containing .mp3 files.
    retry_delay (int): Time in seconds to wait before retrying after a 503 error.
    max_retries (int): Maximum number of retries for each file in case of 503 errors.
    """
    for filename in os.listdir(directory):
        if filename.endswith('.mp3'):
            audio_file_path = os.path.join(directory, filename)
            transcription_save_path = os.path.splitext(audio_file_path)[0] + '.txt'


            if os.path.exists(transcription_save_path):
                print(f"Skipping {filename}, transcription already exists.")
                continue


            retries = 0
            while retries < max_retries:
                try:
                    transcribe_audio_to_text(audio_file_path, save_path=transcription_save_path)
                    break  # Break the loop if transcription is successful
                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 503:
                        print(f"503 Service Unavailable error for {filename}. Retrying in {retry_delay} seconds.")
                        time.sleep(retry_delay)
                        retries += 1
                    else:
                        print(f"Error transcribing {filename}: {e}")
                        break
                except Exception as e:
                    print(f"An error occurred while transcribing {filename}: {e}")
                    break

In [28]:
transcribe_all_mp3_in_directory("data/audio/mp3")

Skipping 14.MFY (VIGO IB, Ocaso, Naturgy, Mutua COORDIS 12122023) 1.mp3, transcription already exists.
Transcription saved to /Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3/01.MFY (GIJÓN EDP COORDIS 17112023).txt
Transcription saved to /Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3/14.MFY (VIGO IB, Ocaso, Naturgy, Mutua COORDIS 12122023) 2.txt
Transcription saved to /Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3/18.MFY (VIGO Ocaso, Mutua, Sabadell AGENTES 12122023) 1.txt
Transcription saved to /Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3/16.MFY (VIGO ING AGENTES 12122023) 2.txt
Transcription saved to /Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3/08.MFY (OM Iberdrola AGENTES 23112023).txt
Transcription saved to /Users/nicolasmartinez-geijovila/cod

## The function below is just to concatenate the different text files into a single file

In [29]:
import os

def create_knowledge_base_with_titles(directory):
    """
    Reads all text files in the specified directory and combines their content into a file named 'knowledge_base.txt'.
    Each document is preceded by its title (derived from the filename) and followed by two line spaces.

    Parameters:
    directory (str): The path to the directory containing text files.
    """
    knowledge_base_path = os.path.join(directory, 'knowledge_base.txt')

    with open(knowledge_base_path, 'w') as kb_file:
        for filename in os.listdir(directory):
            if filename.endswith('.txt') and filename != 'knowledge_base.txt':
                file_path = os.path.join(directory, filename)
                title = os.path.splitext(filename)[0]  # Remove the file extension to get the title

                with open(file_path, 'r') as file:
                    content = file.read()
                    kb_file.write(f"# {title}\n\n{content}\n\n")  # Add title and content with spacing

    print(f"Knowledge base created at {knowledge_base_path}")





create_knowledge_base_with_titles("/Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3")


Knowledge base created at /Users/nicolasmartinez-geijovila/code/IamjustNick/intelcia/audio_transcriptions/data/audio/mp3/knowledge_base.txt
