In [58]:
# !pip install streamlit

Collecting streamlit
  Using cached streamlit-1.36.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Using cached pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Using cached streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
Using cached altair-5.3.0-py3-none-any.whl (857 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: pydeck, altair, streamlit
Successfully installed altair-5.3.0 pydeck-0.9.1 streamlit-1.36.0


In [2]:
import requests
import urllib.parse
import os
from datetime import datetime
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
import io
from googleapiclient.http import MediaIoBaseDownload
from unstructured.partition.pdf import partition_pdf
import glob
import json
import time
from dotenv import load_dotenv

In [54]:
def search_pdfs_on_github(query, num_urls=5):
    load_dotenv()
    base_url = "https://www.googleapis.com/customsearch/v1"
    #GOOGLE_SEARCH_API
    api_key = os.environ.get("GOOGLE_SEARCH_API")
    # GOOGLE_SEARCH_ENGINE_ID
    search_engine_id = os.environ.get("GOOGLE_SEARCH_ENGINE_ID")
    
    params = {
        "q": f"{query} site:github.com filetype:pdf",
        "key": api_key,
        "cx": search_engine_id,
        "num": num_urls
    }
    
    print(f"Request parameters: {params}")
    print(f"Request URL: {base_url}?{urllib.parse.urlencode(params)}")
    
    print("Sending request to Google Custom Search API...")
    response = requests.get(base_url, params=params)
    print("Response received.")
    
    if response.status_code == 200:
        print("Parsing response data...")
        data = response.json()
        print("Response data parsed.")
        print(f"Response data: {data}")
        
        pdf_urls = []
        if "items" in data:
            print("Found 'items' in response data.")
            for item in data["items"]:
                pdf_url = item["link"]
                pdf_urls.append(pdf_url)
                print(f"Added URL: {pdf_url}")
        else:
            print("No 'items' found in response data.")
    else:
        print(f"Error: {response.status_code}")
        print(response.json())
        pdf_urls = []
    
    print(f"Found {len(pdf_urls)} PDF URLs.")
    return pdf_urls


def create_folder(name, parent_id=None):
    file_metadata = {
        'name': name,
        'mimeType': 'application/vnd.google-apps.folder'
    }
    if parent_id:
        file_metadata['parents'] = [parent_id]
    folder = drive_service.files().create(body=file_metadata, fields='id').execute()
    return folder.get('id')

def find_or_create_folder(name, parent_id=None):
    query = f"name='{name}' and mimeType='application/vnd.google-apps.folder'"
    if parent_id:
        query += f" and '{parent_id}' in parents"
    results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
    folders = results.get('files', [])
    if folders:
        return folders[0]['id']
    else:
        return create_folder(name, parent_id)


def download_and_upload_pdf(drive_service, url, folder_id):
    response = requests.get(url)
    if response.status_code == 200:
        file_name = os.path.basename(url)
        with open(file_name, 'wb') as f:
            f.write(response.content)
        
        file_metadata = {'name': file_name, 'parents': [folder_id]}
        media = MediaFileUpload(file_name, resumable=True)
        file = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        
        os.remove(file_name)  # Remove the local file after uploading
        print(f"File {file_name} uploaded successfully.")
        return file.get('id')
    else:
        print(f"Failed to download {url}")
        return None


def download_file(drive_service, file_id, file_name):
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
    fh.seek(0)
    with open(file_name, 'wb') as f:
        f.write(fh.read())

def pdf_to_text(file_path):
    elements = partition_pdf(filename=file_path)
    return '\n'.join([str(element) for element in elements])

def process_pdf(drive_service, file_id, file_name, folder_id):
    print(f"Processing {file_name}...")

    # Download the file
    download_file(drive_service, file_id, file_name)

    # Convert PDF to text
    text_content = pdf_to_text(file_name)

    # Save text content to a file
    text_file_name = f"{os.path.splitext(file_name)[0]}.txt"
    with open(text_file_name, 'w', encoding='utf-8') as f:
        f.write(text_content)

    # Upload text file to Google Drive
    file_metadata = {'name': text_file_name, 'parents': [folder_id]}
    media = MediaFileUpload(text_file_name, resumable=True)
    drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()

    # Clean up local files
    os.remove(file_name)
    os.remove(text_file_name)

    print(f"Processed {file_name} and uploaded {text_file_name}")

def upload_pdf_to_google_drive(drive_service, pdf_urls, subfolder_id, num_files_to_process=None):
    # Download and upload PDFs
    uploaded_files = []
    for url in pdf_urls[:num_files_to_process]:
        file_id = download_and_upload_pdf(drive_service, url, subfolder_id)
        if file_id:
            uploaded_files.append((file_id, os.path.basename(url)))

    print("All PDFs have been uploaded to Google Drive.")

    # Process uploaded PDFs
    print("Processing PDFs...")
    for file_id, file_name in uploaded_files:
        process_pdf(drive_service, file_id, file_name, subfolder_id)

    print("All PDFs have been processed and converted to text.")


def download_file(drive_service, file_id, file_name):
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.BytesIO()
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
    fh.seek(0)
    with open(file_name, 'wb') as f:
        f.write(fh.read())

def pdf_to_text(file_path):
    try:
        elements = partition_pdf(filename=file_path)
        return '\n'.join([str(element) for element in elements])
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def process_pdfs(drive_service, folder_id, local_dir, num_files=None):
    # Create local directory if it doesn't exist
    os.makedirs(local_dir, exist_ok=True)
    
    # List files in the Google Drive folder
    query = f"'{folder_id}' in parents and mimeType='application/pdf'"
    results = drive_service.files().list(q=query, spaces='drive', fields='files(id, name)').execute()
    files = results.get('files', [])
    
    # Limit the number of files to process if specified
    if num_files is not None:
        files = files[:num_files]
    
    for file in files:
        file_name = file['name']
        file_id = file['id']
        local_path = os.path.join(local_dir, file_name)
        
        print(f"Downloading {file_name}...")
        download_file(drive_service, file_id, local_path)
        
        print(f"Converting {file_name} to text...")
        text_content = pdf_to_text(local_path)
        
        if text_content:
            # Save text content
            text_file_path = os.path.splitext(local_path)[0] + '.txt'
            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(text_content)
            print(f"Processed {file_name}")
        else:
            print(f"Failed to process {file_name}")
    
    print("All specified PDFs have been downloaded and converted to text.")


def process_last_text_file(directory="pdfs_to_convert_to_text", preview_length=100):
    # Get all text files in the directory
    txt_files = glob.glob(os.path.join(directory, "*.txt"))
    
    # Sort the files to ensure consistent ordering
    txt_files.sort()
    
    created_variable_name = None
    content = ""
    
    if txt_files:
        # Get the last text file
        last_txt_file = txt_files[-1]
        
        # Extract the base name and create the variable name
        base_name = os.path.basename(last_txt_file)
        file_name_without_ext = os.path.splitext(base_name)[0]
        variable_name = f"{file_name_without_ext}_str"
        
        # Read the content of the last file
        with open(last_txt_file, 'r', encoding='utf-8') as f:
            content = f.read()
        
        # Create a variable with the file name and _str suffix
        globals()[variable_name] = content
        created_variable_name = variable_name
        
        print(f"Created variable: {variable_name}")
        print(f"Content preview: {content[:preview_length]}...")  # Print preview_length characters as preview
    else:
        print("No text files found in the directory.")
    
    # Now, let's use the created variable
    if created_variable_name:
        # Print every third character of the preview
        print(f"Every third character of the preview from {created_variable_name}:")
        print(content[:preview_length][::3])
    else:
        print("No variable was created because no text files were found.")
    
    return created_variable_name, content


def chunk_text(text, max_chunk_size=5000):
    words = text.split()
    chunks = []
    current_chunk = []
    current_size = 0
    for word in words:
        if current_size + len(word) + 1 > max_chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_size = len(word)
        else:
            current_chunk.append(word)
            current_size += len(word) + 1
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def text_to_speech(text, file_name, output_directory="audio_voiceovers", chunk_index=None):
    load_dotenv()
    CHUNK_SIZE = 1024
    XI_API_KEY = os.environ.get("XI_API_KEY")
    VOICE_ID = os.environ.get("VOICE_ID")
    tts_url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}/stream"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": XI_API_KEY
    }
    data = {
        "text": text,
        "model_id": "eleven_multilingual_v2",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.8,
            "style": 0.0,
            "use_speaker_boost": True
        }
    }
    os.makedirs(output_directory, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    if chunk_index is not None:
        output_file_name = f"{timestamp}_{file_name}_part{chunk_index+1}_audio.mp3"
    else:
        output_file_name = f"{timestamp}_{file_name}_audio.mp3"
    
    output_path = os.path.join(output_directory, output_file_name)
    response = requests.post(tts_url, json=data, headers=headers, stream=True)
    if response.status_code == 200:
        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
        print(f"Audio stream saved successfully to {output_path}")
        return output_path
    else:
        print(f"Error in text-to-speech conversion: {response.status_code}")
        print(response.text)
        return None

def get_text_files(directory):
    return [f for f in os.listdir(directory) if f.endswith('.txt')]

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def process_text_file(file_path, output_directory, delay_between_chunks):
    content = read_text_file(file_path)
    file_name_without_ext = os.path.splitext(os.path.basename(file_path))[0]
    
    print(f"Converting {os.path.basename(file_path)} to speech...")
    
    text_chunks = chunk_text(content)
    
    for i, chunk in enumerate(text_chunks):
        print(f"Processing chunk {i+1} of {len(text_chunks)}...")
        audio_file_path = text_to_speech(chunk, file_name_without_ext, output_directory, i)
        
        if audio_file_path:
            print(f"Audio file saved as: {audio_file_path}")
        else:
            print(f"Failed to convert chunk {i+1} of {file_path} to speech.")
        
        if i < len(text_chunks) - 1:
            print(f"Waiting for {delay_between_chunks} seconds before processing the next chunk...")
            time.sleep(delay_between_chunks)

# def convert_txt_to_audio_voiceover(text_directory = "pdfs_to_convert_to_text", output_directory = "audio_voiceovers"):
#     delay_between_chunks = 60  # Adjust this value as needed

#     text_files = get_text_files(text_directory)
    
#     for text_file in text_files:
#         file_path = os.path.join(text_directory, text_file)
#         process_text_file(file_path, output_directory, delay_between_chunks)
    
#     print("All text files have been processed.")

def convert_txt_to_audio_voiceover(text_directory="pdfs_to_convert_to_text", output_directory="audio_voiceovers"):
    delay_between_chunks = 60  # Adjust this value as needed
    text_files = get_text_files(text_directory)
    
    mp3_paths = []
    for text_file in text_files:
        file_path = os.path.join(text_directory, text_file)
        mp3_path = process_text_file(file_path, output_directory, delay_between_chunks)
        mp3_paths.append(mp3_path)
    
    print("All text files have been processed.")
    
    # Return the path of the last generated MP3 file
    return mp3_paths[-1] if mp3_paths else None

In [22]:
# Example usage
num_urls = 5
keywords = "observability"
query = keywords

print("Searching for PDF URLs...")
pdf_urls = search_pdfs_on_github(query, num_urls)
print("Search completed.")

print("PDF URLs found:")
for url in pdf_urls:
    print(url)

Searching for PDF URLs...
Request parameters: {'q': 'observability site:github.com filetype:pdf', 'key': 'AIzaSyDh6OU3OFcn_t3vrxO2fzGU49cT6pXpW0g', 'cx': 'f4afc285f53804afb', 'num': 5}
Request URL: https://www.googleapis.com/customsearch/v1?q=observability+site%3Agithub.com+filetype%3Apdf&key=AIzaSyDh6OU3OFcn_t3vrxO2fzGU49cT6pXpW0g&cx=f4afc285f53804afb&num=5
Sending request to Google Custom Search API...
Response received.
Parsing response data...
Response data parsed.
Response data: {'kind': 'customsearch#search', 'url': {'type': 'application/json', 'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRes

In [39]:
# Define the scopes you need
SCOPES = ['https://www.googleapis.com/auth/drive.file']

# Create the flow using the client secrets file from the Google API Console.
flow = InstalledAppFlow.from_client_secrets_file(
    'credentials.json',
    scopes=SCOPES,
    redirect_uri='http://localhost:8081/'  # Ensure this matches the configured redirect URI
)

# Run the OAuth flow
creds = flow.run_local_server(port=8081)

# Build the Drive service
drive_service = build('drive', 'v3', credentials=creds)

# Create main folder 'getpdfs' if it doesn't exist
main_folder_id = find_or_create_folder('getpdfs')

# Create subfolder with today's date and time
current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
subfolder_id = create_folder(current_time, main_folder_id)

upload_pdf_to_google_drive(drive_service, pdf_urls, subfolder_id, num_files_to_process)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=131811907665-252blpd0att90p8qbeiifjf4cpsh20e4.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8081%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.file&state=kTEgf2Z9dduIZXRxbuXNVVbTMy4x4D&access_type=offline
File Getting.Started.with.Ingesting.GitHub.GHAS.Alerts.pdf uploaded successfully.
File Synqor_thermal_relief_study.pdf uploaded successfully.
All PDFs have been uploaded to Google Drive.
Processing PDFs...
Processing Getting.Started.with.Ingesting.GitHub.GHAS.Alerts.pdf...
Processed Getting.Started.with.Ingesting.GitHub.GHAS.Alerts.pdf and uploaded Getting.Started.with.Ingesting.GitHub.GHAS.Alerts.txt
Processing Synqor_thermal_relief_study.pdf...
Processed Synqor_thermal_relief_study.pdf and uploaded Synqor_thermal_relief_study.txt
All PDFs have been processed and converted to text.


In [50]:
local_directory = "pdfs_to_convert_to_text"
num_files_to_process = 1

process_pdfs(drive_service, subfolder_id, local_directory, num_files_to_process)

How many files do you want to process? (Enter a number or press Enter for all):  1


Downloading Synqor_thermal_relief_study.pdf...
Converting Synqor_thermal_relief_study.pdf to text...
Processed Synqor_thermal_relief_study.pdf
All specified PDFs have been downloaded and converted to text.


In [51]:
variable_name, content = process_last_text_file(preview_length=100)
content

Created variable: Synqor_thermal_relief_study_str
Content preview: Thermal & Reliability Study on High Current Thermal Vias & Output Pins Application Note 00-08-01 Rev...
Every third character of the preview from Synqor_thermal_relief_study_str:
Trl lbi u  gCrthm a&uuPspitno -- v


'Thermal & Reliability Study on High Current Thermal Vias & Output Pins Application Note 00-08-01 Rev. 03 - 7/31/01\nSummary:\nThis application note addresses concerns raised with regards to pin and board heating when high currents (>60A) are driven through ther- mally relieved plated through hole (PTH) vias. Detailed thermal analy- sis will show that internal heat generation and resulting temperature rise of the power converter and load board is minimal when using a single thermally relieved via.\nIntroduction Trends in the design of Distributed Power Architectures (DPA\'s) are requiring DC-DC converters to deliver very high power at very low voltages. This has required a thorough examination on how high current power can be delivered safely and reliably to the distribution path within a printed circuit card. SynQor has evaluated these challenges through theoretical analysis and carefully controlled laboratory testing. Using the latest in thermal imaging equipment as well as analytica

In [52]:
text_directory = "pdfs_to_convert_to_text"
output_directory = "audio_voiceovers"
convert_txt_to_audio_voiceover(text_directory, output_directory)

Converting 1811_GitHub_wp_part1.txt to speech...
Processing chunk 1 of 3...
Audio stream saved successfully to audio_voiceovers/20240629_201258_1811_GitHub_wp_part1_part1_audio.mp3
Audio file saved as: audio_voiceovers/20240629_201258_1811_GitHub_wp_part1_part1_audio.mp3
Waiting for 60 seconds before processing the next chunk...
Processing chunk 2 of 3...


KeyboardInterrupt: 

In [70]:
from datetime import datetime
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

def process_keywords_and_pdfs(keywords, num_urls, num_files_to_process):
    # Search for PDF URLs on GitHub
    print("Searching for PDF URLs...")
    pdf_urls = search_pdfs_on_github(keywords, num_urls)
    print("Search completed.")
    print("PDF URLs found:")
    for url in pdf_urls:
        print(url)

    # Set up Google Drive authentication
    SCOPES = ['https://www.googleapis.com/auth/drive.file']
    flow = InstalledAppFlow.from_client_secrets_file(
        'credentials.json',
        scopes=SCOPES,
        redirect_uri='http://localhost:8081/'
    )
    creds = flow.run_local_server(port=8081)
    drive_service = build('drive', 'v3', credentials=creds)

    # Create folders in Google Drive
    main_folder_id = find_or_create_folder('getpdfs')
    current_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    subfolder_id = create_folder(current_time, main_folder_id)

    # Upload PDFs to Google Drive
    upload_pdf_to_google_drive(drive_service, pdf_urls, subfolder_id, num_files_to_process)

    # Process PDFs
    local_directory = "pdfs_to_convert_to_text"
    process_pdfs(drive_service, subfolder_id, local_directory, num_files_to_process)

    # Process last text file
    variable_name, content = process_last_text_file(preview_length=100)

    # Convert text to audio voiceover
    text_directory = "pdfs_to_convert_to_text"
    output_directory = "audio_voiceovers"
    convert_txt_to_audio_voiceover(text_directory, output_directory)

    return variable_name, content

# Example usage:
keywords = "neuroscience"
num_urls = 5
num_files_to_process = 1
variable_name, content = process_keywords_and_pdfs(keywords, num_urls, num_files_to_process)

Searching for PDF URLs...
Request parameters: {'q': 'neuroscience site:github.com filetype:pdf', 'key': 'AIzaSyDh6OU3OFcn_t3vrxO2fzGU49cT6pXpW0g', 'cx': 'f4afc285f53804afb', 'num': 5}
Request URL: https://www.googleapis.com/customsearch/v1?q=neuroscience+site%3Agithub.com+filetype%3Apdf&key=AIzaSyDh6OU3OFcn_t3vrxO2fzGU49cT6pXpW0g&cx=f4afc285f53804afb&num=5
Sending request to Google Custom Search API...
Response received.
Parsing response data...
Response data parsed.
Response data: {'kind': 'customsearch#search', 'url': {'type': 'application/json', 'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestr

  LTComponent.__init__(self, (x0, y0, x1, y1))


Processed A%20neuro-computational%20model%20of%20visual%20attention%20with%20multiple%20attentional%20control%20sets.pdf and uploaded A%20neuro-computational%20model%20of%20visual%20attention%20with%20multiple%20attentional%20control%20sets.txt
All PDFs have been processed and converted to text.
Downloading A%20neuro-computational%20model%20of%20visual%20attention%20with%20multiple%20attentional%20control%20sets.pdf...
Converting A%20neuro-computational%20model%20of%20visual%20attention%20with%20multiple%20attentional%20control%20sets.pdf to text...
Processed A%20neuro-computational%20model%20of%20visual%20attention%20with%20multiple%20attentional%20control%20sets.pdf
All specified PDFs have been downloaded and converted to text.
Created variable: Synqor_thermal_relief_study_str
Content preview: Thermal & Reliability Study on High Current Thermal Vias & Output Pins Application Note 00-08-01 Rev...
Every third character of the preview from Synqor_thermal_relief_study_str:
Trl lbi u  gCr

KeyboardInterrupt: 

In [None]:
# Streamlit app
st.title("PDF to Audio Voiceover Converter")

# User inputs
keywords = st.text_input("Enter keywords:", "observability")
num_urls = st.number_input("Number of URLs to process:", min_value=1, value=5)
num_files_to_process = st.number_input("Number of files to process:", min_value=1, value=1)

if st.button("Process"):
    with st.spinner("Processing..."):
        mp3_path = process_keywords_and_pdfs(keywords, num_urls, num_files_to_process)
    
    if mp3_path and os.path.exists(mp3_path):
        st.success("Processing complete!")
        
        # Display audio player
        audio_file = open(mp3_path, 'rb')
        audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/mp3')
        
        # Provide download link
        st.download_button(
            label="Download MP3",
            data=audio_bytes,
            file_name="voiceover.mp3",
            mime="audio/mp3"
        )
    else:
        st.error("Failed to generate audio file.")

In [63]:
# !pip install streamlit nest_asyncio

In [68]:
import streamlit as st
import os
import io

# Import your other necessary functions here
# from your_module import process_keywords_and_pdfs, convert_txt_to_audio_voiceover

def main():
    st.title("PDF to Audio Voiceover Converter")

    # User inputs
    keywords = st.text_input("Enter keywords:", "observability")
    num_urls = st.number_input("Number of URLs to process:", min_value=1, value=5)
    num_files_to_process = st.number_input("Number of files to process:", min_value=1, value=1)

    if st.button("Process"):
        with st.spinner("Processing..."):
            mp3_path = process_keywords_and_pdfs(keywords, num_urls, num_files_to_process)
        
        if mp3_path and os.path.exists(mp3_path):
            st.success("Processing complete!")
            
            # Display audio player
            audio_file = open(mp3_path, 'rb')
            audio_bytes = audio_file.read()
            st.audio(audio_bytes, format='audio/mp3')
            
            # Provide download link
            st.download_button(
                label="Download MP3",
                data=audio_bytes,
                file_name="voiceover.mp3",
                mime="audio/mp3"
            )
        else:
            st.error("Failed to generate audio file.")

# if __name__ == "__main__":
#     main()

import subprocess
import time
import webbrowser

def run_streamlit():
    # Start the Streamlit app in a separate process
    process = subprocess.Popen(["streamlit", "run", "streamlit_app.py"], 
                               stdout=subprocess.PIPE, 
                               stderr=subprocess.PIPE, 
                               text=True)
    
    # Wait for the app to start and print output
    for i in range(30):  # Wait for up to 30 seconds
        output = process.stdout.readline()
        if output:
            print(output.strip())
        if "You can now view your Streamlit app in your browser." in output:
            break
        time.sleep(1)
    
    # Open the Streamlit app in a new browser tab
    webbrowser.open_new_tab("http://localhost:8501")
    
    return process

# Run the Streamlit app
print("Starting Streamlit app...")
streamlit_process = run_streamlit()

# To stop the Streamlit app later, you can use:
# streamlit_process.terminate()

Starting Streamlit app...
