In [8]:
import os
import re
from tqdm import tqdm  # Import tqdm for progress bar

# Function to convert .srt file to plain text
def srt_to_text(input_file, output_file):
    """Convert a SubRip (.srt) subtitle file into one line of plain text.

    Cue counters, timing lines and blank separators are discarded; the
    remaining subtitle lines are stripped and joined with single spaces.

    Args:
        input_file: Path of the .srt file to read (UTF-8, with or without BOM).
        output_file: Path of the text file to write as UTF-8; an existing
            file is overwritten.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # 'utf-8-sig' drops a leading byte-order mark. With plain 'utf-8' the BOM
    # stays glued to the first cue counter, which then fails the digit check
    # and ends up in the extracted text.
    with open(input_file, 'r', encoding='utf-8-sig') as file:
        lines = [line.strip() for line in file]

    spoken = []
    for position, line in enumerate(lines):
        # Skip blank separators and timing lines.
        if not line or '-->' in line:
            continue
        # A digits-only line is a cue counter only when the timing line comes
        # right after it; otherwise it is subtitle text (e.g. "2020") to keep.
        following = lines[position + 1] if position + 1 < len(lines) else ''
        if re.match(r'^\d+$', line) and '-->' in following:
            continue
        spoken.append(line)

    # Write the plain text to the output file
    with open(output_file, 'w', encoding='utf-8') as output:
        output.write(" ".join(spoken))

# Get user input for the folder containing .srt files.
# Surrounding whitespace and quotes are removed because a path pasted from a
# file manager (e.g. Windows "Copy as path") arrives wrapped in double quotes.
folder_path = input("Enter the path to the folder containing .srt files: ").strip().strip('"\'')

# Check if the folder exists
if not os.path.isdir(folder_path):
    print(f"The folder '{folder_path}' does not exist. Please check the path and try again.")
else:
    # Get the current working directory
    current_directory = os.getcwd()

    # Create the plain_text folder if it doesn't exist
    output_folder = os.path.join(current_directory, 'plain_text')
    os.makedirs(output_folder, exist_ok=True)

    # List all .srt files in the folder: the extension is matched
    # case-insensitively (".SRT" is common on Windows), directories are
    # ignored, and names are sorted so the processing order is deterministic.
    srt_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.srt')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    if not srt_files:
        # Avoid reporting success when there was nothing to convert.
        print(f"No .srt files were found in '{folder_path}'.")
    else:
        # Use tqdm to track the progress of processing each file
        for filename in tqdm(srt_files, desc="Converting files", unit="file"):
            input_file = os.path.join(folder_path, filename)
            output_file = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")

            # Call the function to convert the .srt to plain text
            srt_to_text(input_file, output_file)

        print(f"All conversions are complete. The plain text files are saved in the '{output_folder}' folder.")


Enter the path to the folder containing .srt files:  D:\English\Impact Theory Podcasts\subtitles


Converting files: 100%|██████████████████████████████████████████████████████████████| 87/87 [00:02<00:00, 37.72file/s]

All conversions are complete. The plain text files are saved in the 'C:\Users\moham\Workplace\Helping-Study-Tools\plain_text' folder.





In [25]:
import os
import re
from tqdm import tqdm

# Function to split text into segments by word count
def split_by_words(text, word_count):
    """Break ``text`` into consecutive chunks of at most ``word_count`` words.

    Words are whatever ``str.split()`` yields (runs of non-whitespace), and
    each chunk is rebuilt by joining its words with single spaces.

    Args:
        text: The string to split.
        word_count: Maximum number of words per chunk.

    Returns:
        A list of chunk strings in original order; empty when ``text``
        contains no words.
    """
    tokens = text.split()
    chunks = []
    for start in range(0, len(tokens), word_count):
        chunk_words = tokens[start:start + word_count]
        chunks.append(" ".join(chunk_words))
    return chunks


# Function to convert .srt file to plain text and split into segments
def srt_to_text(input_file, output_file, word_count=100):
    """Convert a SubRip (.srt) file to plain text split into numbered segments.

    Cue counters, timing lines and blank separators are discarded. The
    remaining subtitle text is joined with single spaces, cut into segments
    with ``split_by_words``, and written as ``Segment N:`` headers, each
    followed by the segment text and a blank line.

    Args:
        input_file: Path of the .srt file to read (UTF-8, with or without BOM).
        output_file: Path of the text file to write as UTF-8; an existing
            file is overwritten.
        word_count: Maximum number of words per segment.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    # 'utf-8-sig' drops a leading byte-order mark. With plain 'utf-8' the BOM
    # stays glued to the first cue counter, which then fails the digit check
    # and ends up in the extracted text.
    with open(input_file, 'r', encoding='utf-8-sig') as file:
        lines = [line.strip() for line in file]

    # Extracting text from the SRT file
    spoken = []
    for position, line in enumerate(lines):
        # Skip blank separators and timing lines.
        if not line or '-->' in line:
            continue
        # A digits-only line is a cue counter only when the timing line comes
        # right after it; otherwise it is subtitle text (e.g. "2020") to keep.
        following = lines[position + 1] if position + 1 < len(lines) else ''
        if re.match(r'^\d+$', line) and '-->' in following:
            continue
        spoken.append(line)

    # Split the text into segments by word count
    segments = split_by_words(" ".join(spoken), word_count)

    # Write the segments to the output file, numbering them from 1
    with open(output_file, 'w', encoding='utf-8') as output:
        for number, segment in enumerate(segments, start=1):
            output.write(f"Segment {number}:\n")
            output.write(segment + "\n\n")


# Main program
# Surrounding whitespace and quotes are removed from the answer because a path
# pasted from a file manager (e.g. Windows "Copy as path") arrives wrapped in
# double quotes.
folder_path = input("Enter the path to the folder containing .srt files: ").strip().strip('"\'')

# Check if the folder exists
if not os.path.isdir(folder_path):
    print(f"The folder '{folder_path}' does not exist. Please check the path and try again.")
else:
    # Get the current working directory
    current_directory = os.getcwd()

    # Create the plain_text folder if it doesn't exist
    output_folder = os.path.join(current_directory, 'plain_text')
    os.makedirs(output_folder, exist_ok=True)

    # List all .srt files in the folder: the extension is matched
    # case-insensitively (".SRT" is common on Windows), directories are
    # ignored, and names are sorted so the processing order is deterministic.
    srt_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.srt')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    # Words per segment. This is hard-coded rather than read from the user;
    # edit the value here to change the segment size.
    word_count = 1000

    if not srt_files:
        # Avoid reporting success when there was nothing to convert.
        print(f"No .srt files were found in '{folder_path}'.")
    else:
        # Use tqdm to track the progress of processing each file
        for filename in tqdm(srt_files, desc="Converting files", unit="file"):
            input_file = os.path.join(folder_path, filename)
            output_file = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")

            # Call the function to convert the .srt to plain text and split it into segments
            srt_to_text(input_file, output_file, word_count=word_count)

        print(f"All conversions are complete. The plain text files are saved in the '{output_folder}' folder.")


Enter the path to the folder containing .srt files:  D:\English\Impact Theory Podcasts\subtitles


Converting files: 100%|██████████████████████████████████████████████████████████████| 87/87 [00:02<00:00, 31.68file/s]

All conversions are complete. The plain text files are saved in the 'C:\Users\moham\Workplace\Helping-Study-Tools\plain_text' folder.



