# **Document Analysis**

# In [2]:
import os
import re
from docx import Document
import json

def pyillam_script_final(docx_file_path):
    """Parse a script ``.docx`` into a list of videos with their paragraphs.

    The paragraphs of the document are joined, lower-cased and split into
    lines.  Lines that are empty, whitespace-only, or a single character long
    are discarded; the rest are stripped and handed to
    ``_parse_script_lines``.

    Args:
        docx_file_path: Path passed to ``docx.Document``.

    Returns:
        A list with one dict per video marker, in document order::

            {'videoId': 1,
             'video_title': 'Some Title',
             'paragraphInfo': [{'viewIndex': 1, 'paragraphDetails': '...'}]}

        All paragraph text is lower-case because the whole document is
        lower-cased before parsing; titles are re-capitalised with
        ``str.title()``.
    """
    document = Document(docx_file_path)
    text = "\n".join(paragraph.text for paragraph in document.paragraphs).strip().lower()

    # Keep the original "longer than one character" rule, but also drop
    # whitespace-only lines: they used to survive as "" and could be mistaken
    # for a title or add stray spaces to the joined paragraph text.
    cleaned_lines = [
        raw_line.strip()
        for raw_line in text.split('\n')
        if len(raw_line) > 1 and raw_line.strip()
    ]
    return _parse_script_lines(cleaned_lines)


def _parse_script_lines(lines):
    """Group already-cleaned script lines into videos and paragraphs.

    A line containing ``v<digit>`` anywhere starts a new video; the line
    right after it is that video's title.  Inside a video, a line containing
    ``p<digit>`` anywhere starts a new paragraph whose text is every following
    line up to the next paragraph marker or the end of the video, joined with
    single spaces.  Lines before the first video marker, and lines of a video
    that precede its first paragraph marker, are ignored.

    Args:
        lines: Stripped, lower-cased, non-empty text lines.

    Returns:
        The list of video dicts described in ``pyillam_script_final``.
    """
    video_marker = re.compile(r'v\d')
    paragraph_marker = re.compile(r'p\d')

    # Pass 1: cut the line stream into (title, body_lines) sections.
    sections = []
    position = 0
    line_count = len(lines)
    while position < line_count:
        if video_marker.search(lines[position]):
            # A marker on the very last line has no title line after it;
            # use an empty title instead of raising IndexError.
            has_title = position + 1 < line_count
            title = lines[position + 1].strip().title() if has_title else ""
            sections.append((title, []))
            # Skip the title line by position so that (a) body lines that
            # merely repeat the title text are kept and (b) a title that
            # itself contains "v<digit>" is not taken for another marker.
            position += 2
            continue
        if sections:
            sections[-1][1].append(lines[position])
        position += 1

    # Pass 2: split every section body at its paragraph markers.
    videos = []
    for video_id, (title, body) in enumerate(sections, start=1):
        marker_positions = [
            index for index, line in enumerate(body) if paragraph_marker.search(line)
        ]
        # Each paragraph ends where the next marker starts; the last one runs
        # to the end of the section (slice end None).
        paragraph_ends = marker_positions[1:] + [None]
        paragraph_info = [
            {
                'viewIndex': view_index,
                'paragraphDetails': " ".join(body[marker + 1:end]),
            }
            for view_index, (marker, end) in enumerate(
                zip(marker_positions, paragraph_ends), start=1
            )
        ]
        videos.append(
            {'videoId': video_id, 'video_title': title, 'paragraphInfo': paragraph_info}
        )

    return videos

def process_docx_files_final(panel_master_drive_path):
    """Convert every ``<name>/<name>.docx`` under a directory to JSON.

    For each entry ``<name>`` listed in ``panel_master_drive_path``, the file
    ``<name>/<name>.docx`` is parsed with ``pyillam_script_final`` and the
    result is written next to it as ``<name>/<name> Script.json`` (UTF-8,
    4-space indent, non-ASCII characters kept as-is).  Entries that do not
    contain such a ``.docx`` file are skipped.

    Args:
        panel_master_drive_path: Directory whose entries are scanned.
    """
    for entry_name in os.listdir(panel_master_drive_path):
        entry_dir = os.path.join(panel_master_drive_path, entry_name)
        source_docx = os.path.join(entry_dir, f"{entry_name}.docx")

        # Nothing to convert for this entry.
        if not os.path.isfile(source_docx):
            continue

        parsed_script = pyillam_script_final(source_docx)

        # Save as a JSON file
        target_json = os.path.join(entry_dir, f"{entry_name} Script.json")
        with open(target_json, "w", encoding="utf-8") as output_handle:
            json.dump(parsed_script, output_handle, ensure_ascii=False, indent=4)

# In [3]:
# Directory to scan; "courses" is a relative path, so it is resolved against
# the current working directory.
panel_master_drive_path = "courses"
# Runs the batch conversion immediately when this cell/module is executed.
process_docx_files_final(panel_master_drive_path)