### Files:

In [4]:
!pip install -q transformers torch

import os
import re
import json
import torch
from transformers import pipeline

DATA_PATH = "/content/data"

print("Files found:")
print(os.listdir(DATA_PATH))

device = 0 if torch.cuda.is_available() else -1
print(f"üöÄ Processing using: {'GPU (Fast)' if device == 0 else 'CPU (Slow)'}")

Files found:
['meeting number 2.txt', 'meeting number 1.txt']
üöÄ Processing using: CPU (Slow)


### Configuration:

In [5]:
DATA_FOLDER_PATH = "/content/data"
OUTPUT_FILE = "fathom_training_data_local.json"

print("‚è≥ Loading local model (facebook/bart-large-cnn)...")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
print("‚úÖ Model loaded!")

‚è≥ Loading local model (facebook/bart-large-cnn)...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


‚úÖ Model loaded!


### The cleaner function:

In [6]:
def clean_fathom_transcript(text):
    """
    Cleans the raw text, removing timestamps and metadata.
    """
    lines = text.split('\n')
    cleaned_lines = []

    for line in lines:
        line = line.strip()
        if not line or "VIEW RECORDING" in line or "https://" in line or "---" in line: continue
        if line.startswith("BOOKMARK:") or line.startswith("HIGHLIGHT:"): continue

        match = re.match(r'\d+:\d+\s-\s(.+)', line)
        if match:
            speaker = match.group(1).strip()
            cleaned_lines.append(f"\n[{speaker}]:")
            continue

        cleaned_lines.append(line)

    return " ".join(cleaned_lines).replace(" [", "\n[").strip()

### The Local Summarizer Logic:

In [9]:
def generate_local_summary(text):
    """
    Summarizes text using the local BART model.
    Handles long text by chunking.
    """
    chunk_size = 3000
    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

    summaries = []

    try:
        for chunk in chunks:
            output = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
            summaries.append(output[0]['summary_text'])

        full_summary = " ".join(summaries)
        return full_summary

    except Exception as e:
        print(f"Error during local inference: {e}")
        return None

## Main Execution Loop:

In [13]:
training_pairs = []

print(f"üìÇ Reading files from: {"/content/data"}")

if os.path.exists("/content/data"):
    files = [f for f in os.listdir("/content/data") if f.endswith('.txt')]

    for i, filename in enumerate(files):
        print(f"\nProcessing {i+1}/{len(files)}: {filename}...")

        try:
            filepath = os.path.join("/content/data", filename)
            with open(filepath, 'r', encoding='utf-8') as f: raw_text = f.read()

            clean_text = clean_fathom_transcript(raw_text)

            if len(clean_text) < 100:
                print("   ‚ö†Ô∏è Text too short, skipping.")
                continue

            print("   üß† Generating summary (Local AI)...")
            target_summary = generate_local_summary(clean_text)

            training_pairs.append({
                "input_text": clean_text,
                "target_text": target_summary
            })
            print("   ‚úÖ Done.")

        except Exception as e:
            print(f"   ‚ùå Failed: {e}")

    if training_pairs:
        with open(OUTPUT_FILE, 'w') as f:
            json.dump(training_pairs, f, indent=4)
        print(f"\nüéâ SUCCESS! Saved {len(training_pairs)} items to {OUTPUT_FILE}")
    else:
        print("\n‚ö†Ô∏è No data found.")

else:
    print("‚ùå Folder path incorrect.")

üìÇ Reading files from: /content/data

Processing 1/2: meeting number 2.txt...
   üß† Generating summary (Local AI)...
   ‚úÖ Done.

Processing 2/2: meeting number 1.txt...
   üß† Generating summary (Local AI)...
   ‚úÖ Done.

üéâ SUCCESS! Saved 2 items to fathom_training_data_local.json
