In [1]:
import sys, os
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from agents.curator import curate_from_list, load_curated, save_curated
from nlp.summarize import Summarizer
from nlp.translate import IndicTranslator
import json

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [2]:
sample_resources = [
    {
        "id": "r1",
        "title": "Introduction to Fractions",
        "url": "https://www.example.com/fractions",
        "language": "en"
    },
    {
        "id": "r2",
        "title": "Basic Algebra in Hindi",
        "url": "https://www.example.com/algebra-hindi",
        "language": "hi"
    }
]

sample_resources


[{'id': 'r1',
  'title': 'Introduction to Fractions',
  'url': 'https://www.example.com/fractions',
  'language': 'en'},
 {'id': 'r2',
  'title': 'Basic Algebra in Hindi',
  'url': 'https://www.example.com/algebra-hindi',
  'language': 'hi'}]

In [3]:
curated = curate_from_list(sample_resources)
print("Curated items:")
curated


Curated items:


[{'id': 'r1',
  'title': 'Introduction to Fractions',
  'url': 'https://www.example.com/fractions',
  'language': 'en',
  'transcript': '<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href="https://iana.org/domains/example">Learn more</a></div></body></html>\n',
  'size_bytes': 513},
 {'id': 'r2',
  'title': 'Basic Algebra in Hindi',
  'url': 'https://www.example.com/algebra-hindi',
  'language': 'hi',
  'transcript': '<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh a

In [4]:
curated = load_curated()
len(curated), curated[:2]


(2,
 [{'id': 'r1',
   'title': 'Introduction to Fractions',
   'url': 'https://www.example.com/fractions',
   'language': 'en',
   'transcript': '<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href="https://iana.org/domains/example">Learn more</a></div></body></html>\n',
   'size_bytes': 513},
  {'id': 'r2',
   'title': 'Basic Algebra in Hindi',
   'url': 'https://www.example.com/algebra-hindi',
   'language': 'hi',
   'transcript': '<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60v

In [5]:
summ = Summarizer()
updated = []

for item in curated:
    text = item.get("transcript", "")
    summary = summ.summarize(text).get("summary", "")
    item["summary"] = summary
    updated.append(item)

save_curated(updated)
print("Summaries added!")


Summaries added!


In [6]:
translator = IndicTranslator()

translated_items = []
for item in updated:
    orig = item["summary"]
    hi, _ = translator.translate(orig, "en", "hi")
    kn, _ = translator.translate(orig, "en", "kn")  # fallback if model missing

    item["summary_hi"] = hi
    item["summary_kn"] = kn
    translated_items.append(item)

save_curated(translated_items)
print("Translation added!")


Could not load local IndicTrans model: ai4bharat/indictrans2-en-hi is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`


Translation added!


In [7]:
final = load_curated()
json.dumps(final, indent=2, ensure_ascii=False)[:2000]


'[\n  {\n    "id": "r1",\n    "title": "Introduction to Fractions",\n    "url": "https://www.example.com/fractions",\n    "language": "en",\n    "transcript": "<!doctype html><html lang=\\"en\\"><head><title>Example Domain</title><meta name=\\"viewport\\" content=\\"width=device-width, initial-scale=1\\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href=\\"https://iana.org/domains/example\\">Learn more</a></div></body></html>\\n",\n    "size_bytes": 513,\n    "summary": "<!doctype html><html lang=\\"en\\"><head><title>Example Domain</title><meta name=\\"viewport\\" content=\\"width=device-width, initial-scale=1\\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1. 5em}div{opacit