In [None]:
pip uninstall scikit-learn transformers transformers torch PyMuPDF

In [None]:
pip install scikit-learn

In [None]:
pip install transformers

In [None]:
pip install requests PyMuPDF

In [None]:
pip install transformers torch

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import pandas as pd
import requests
import fitz 
import json
import os
import re

In [None]:
def download_pdf(pdf_url, filename="downloaded.pdf", timeout=30):
    """Download the file at ``pdf_url`` and save it to ``filename``.

    Args:
        pdf_url: URL fetched with an HTTP GET.
        filename: Local path the response body is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``filename`` that was written.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(pdf_url, timeout=timeout)
    # Fail loudly instead of saving an error page under a .pdf name.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    return filename

In [None]:
def pdf_to_json(pdf_path, output_json="output.json"):
    """Extract the text of every page of a PDF and save it as JSON.

    Args:
        pdf_path: Path of the PDF to read.
        output_json: Path of the JSON file to write.

    Returns:
        dict mapping ``"page_<n>"`` (1-based) to that page's stripped text.

    Side effects:
        Writes ``output_json`` and prints every page's text.
    """
    pdf_data = {}

    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            pdf_data[f"page_{page_num}"] = page.get_text().strip()

    with open(output_json, 'w', encoding='utf-8') as json_file:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(pdf_data, json_file, indent=2, ensure_ascii=False)

    for page, content in pdf_data.items():
        print(f"\n--- {page} ---\n{content}\n")

    return pdf_data

In [None]:
# Fetch the notice from ifsca.gov.in, then extract its per-page text.
# pdf_to_json is the cell's last expression, so its returned dict is displayed too.
local_pdf = download_pdf(pdf_url="https://ifsca.gov.in/CommonDirect/GetFileView?id=21626bde60601ef44a0ed0220138e431&fileName=Regulatory_action_s__against_certain_Fund_Management_Entities_for_non_compliance_with_IFSCA__Fund_Management__Regulations__2025_20250718_0112.pdf")
pdf_to_json(pdf_path=local_pdf)

In [None]:
# Show the local path the downloaded PDF was saved to.
print(local_pdf)

In [None]:
# Reload the raw per-page text from output.json into `data`.
with open("output.json", encoding="utf-8") as raw_json:
    data = json.loads(raw_json.read())

In [None]:
def clean_text(text):
    """Normalize text extracted from a PDF page.

    Removes ``[image]`` placeholders and ``×`` characters, turns bullet
    glyphs into ``-``, collapses repeated dashes, squeezes runs of spaces/tabs
    into one space, and reduces runs of line breaks to a single newline.

    Args:
        text: Raw page text.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    text = re.sub(r"\[image\]", "", text)
    text = re.sub(r"[•■○]", "-", text)
    text = re.sub(r"-{2,}", "-", text)
    text = re.sub(r"[×]", "", text)
    # Collapse only horizontal whitespace here. Matching newlines as well
    # (plain \s) would turn every paragraph break into a space and leave the
    # newline normalization below with nothing to do.
    text = re.sub(r"[^\S\n]{2,}", " ", text)
    # One newline per run of line breaks, dropping whitespace around the break.
    text = re.sub(r"[^\S\n]*\n\s*", "\n", text)
    return text.strip()

In [None]:
# Run clean_text over every page's text, keeping the original page keys.
cleaned_data = dict(zip(data.keys(), map(clean_text, data.values())))

In [None]:
# Persist the cleaned pages; ensure_ascii=False keeps non-ASCII characters as-is.
with open("pdf_cleaned.json", mode="w", encoding="utf-8") as cleaned_file:
    json.dump(cleaned_data, fp=cleaned_file, indent=2, ensure_ascii=False)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim and rewrites
# pdf_cleaned.json with the same content — it looks redundant; confirm and drop.
with open("pdf_cleaned.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

In [None]:
# Read the cleaned pages back from disk; `data` now holds the cleaned text.
with open("pdf_cleaned.json", encoding="utf-8") as cleaned_file:
    data = json.loads(cleaned_file.read())

In [None]:
# Page texts in numeric page order: keys look like "page_<n>", so sort on the
# integer after the underscore rather than on the string itself.
pages = [
    page_text
    for _, page_text in sorted(data.items(), key=lambda item: int(item[0].split('_')[1]))
]

In [None]:
# max_df=0.9 ignores terms that occur on more than 90% of the pages. With a
# single page, 0.9 * 1 document falls below min_df=1 and fitting raises
# "max_df corresponds to < documents than min_df", so the cut-off is disabled
# in that case.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.9 if len(pages) > 1 else 1.0,
    min_df=1,
)

In [None]:
# Fit the vectorizer on the page texts and build the TF-IDF matrix
# (one row per entry of `pages`) in a single call.
tfidf_matrix = vectorizer.fit_transform(pages)

In [None]:
# Dense TF-IDF table: one row per page (labelled page_1, page_2, ...),
# one column per vocabulary term.
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=[f"page_{page_no}" for page_no in range(1, len(pages) + 1)],
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# Save the TF-IDF table as CSV; index=True writes the page_<n> row labels as the first column.
df_tfidf.to_csv("tfidf_matrix.csv", index=True)

In [None]:
# Preview the first rows; a bare last expression gets the notebook's rich
# table rendering instead of print()'s plain-text dump.
df_tfidf.head()

In [None]:
# Name the checkpoint explicitly instead of relying on the library's implicit
# default for the "summarization" task, so reruns use the same model.
# NOTE(review): this is believed to be the current default checkpoint — confirm.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")


In [None]:
# Load the cleaned pages from pdf_cleaned.json for the summarization step.
with open("pdf_cleaned.json", encoding="utf-8") as cleaned_file:
    data = json.loads(cleaned_file.read())

In [None]:
# Concatenate all pages in numeric page order, one newline between pages.
full_text = "\n".join(
    page_text
    for _, page_text in sorted(data.items(), key=lambda item: int(item[0].split('_')[1]))
)

In [None]:
# truncation=True cuts input that exceeds the model's maximum length instead of
# failing on it; for a long document only the leading part is summarized.
# do_sample=False keeps generation deterministic.
summary = summarizer(
    full_text,
    max_length=150,
    min_length=50,
    do_sample=False,
    truncation=True,
)

In [None]:
# sep="" stops print() from inserting a stray space between the heading's
# trailing newline and the summary text.
print("Summary:\n", summary[0]['summary_text'], sep="")