<a href="https://colab.research.google.com/github/HafizMuhammadAnas/text_summary/blob/main/Text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# First, you need to install the necessary libraries.
# Run this cell to install the required packages.
!pip install openai langchain tiktoken pypdf unstructured pandas chromadb FPDF

In [None]:
# Import the libraries used throughout the notebook.
import os

import langchain
import numpy as np
from langchain import OpenAI
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.cluster import KMeans

In [None]:
# Obtain the OpenAI API key without storing it in the notebook.
# The key is read from the OPENAI_API_KEY environment variable; when that is
# unset or empty, the user is asked for it through a non-echoing prompt.
# NOTE: any key that was ever hard-coded in a shared notebook should be treated
# as leaked and revoked.
import os
from getpass import getpass

openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")

In [None]:
# Set up the completion model (temperature 0) and read the source PDF.
llm = OpenAI(
    openai_api_key=openai_api_key,
    temperature=0,
)

# Swap "crime-and-punishment.pdf" for the path of the PDF you want to summarize.
loader = PyPDFLoader("crime-and-punishment.pdf")
pages = loader.load()

In [None]:
# Concatenate the content of every loaded page into one string.
# A single join builds the result in one pass instead of growing the string
# with `+=` on every page.
text = "".join(page.page_content for page in pages)

# Turn tab characters into plain spaces.
text = text.replace('\t', ' ')

# Report how many tokens the model counts in the full text.
num_tokens = llm.get_num_tokens(text)
print(f"This book has {num_tokens} tokens in it")

In [None]:
# Break the full text into overlapping chunks.
# NOTE(review): tabs are replaced with spaces before this cell runs, so the "\t"
# separator looks like it can never match — confirm whether " " was intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", "\t"],
    chunk_size=10000,
    chunk_overlap=3000,
)
docs = text_splitter.create_documents([text])

# Report the resulting chunk count.
num_documents = len(docs)
print(f"Now our book is split up into {num_documents} documents")

In [None]:
# Embed the text of every chunk with OpenAI embeddings.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
chunk_texts = [chunk.page_content for chunk in docs]
vectors = embeddings.embed_documents(chunk_texts)

In [None]:
# Number of clusters for K-means: at most 20, but never more than the number of
# embedded chunks, because K-means cannot be fitted with more clusters than
# samples (a short PDF can yield fewer than 20 chunks).
num_clusters = min(20, len(vectors))

# Cluster the chunk embeddings; random_state is fixed at 42 so reruns use the same seed.
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(vectors)

In [None]:
# For each cluster centroid, find the chunk whose embedding lies nearest to it.
# The embeddings are converted to an array once, outside the loop, rather than
# implicitly on every subtraction.
vectors_array = np.asarray(vectors)

closest_indices = []
for center in kmeans.cluster_centers_:
    # Euclidean distance from every embedding to this centroid.
    distances = np.linalg.norm(vectors_array - center, axis=1)
    # Store a plain int so the value can be used directly as a list index.
    closest_indices.append(int(np.argmin(distances)))

# Ascending index order; the set drops an index that happens to be nearest to
# more than one centroid, so no chunk is selected twice.
selected_indices = sorted(set(closest_indices))

In [None]:
# Chat model used to write the per-chunk summaries.
llm3 = ChatOpenAI(
    model='gpt-3.5-turbo-16k',
    max_tokens=10000,
    temperature=0,
    openai_api_key=openai_api_key,
)

In [None]:
# Prompt used for summarizing one passage; {text} is its single input variable.
map_prompt = """
You will be given a single passage of a book. This section will be enclosed in triple backticks (```)
Your goal is to give a summary of this section so that a reader will have a full understanding of what happened.
Your response should be at least 20 pages long and fully encompass what was said in the passage.

```{text}```
FULL SUMMARY:
"""
map_prompt_template = PromptTemplate(input_variables=["text"], template=map_prompt)

# Build a "stuff" summarization chain driven by the chat model and the prompt above.
map_chain = load_summarize_chain(
    llm=llm3,
    chain_type="stuff",
    prompt=map_prompt_template,
)

# Pick out the chunks whose indices were selected as nearest to the cluster centroids.
selected_docs = [docs[index] for index in selected_indices]

In [None]:

# Collect one summary per selected chunk.
summary_list = []

# Each chunk is passed to the chain on its own, wrapped in a one-element list.
for selected_doc in selected_docs:
    summary_list.append(map_chain.run([selected_doc]))

In [None]:
# Import the FPDF library for creating PDF files.
from fpdf import FPDF

In [None]:
# Write the summaries to a PDF, starting a new page for each summary.
pdf = FPDF()
pdf.set_font("Arial", size=10)

# Upper bound on how many summaries are written (the limit the loop enforced).
max_summaries = 20

# Counts the summaries written so far (one add_page() call each).
page_counter = 0

for item in summary_list[:max_summaries]:
    pdf.add_page()
    # The built-in Arial font only covers Latin-1. Model output often contains
    # characters outside it (curly quotes, long dashes), which would otherwise
    # raise UnicodeEncodeError; unsupported characters are replaced with "?".
    safe_text = item.encode("latin-1", "replace").decode("latin-1")
    pdf.multi_cell(0, 10, safe_text)
    page_counter += 1

# Save the PDF to a file. The former `page_counter > 20` branch was removed: the
# loop never lets the counter exceed 20, so that branch could not run, and both
# branches ended in the same output() call.
pdf_filename = "summary_data.pdf"
pdf.output(pdf_filename)

In [None]:
# Merge all chunk summaries into one newline-separated text and wrap it in a Document.
combined_summary = "\n".join(summary_list)
summaries = Document(page_content=combined_summary)

# Report the token count of the combined summary.
total_tokens = llm.get_num_tokens(summaries.page_content)
print(f"Your total summary has {total_tokens} tokens")