In [81]:
!pip install llama-index llama-index-llms-groq llama-index-embeddings-huggingface



In [82]:
import os

In [83]:
import getpass

# Never store a real key in the notebook: use the value already present in the
# environment, otherwise prompt for it without echoing what is typed.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass.getpass("Groq API key: ")

In [84]:
# Export the key as an environment variable. Groq(...) is later constructed
# without an explicit api_key, so presumably the client reads it from here
# -- TODO confirm.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [85]:
import nest_asyncio

# Patch asyncio once, up front. Presumably required so the query engine that
# is created with use_async=True can run inside the notebook's own event
# loop -- TODO confirm against the nest_asyncio documentation.
nest_asyncio.apply()

In [86]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import SummaryIndex
from llama_index.llms.groq import Groq
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [87]:
# Model identifier handed to the Groq LLM wrapper.
groq_model_name = "llama-3.1-8b-instant"
llm = Groq(model=groq_model_name)
Settings.llm = llm

# Embedding model built with the library's default arguments and registered
# on the same global Settings object.
default_embedder = HuggingFaceEmbedding()
Settings.embed_model = default_embedder

In [88]:
# Read the syllabus PDF; load_data() returns the parsed Document objects.
syllabus_pdf_path = "/content/sample_data/6400_Internship Syllabus_Summer2022.pdf"
syllabus_reader = SimpleDirectoryReader(input_files=[syllabus_pdf_path])
documents = syllabus_reader.load_data()

In [76]:
# Check which container type load_data() returned.
print(type(documents))

<class 'list'>


In [43]:
# Number of Document objects produced from the PDF.
len(documents)

6

In [44]:
# Inspect the first Document (its text and metadata).
documents[0]

Document(id_='cad6765f-23f6-4b6f-bba9-3c3be0df2439', embedding=None, metadata={'page_label': '1', 'file_name': '6400_Internship Syllabus_Summer2022.pdf', 'file_path': '/content/sample_data/6400_Internship Syllabus_Summer2022.pdf', 'file_type': 'application/pdf', 'file_size': 209787, 'creation_date': '2024-11-10', 'last_modified_date': '2024-11-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='1  \n \nDSBA 6400 – Internship  \nData Science and Business Analytics  \n                                                                                       \n                                                                                                       Revised by D r. Xi (Sunshine) Niu , May 2021  \n  Course  Credit:  3 Graduate  Credits  \n \n \nCou

In [45]:
# Inspect the second Document for comparison with the first.
documents[1]

Document(id_='7c3a7a84-e4b3-43a3-8e92-5a8e7ea57579', embedding=None, metadata={'page_label': '2', 'file_name': '6400_Internship Syllabus_Summer2022.pdf', 'file_path': '/content/sample_data/6400_Internship Syllabus_Summer2022.pdf', 'file_type': 'application/pdf', 'file_size': 209787, 'creation_date': '2024-11-10', 'last_modified_date': '2024-11-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='2  \n \nDSBA 6400 – Internship  \nData Science and Business Analytics  \n                                                                                       \n                                                                                                       Revised by D r. Xi (Sunshine) Niu , May 2021  \n Course  Competencies:  \nAfter  successfully compl

In [89]:
# Chunk the loaded documents into nodes with a sentence-aware splitter.
sentence_chunk_size = 2048
splitter = SentenceSplitter(chunk_size=sentence_chunk_size)
nodes = splitter.get_nodes_from_documents(documents)

In [90]:
# Build a SummaryIndex over the chunked nodes.
summary_index = SummaryIndex(nodes)

In [91]:
# Options for the query engine: tree-summarize response mode, async enabled.
query_engine_options = {
    "response_mode": "tree_summarize",
    "use_async": True,
}
summary_query_engine = summary_index.as_query_engine(**query_engine_options)

# **Response 1**

In [92]:
# Prompt asking the engine for a summary capped at 200 words.
summary_prompt_200_words = (
    "You are a text summarizer. provide a summary that is concise and to the point "
    "not more than 200 words. Focus on the main idea and avoid unnecessary details."
)
response = summary_query_engine.query(summary_prompt_200_words)

In [57]:
# Check the type of the object returned by query().
print(type(response))

<class 'llama_index.core.base.response.schema.Response'>


In [66]:
# Show the generated summary text.
print(response)

The internship course, DSBA 6400, is designed to provide students with practical experience in data science and business analytics. To participate, students must secure a suitable internship opportunity and develop a proposal that meets the course's fundamental aspects. 

Throughout the internship, students are expected to complete a minimum of 160 hours, submit regular progress reports, and present a final summary. The internship is evaluated based on the student's performance, as assessed by the mentor and a post-internship evaluation.

The program aims to provide students with hands-on experience, valuable skills, and measurable benefits to the organization. Academic integrity and responsible conduct are emphasized, with students required to abide by the university's code of conduct. Disability services are also available to students who require accommodations.

The internship experience is evaluated by the student, mentor, and supervising faculty member. The program serves as a val

In [60]:
!pip install pdfkit
!apt-get install wkhtmltopdf

Collecting pdfkit
  Downloading pdfkit-1.0.0-py3-none-any.whl.metadata (9.3 kB)
Downloading pdfkit-1.0.0-py3-none-any.whl (12 kB)
Installing collected packages: pdfkit
Successfully installed pdfkit-1.0.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  avahi-daemon bind9-host bind9-libs geoclue-2.0 glib-networking glib-networking-common
  glib-networking-services gsettings-desktop-schemas iio-sensor-proxy libavahi-core7 libavahi-glib1
  libdaemon0 libevdev2 libfontenc1 libgudev-1.0-0 libhyphen0 libinput-bin libinput10
  libjson-glib-1.0-0 libjson-glib-1.0-common liblmdb0 libmaxminddb0 libmbim-glib4 libmbim-proxy
  libmd4c0 libmm-glib0 libmtdev1 libnl-genl-3-200 libnotify4 libnss-mdns libproxy1v5 libqmi-glib5
  libqmi-proxy libqt5core5a libqt5dbus5 libqt5gui5 libqt5network5 libqt5positioning5
  libqt5printsupport5 libqt5qml5 libqt5qmlmodels5 libqt5quick5 libqt5sensors5 libqt5svg5
  lib

In [67]:
import html

from IPython.display import display, HTML
import pdfkit

# The LLM answer is plain text. Escape it before embedding it in markup, and
# turn blank-line separated paragraphs into <p> elements: raw text dropped into
# <body> would have its line breaks collapsed into a single block.
summary_text = response.response or ""  # the response text may be None
paragraph_html = "\n".join(
    "<p>{}</p>".format(html.escape(block.strip()).replace("\n", "<br>"))
    for block in summary_text.split("\n\n")
    if block.strip()
)
html_content = (
    '<html><head><meta charset="utf-8"></head>'
    f"<body>{paragraph_html}</body></html>"
)

# Convert the HTML content to PDF; declare the encoding so non-ASCII
# characters in the summary are rendered correctly.
pdfkit.from_string(html_content, "summary.pdf", options={"encoding": "UTF-8"})

# Display a link to the generated PDF within the Colab notebook
# (You'll need to download it from the Colab files)
display(HTML("PDF generated: <a href='summary.pdf' target='_blank'>summary.pdf</a>"))

# **Response 2**

In [104]:
# Prompt asking the engine for a one- or two-sentence summary.
summary_prompt_short = (
    "You are a text summarizer. provide a summary that is concise and to the point "
    "ideally in one or two sentences. Focus on the main idea and avoid unnecessary details."
)
response = summary_query_engine.query(summary_prompt_short)

In [105]:
# Show the generated short summary text.
print(response)

This graduate course provides students with hands-on experience in Data Science and Business Analytics, allowing them to apply theoretical knowledge in a real-world setting.


In [None]:
import html

from IPython.display import display, HTML
import pdfkit

# The LLM answer is plain text. Escape it before embedding it in markup, and
# turn blank-line separated paragraphs into <p> elements: raw text dropped into
# <body> would have its line breaks collapsed into a single block.
summary_text = response.response or ""  # the response text may be None
paragraph_html = "\n".join(
    "<p>{}</p>".format(html.escape(block.strip()).replace("\n", "<br>"))
    for block in summary_text.split("\n\n")
    if block.strip()
)
html_content = (
    '<html><head><meta charset="utf-8"></head>'
    f"<body>{paragraph_html}</body></html>"
)

# Convert the HTML content to PDF; declare the encoding so non-ASCII
# characters in the summary are rendered correctly.
pdfkit.from_string(html_content, "summary2.pdf", options={"encoding": "UTF-8"})

# Display a link to the generated PDF within the Colab notebook
# (You'll need to download it from the Colab files).
# The link text now names the same file as the href (it used to say summary.pdf).
display(HTML("PDF generated: <a href='summary2.pdf' target='_blank'>summary2.pdf</a>"))

# **Storing**

In [None]:
# Persist the summary index through its storage context, passing
# "doc_summary_index" as the persist target.
index_storage = summary_index.storage_context
index_storage.persist("doc_summary_index")

# **Another way**

OpenAI

In [None]:
!pip install --upgrade llama-index
import os
from llama_index import LLMPredictor # Changed import statement to import directly from llama_index
from llama_index import (
    ServiceContext,
    PromptHelper,
    SentenceSplitter,
    SummaryIndex,
)
from llama_index.llms import OpenAI

# Set your OpenAI API key
os.environ["OPENAI_API_KEY"] = "API key"

# Define the LLM
llm_predictor = LLMPredictor(llm=OpenAI(model="text-davinci-003")) # Change model here to 'text-davinci-003' or another suitable model.
# Configure service context
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
# Initialize prompt helper
prompt_helper = PromptHelper(max_input_size=4096, num_output=256, max_chunk_overlap=20)

# Your existing code with the new service_context:
splitter = SentenceSplitter(chunk_size=2048, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(documents, prompt_helper=prompt_helper)
summary_index = SummaryIndex(nodes, service_context=service_context)
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True
)

response = summary_query_engine.query("You are a text summarizer. provide a summary that is concise and to the point not more than 200 words. Focus on the main idea and avoid unnecessary details.")