In [2]:
!pip install openai langchain 




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: C:\Users\Mahinour Elsarky\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [3]:
!pip install tiktoken




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: C:\Users\Mahinour Elsarky\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [4]:
# LangChain basics
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain



# Langchain Loaders:
from langchain.document_loaders import YoutubeLoader
from langchain.document_loaders import WebBaseLoader

# Vector Store and retrievals
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
#import pinecone

# Chat Prompt templates for dynamic values
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

# Supporting libraries
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# Presumably this is where the OpenAI API key used by the cells below is
# kept - verify; no key is set anywhere else in the visible notebook.
load_dotenv()

True

In [5]:
!pip install bs4




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: C:\Users\Mahinour Elsarky\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [7]:

# Chat model used by both the summarize chain and the extraction chain below.
llm3 = ChatOpenAI(
    model_name="gpt-3.5-turbo-0613",
    temperature=0,
    request_timeout=180,
)


In [8]:

# Pages to pull topics from.
website_urls = [
    "https://www.sciencedaily.com/news/computers_math/artificial_intelligence/",
    "https://news.mit.edu/topic/artificial-intelligence2",
]
website_loader = WebBaseLoader(website_urls)
website_data = website_loader.load()

# Split the loaded pages into overlapping chunks for the summarize chain.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=2200)
website_docs = text_splitter.split_documents(website_data)

# Quick sanity check: number of chunks and token count of the first one.
doc_count = len(website_docs)
first_doc_tokens = llm3.get_num_tokens(website_docs[0].page_content)
print(f"You have {doc_count} docs. First doc is {first_doc_tokens} tokens")

You have 1 docs. First doc is 1336 tokens


In [10]:

# Example topic lines kept for reference only. They are comments, so they are
# NOT part of the prompt text that is sent to the model.
# % START OF EXAMPLES
# - Sam's Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
# - Shaan's Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
# - Revenge Against The Spam Calls: A couple of businesses focused on protecting consumers: RoboCall, TrueCaller, DoNotPay, FitIt
# - Wildcard CEOs vs. Prudent CEOs: However, Munger likes to surround himself with prudent CEO's and says he would never hire Musk.
# - Chess Business: Priyav, a college student, expressed his doubts on the MFM Facebook group about his Chess training business, mychesstutor.com, making $12.5K MRR with 90 enrolled.
# - Restaurant Refiller: An MFM Facebook group member commented on how they pay AirMark $1,000/month for toilet paper and toilet cover refills for their restaurant. Shaan sees an opportunity here for anyone wanting to compete against AirMark.
# - Collecting: Shaan shared an idea to build a mobile only marketplace for a collectors' category; similar to what StockX does for premium sneakers.
# % END OF EXAMPLES

# System prompt for the "map" step (it is passed as `map_prompt` to the
# summarize chain): extract topic titles + one-sentence descriptions.
# The wording consistently refers to "websites' content"; the earlier text
# had a typo ("webstes'") and a leftover reference to a "transcript".
template="""
You are a helpful assistant that helps retrieve distinct topics discussed from many websites' content
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Topics can include:
- AI tools
- GPT Models
- Google Models
- LLMs
- llama Models
- Falcon Models
- Programming Languages
- AI recent News
- AI tutorials
- OpenAI
- AI for business
- AI for education
- AI for medicine
- AI for art and music
- Deep Learning
- NLP
- Machine Learning
- Data science
- Opportunities in AI
- AI frameworks
- Future AI
- Langchain

- Provide a brief description of the topics after the topic name. Example: 'Topic: Brief Description'
- Use the same words and terminology that is said in the websites' content
- Do not respond with numbers, just bullet points of all topics listed under each other. Example:
Topics:
    - Topic 1 title: topic 1 description
    - Topic 2 title: topic 2 description
    - Topic 3 title: topic 3 description

- Ignore topics on policy and regulations
- Do not respond with anything outside of the websites' content. If you can't extract any topics at all in the whole content, say 'Sorry, No topics found in the given content'
- Only pull topics from the websites' content. Do not use the examples
- If the authors' names were mentioned in the websites' content, instead of saying 'The Author' refer to the names.
- Make your titles descriptive but concise. Example: 'Shaan's Experience at Twitch' should be 'Shaan's Interesting Projects At Twitch'
- A topic should be substantial, more than just a one-off comment

"""
system_message_prompt_map = SystemMessagePromptTemplate.from_template(template)

# The human message simply carries the text through the {text} placeholder.
human_template="Websites' Content: {text}"
human_message_prompt_map = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_map = ChatPromptTemplate.from_messages(
    messages=[system_message_prompt_map, human_message_prompt_map]
)

In [11]:
# Example topic lines kept for reference only. They are comments, so they are
# NOT part of the prompt text that is sent to the model.
# % START OF EXAMPLES
# - Sam's Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
# - Shaan's Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
# % END OF EXAMPLES

# System prompt for the "combine" step (it is passed as `combine_prompt` to
# the summarize chain): deduplicate / merge the bullet lists of topics.
template="""
You are a helpful assistant that helps retrieve topics discussed in websites' content
- You will be given a series of bullet points of topics found
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Do not respond with numbers, just bullet points of all topics listed under each other.
- Deduplicate any bullet points you see
- If you think two or more topics are similar and can be merged, merge them together with one topic title and create a new description that fits the merged topics
- Only pull topics from the websites' content. Do not use the examples.
"""
# Use *_combine names here: binding these to the *_map names would silently
# overwrite the message templates that belong to the map prompt.
system_message_prompt_combine = SystemMessagePromptTemplate.from_template(template)

# The human message simply carries the text through the {text} placeholder.
human_template="Websites' Content: {text}"
human_message_prompt_combine = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_combine = ChatPromptTemplate.from_messages(
    messages=[system_message_prompt_combine, human_message_prompt_combine]
)



In [12]:
# Map-reduce summarize chain: `chat_prompt_map` is the map prompt and
# `chat_prompt_combine` is the combine prompt; verbose=True logs the
# formatted prompts while the chain runs.
chain = load_summarize_chain(
    llm3,
    chain_type="map_reduce",
    map_prompt=chat_prompt_map,
    combine_prompt=chat_prompt_combine,
    verbose=True,
)

In [14]:
# Run the map-reduce chain over the split website chunks. Because the chain
# was built with verbose=True, the formatted prompts are logged below.
topics_found = chain.run({"input_documents": website_docs})



> Entering new MapReduceDocumentsChain chain...


> Entering new LLMChain chain...
Prompt after formatting:
System:
You are a helpful assistant that helps retrieve distinct topics discussed from many websites' content
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Topics can include:
- AI tools
- GPT Models
- Google Models
- LLMs
- llama Models
- Falcon Models
- Programming Languages
- AI recent News
- AI tutorials
- OpenAI
- AI for business 
- AI for education
- AI for medicine 
- AI for art and music
- Deep Learning
- NLP
- Machine Learning
- Data science
- Opportunities in AI
- AI frameworks
- Future AI
- Langchain

- Provide a brief description of the topics after the topic name. Example: 'Topic: Brief Description'
- Use the same words and terminology that is said in the websites' content
- Do not respond with numbers, just bullet points of all topics listed under each other. Example:
Topics:
    - Topic 1 t

In [15]:
# print() is used so the bullet list is shown one topic per line.
print(topics_found)

- Artificial intelligence: MIT News covers advancements, applications, and research projects related to AI.
- Fast-tracking fusion energy's arrival with AI and accessibility: MIT Plasma Science and Fusion Center uses AI to improve access to fusion data and increase workforce diversity.
- Autonomous innovations in an uncertain world: Aerospace Controls Laboratory develops planning algorithms for autonomous vehicles to navigate dynamic environments.
- Helping high schoolers prepare for the rise of AI: A one-week summer program aims to foster understanding of machine-learning approaches in health among young minds.
- Supporting sustainability, digital health, and the future of work: The MIT and Accenture Convergence Initiative selects three research projects to support.
- AI helps robots manipulate objects with their whole bodies: A new technique allows robots to reason efficiently about moving objects.
- SMART launches research group to advance AI, automation, and the future of work: M3S

In [16]:
!pip install kor




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: C:\Users\Mahinour Elsarky\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [17]:

from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

In [18]:
# (title, description, tag) triples used to build the few-shot examples for
# the Kor schema below. Titles carry no trailing colon and descriptions no
# leading space, so the examples show exactly where "Title: Description"
# should be split.
_TOPIC_EXAMPLES = [
    (
        "Generative AI",
        "An exciting tool in AI that can generate text based on input prompts.",
        "AI Tools",
    ),
    (
        "AI in relationships",
        "The use of AI in relationship coaching and mentoring, such as the example of an AI-powered romantic relationship coaching app.",
        "AI Applications",
    ),
    (
        "AI in different industries",
        "Exploring the potential of AI in various industries beyond consumer software and the internet.",
        "AI Opportunities",
    ),
    (
        "Supervised Learning",
        "A technique in AI that is good at labeling things or computing input to outputs.",
        "AI Tools",
    ),
    (
        "Large Language Models",
        "The power and potential of large language models in AI applications.",
        "AI LLMs",
    ),
    (
        "Future growth of AI technologies",
        "The prediction that supervised learning and generative AI will continue to grow in value and adoption over the next three years, with the potential for even greater expansion in the long term.",
        "Future of AI",
    ),
    (
        "Large Language Models",
        "Dr. Andrew explains how large language models, like GPT, are built using supervised learning to predict the next word, enabling applications that can be built faster and more efficiently.",
        "AI LLMs",
    ),
]

# Kor schema for one extracted topic record (title / description / tag).
# Each example pairs an input line "Title: Description" with ONE record dict
# whose keys match the attribute ids declared below. The previous version
# split every record into three single-key dicts and used the key
# "topic title", which is not a declared attribute id.
schema = Object(
    id="topic",
    description="Topic Information",
    examples=[
        (
            f"{title}: {description}",
            [{"title": title, "description": description, "tag": tag}],
        )
        for title, description, tag in _TOPIC_EXAMPLES
    ],
    attributes=[
        Text(
            id="title",
            description="The title of the topic listed",
        ),
        Text(
            id="description",
            description="The description of the topic listed",
        ),
        Text(
            id="tag",
            description="The type of content being described",
        ),
    ],
    many=True,  # the input text lists several topics
)

In [19]:
# Argument order is (llm, schema), not (schema, llm).
# NOTE: this rebinds `chain`, which until this cell held the map-reduce
# summarize chain; re-running the earlier `chain.run({"input_documents": ...})`
# cell after this one would call the extraction chain instead.
chain = create_extraction_chain(llm3, schema)

In [20]:
# Pass the bullet-list text from the summarize step to the extraction chain
# (`chain` is the Kor extraction chain at this point).
topics_structured = chain.run(topics_found)

In [21]:
# Show the parsed result: a dict whose 'data' -> 'topic' entry is a list of
# records with 'title', 'description' and 'tag' keys.
topics_structured

{'data': {'topic': [{'title': 'Artificial intelligence: MIT News covers advancements, applications, and research projects related to AI.',
    'description': '',
    'tag': 'AI News'},
   {'title': "Fast-tracking fusion energy's arrival with AI and accessibility: MIT Plasma Science and Fusion Center uses AI to improve access to fusion data and increase workforce diversity.",
    'description': '',
    'tag': 'AI Applications'},
   {'title': 'Autonomous innovations in an uncertain world: Aerospace Controls Laboratory develops planning algorithms for autonomous vehicles to navigate dynamic environments.',
    'description': '',
    'tag': 'AI Innovations'},
   {'title': 'Helping high schoolers prepare for the rise of AI: A one-week summer program aims to foster understanding of machine-learning approaches in health among young minds.',
    'description': '',
    'tag': 'AI Education'},
   {'title': 'Supporting sustainability, digital health, and the future of work: The MIT and Accenture 