In [1]:
import configparser
import os

import openai
import requests
from bs4 import BeautifulSoup
from langchain.chat_models import ChatOpenAI
from llama_index import GPTListIndex, LLMPredictor, ServiceContext, SimpleDirectoryReader

# Load the OpenAI credential from a local INI file and export it as the
# OPENAI_API_KEY environment variable.
config = configparser.ConfigParser()
# ConfigParser.read() does not raise when a file is missing; it returns the
# list of files it managed to parse. Check it so a missing config fails with a
# clear message instead of a later KeyError on the 'OpenAI' section.
if not config.read('my_config.ini'):
    raise FileNotFoundError(
        "Configuration file 'my_config.ini' was not found or could not be read"
    )
os.environ['OPENAI_API_KEY'] = config['OpenAI']['password']

In [2]:
# Characters that are path separators or are rejected in file names on common
# file systems; each one is replaced by an underscore.
_UNSAFE_FILENAME_CHARS = '<>:"/\\|?*'


def _safe_filename(name):
    """Return ``name`` with path-unsafe characters replaced by underscores."""
    cleaned = "".join(
        "_" if (ch in _UNSAFE_FILENAME_CHARS or ord(ch) < 32) else ch
        for ch in name
    )
    # An empty name would produce the hidden file ".txt"; use a visible fallback.
    return cleaned or "untitled"


def scrape_all_posts(url, output_dir="../data", timeout=30):
    """Download blog posts as JSON and save each post's text to its own file.

    The endpoint is expected to return a JSON list in which every item has
    ``['title']['rendered']`` and ``['content']['rendered']`` entries. The
    rendered HTML content is reduced to plain text, lightly cleaned, and
    written to ``<output_dir>/<title>.txt``.

    Args:
        url: Address of the JSON endpoint listing the posts.
        output_dir: Directory that receives the text files; created if it
            does not exist. Defaults to the previously hard-coded "../data".
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. The result is the set of files written to ``output_dir``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Scrape web; fail fast on HTTP errors instead of parsing an error page.
    reply = requests.get(url, timeout=timeout)
    reply.raise_for_status()
    blog_json = reply.json()

    os.makedirs(output_dir, exist_ok=True)

    # Save text file per post
    for post in blog_json:
        # Title
        title = post['title']['rendered']

        # Content: name the parser explicitly so the extracted text does not
        # depend on which optional parsers happen to be installed.
        entry_content = post['content']['rendered']
        entry_text = BeautifulSoup(entry_content, "html.parser").get_text()

        # Clean Content
        entry_text = entry_text.replace('\xa0', '')
        entry_text = entry_text.replace('\n3', '')
        entry_text = entry_text.replace('\n1', '')
        entry_text = entry_text.replace('\n', ' ')

        # Save text; titles may contain characters such as "/" or "?" that
        # would otherwise break the path or point outside output_dir.
        target = os.path.join(output_dir, _safe_filename(title) + '.txt')
        with open(target, 'w', encoding='utf-8') as f:
            f.write(entry_text)


def train_LLM(source, temperature=0.7, model_name="gpt-3.5-turbo", max_tokens=512):
    """Build a query engine over the documents found in ``source``.

    Despite the name, no model is trained here: the documents are loaded,
    placed in a ``GPTListIndex``, and a query engine over that index is
    returned.

    Args:
        source: Directory handed to ``SimpleDirectoryReader``.
        temperature: ``temperature`` argument for ``ChatOpenAI``.
        model_name: ``model_name`` argument for ``ChatOpenAI``.
        max_tokens: ``max_tokens`` argument for ``ChatOpenAI``.

    Returns:
        The object returned by the index's ``as_query_engine()``.
    """
    # Read documents to index
    docs = SimpleDirectoryReader(source).load_data()

    # Configure the chat model used to answer queries
    llm_predictor = LLMPredictor(
        llm=ChatOpenAI(
            temperature=temperature,
            model_name=model_name,
            max_tokens=max_tokens,
        )
    )
    # The predictor only takes effect when it is handed to the index through a
    # service context; previously it was created and then never used, so the
    # settings above were ignored.
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
    output_index = GPTListIndex.from_documents(docs, service_context=service_context)

    # Engine
    return output_index.as_query_engine()

def question_LLM(engine, question):
    """Submit ``question`` to ``engine`` and return its answer unchanged.

    Args:
        engine: Object exposing a ``query`` method.
        question: Value forwarded to ``engine.query``.

    Returns:
        Whatever ``engine.query(question)`` returns.
    """
    return engine.query(question)


In [3]:
# Pipeline inputs: the blog's WordPress REST endpoint, the folder holding the
# scraped posts, and the question to ask about them.
source = "../data"
url = 'https://www.saramarlop.com/wp-json/wp/v2/posts'
question = "What is this document about?"

# 1) download the posts, 2) build the query engine, 3) ask the question.
scrape_all_posts(url)
engine = train_LLM(source)
response = question_LLM(engine, question)

In [5]:
# Display the answer text carried by the response object.
response.response

'\nThis document is about the concept of Large Language Models (LLM) and how they relate to Artificial Intelligence (AI), Machine Learning (ML), Deep Learning (DL), and Generative AI. It also discusses the cognitive bias of scope insensitivity and the martial art of rationality.'