# Extract data from web URLs

In [1]:
import os
from langchain_core.documents import Document 
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Wikipedia article slugs of the entrepreneurs to process.
# Entries that are commented out are deliberately skipped.
WIKIPEDIA_BASE = 'https://en.wikipedia.org/wiki/'
article_slugs = (
    'Dana_White',
    'Elon_Musk',
    'Mark_Zuckerberg',
    'Bill_Gates',
    'Jeff_Bezos',
    'Steve_Jobs',
    'Sam_Altman',
    # 'Mukesh_Ambani',
    # 'Jensen_Huang',
    # 'Satoshi_Nakamoto',
    # 'Donald_Trump',
    'Larry_Ellison',
    'Larry_Page',
    'Mark_Cuban',
    # 'Ratan_Tata',
    # 'Brian_Chesky',
    # 'Jack_Dorsey',
    'Satya_Nadella',
)

# Full page addresses, in the same order as the slugs above
urls = [WIKIPEDIA_BASE + slug for slug in article_slugs]

# Independent copy of the address list
url_list = list(urls)

# Show the addresses that will be fetched
print(url_list)


['https://en.wikipedia.org/wiki/Dana_White', 'https://en.wikipedia.org/wiki/Elon_Musk', 'https://en.wikipedia.org/wiki/Mark_Zuckerberg', 'https://en.wikipedia.org/wiki/Bill_Gates', 'https://en.wikipedia.org/wiki/Jeff_Bezos', 'https://en.wikipedia.org/wiki/Steve_Jobs', 'https://en.wikipedia.org/wiki/Sam_Altman', 'https://en.wikipedia.org/wiki/Larry_Ellison', 'https://en.wikipedia.org/wiki/Larry_Page', 'https://en.wikipedia.org/wiki/Mark_Cuban', 'https://en.wikipedia.org/wiki/Satya_Nadella']


In [3]:
import re

def clean_text(text):
    """Reduce raw page text to plain alphanumeric words separated by single spaces.

    Steps, in order: strip HTML tags, strip http(s) URLs, turn every run of
    whitespace (newlines, tabs, non-breaking spaces, ...) into one space,
    drop every character that is not an ASCII letter, digit or space, and
    finally collapse/trim the remaining spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; an empty string if nothing survives the cleaning.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]*?>', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Normalize whitespace BEFORE filtering characters: the filter below keeps
    # only the plain space, so a newline or tab left in place would be deleted
    # and the words on either side of it would be glued together.
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    # Collapse the gaps left by removed characters and trim both ends
    return ' '.join(text.split())

In [4]:
# extract the data from the URLs 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader

def extract_data_from_URL(url, chunk_size=3000, chunk_overlap=100):
    """Fetch one web page, clean its text and split it into chunks.

    Args:
        url: Address of the page to load.
        chunk_size: Value passed to the splitter as ``chunk_size``
            (defaults to the previously hard-coded 3000).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``
            (defaults to the previously hard-coded 100).

    Returns:
        The list of ``Document`` chunks produced by the splitter, or an empty
        list when the loader returned no document for ``url``.
    """
    loader = WebBaseLoader([url])
    loaded_docs = loader.load()
    if not loaded_docs:
        # Nothing was loaded: report zero chunks instead of failing on an empty list
        print(0)
        return []
    # Use the last loaded document, exactly as the former `.pop()` did
    data = clean_text(loaded_docs[-1].page_content)
    documents = [Document(page_content=data)]
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    smaller_doc = splitter.split_documents(documents)
    # Number of chunks produced for this page
    print(len(smaller_doc))
    return smaller_doc

# GROQ API

In [5]:
import os 
from groq import Groq 
from openai import AzureOpenAI

# Groq client; the key is read from the GROQ_API_KEY environment variable
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))

# Disabled alternative: Azure OpenAI client configured from the environment
# client=AzureOpenAI(
#         api_key=os.getenv('AZURE_OPENAI_API_KEY'),
#         azure_endpoint=os.getenv('AZURE_OpenAI_ENDPOINT'),
#         api_version=os.getenv('AZURE_OpenAI_API_VERSION')
#     )

In [6]:
# System prompt sent with every text chunk. It instructs the model to return
# the entities and relationships found in the chunk as a JSON list of objects
# with the keys "node_1", "node_2" and "edge".
# NOTE(review): the prompt says the context is delimited by ```, but the chunk
# is sent as the plain user message without delimiters - confirm this is intended.
# NOTE(review): the closing "Please provide the context..." sentence may invite
# a prose reply instead of JSON - verify against the parse failures seen later.

system=""" You are a network graph maker tasked with analyzing the relationships involving top entrepreneurs. Your job is to process the provided context chunk (delimited by ```) and extract an ontology of terms that represent key entrepreneurs, their associated entities, and all kinds of relationships present in the context.

**Guidelines for Extraction:**

1. **Identify Key Entrepreneurs and Related Terms**:
   - Extract key entrepreneurs and related concepts such as:
     - Companies, organizations, or industries they are associated with.
     - Collaborators, partners, rivals, or competitors.
     - Key innovations, achievements, or milestones.
     - Locations, events, or time periods relevant to their actions.

2. **Identify Relationships**:
   - Extract all types of relationships between entrepreneurs and other entities (or between entities themselves).
   - Relationships can include:
     - Professional roles or associations.
     - Business partnerships, collaborations, or rivalries.
     - Innovations or contributions to industries.
     - Personal connections or influences.
     - Historical events or shared milestones.

3. **Define Relationships**:
   - Clearly specify the nature of each relationship in simple and concise terms.
   - Relationships should convey meaningful connections relevant to the context.

**Response Format**:
- Provide your output **strictly as a list of JSON objects**. No additional text, descriptions, or comments are allowed.
- Each object should include the following fields:
  - `"node_1"`: The first entity in the relationship (can be a person, organization, or concept).
  - `"node_2"`: The second entity in the relationship.
  - `"edge"`: A concise sentence describing the relationship between `node_1` and `node_2`.

**Example Output**:
[
   {
       "node_1": "Elon Musk",
       "node_2": "SpaceX",
       "edge": "Elon Musk founded SpaceX to revolutionize space exploration."
   },
   {
       "node_1": "Steve Jobs",
       "node_2": "Apple Inc.",
       "edge": "Steve Jobs co-founded Apple Inc., a leading tech company."
   },
   {
       "node_1": "Mark Zuckerberg",
       "node_2": "Sheryl Sandberg",
       "edge": "Sheryl Sandberg worked closely with Mark Zuckerberg as COO of Facebook."
   },
   {
       "node_1": "Jeff Bezos",
       "node_2": "Blue Origin",
       "edge": "Jeff Bezos founded Blue Origin to focus on space exploration."
   }
]

**Important Note**:
- Always respond exclusively in JSON format. Any deviation from the JSON structure or inclusion of additional text will not be accepted.

Please provide the context containing information about entrepreneurs and their relationships for analysis.

""" 

In [7]:
import requests
import time
from itertools import cycle
# Raw model responses, one entry per successfully processed chunk
results=[]
# Models to rotate through whenever the current one reports a rate limit
models = [
    'llama-3.1-8b-instant',
    'llama3-groq-8b-8192-tool-use-preview',
    'llama3-8b-8192',
    'llama-3.2-1b-preview'
]
# Upper bound on the number of chunks sent to the model for a single URL
MAX_CHUNKS_PER_URL = 50
model_cycle = cycle(models)  # Create an infinite cycle of models
model_name = next(model_cycle)  # Start with the first model

for url in urls:
    try:
        smaller_doc = extract_data_from_URL(url)
        for doc in smaller_doc[:MAX_CHUNKS_PER_URL]:
            # Give the chunk one attempt per model, so a rate limit on the
            # current model no longer drops this chunk and the rest of the URL.
            for _attempt in range(len(models)):
                try:
                    chat_completion = client.chat.completions.create(
                        messages=[
                            {"role": "system", "content": system},
                            {"role": "user", "content": doc.page_content},
                        ],
                        model=model_name,
                        #model='gpt-4'
                    )
                except Exception as e:
                    # str(e) is safe even when e.args is empty or not a string
                    if 'rate_limit_exceeded' not in str(e):
                        raise  # not a rate limit: handled by the per-URL handler below
                    print('Exception', e)
                    print('Rate limit exceeded for model:', model_name)
                    model_name = next(model_cycle)  # Switch to the next model
                    print('Switching to model:', model_name)
                else:
                    results.append(chat_completion.choices[0].message.content)
                    break
            else:
                # Every model was rate limited for this chunk: stop hammering the API
                print('All models are rate limited; skipping the rest of', url)
                break
    except Exception as e:
        # Any other failure abandons the current URL and moves on to the next one
        print('Exception', e)

len(results)

11


KeyboardInterrupt: 

# Python to Gephi using an LLM

In [38]:
# connect to gephi server
from gephistreamer import graph
from gephistreamer import streamer
import json

# Build the Gephi endpoint first, then wrap it in the streamer used to push nodes and edges
gephi_endpoint = streamer.GephiWS(hostname="localhost", port=8080, workspace="workspace1")
stream = streamer.Streamer(gephi_endpoint)


In [None]:
def _parse_relationships(raw_response):
    """Parse one model response into a list of relationship records.

    The response is first parsed as-is. If that fails, the text between the
    first '[' and the last ']' is parsed instead, which recovers responses
    where the JSON list is surrounded by extra prose or code fences.

    Args:
        raw_response: The text returned by the model.

    Returns:
        A list of parsed records; a single JSON object is wrapped in a list.

    Raises:
        json.JSONDecodeError: If no parseable JSON is found in the text.
    """
    try:
        parsed = json.loads(raw_response)
    except json.JSONDecodeError:
        start = raw_response.find('[')
        end = raw_response.rfind(']')
        if start == -1 or end <= start:
            raise
        parsed = json.loads(raw_response[start:end + 1])
    if isinstance(parsed, dict):
        parsed = [parsed]
    return parsed


# Every record that was successfully streamed to Gephi
combined_json_object=[]
for res in results:
    try:
        json_object = _parse_relationships(res)
    except Exception as e:
        print('buggy JSON object', e)
        continue
    print(res)
    for data in json_object:
        # Handle each record on its own so one malformed entry does not
        # discard the remaining records of the same response.
        try:
            node_a = graph.Node(data['node_1'],custom_property=1)
            node_b = graph.Node(data['node_2'],custom_property=2)
            stream.add_node(node_a,node_b)
            edge_ab = graph.Edge(node_a,node_b,custom_property=data['edge'])
            stream.add_edge(edge_ab)
            combined_json_object.append(data)
        except Exception as e:
            print('buggy JSON object', e)
    

[ {
  "node_1": "Dana White",
  "node_2": "Ultimate Fighting Championship",
  "edge": "Dana White is the CEO and President of the Ultimate Fighting Championship."
}, {
  "node_1": "Dana White",
  "node_2": "Anne White",
  "edge": "Anne White is Dana White's wife."
}, {
  "node_1": "Dana White",
  "node_2": "Manchester, Connecticut",
  "edge": "Dana White was born in Manchester, Connecticut, on July 28, 1969."
}, {
  "node_1": "Dana White",
  "node_2": "Las Vegas",
  "edge": "When Dana White was in third grade, his family moved to Las Vegas."
}, {
  "node_1": "Dana White",
  "node_2": "Bishop Gorman High School",
  "edge": "Dana White attended Bishop Gorman High School in Las Vegas, Nevada."
}, {
  "node_1": "Dana White",
  "node_2": "Lorenzo Fertitta",
  "edge": "Dana White first met Lorenzo Fertitta while attending Bishop Gorman High School, although they became close friends later."
}, {
  "node_1": "Dana White",
  "node_2": "Levant, Maine",
  "edge": "During his childhood, Dana Whit

In [None]:
# combined_objects=[]
# for res in results:
#     try: 
#         combined_objects.extend(json.loads(res))
#     except Exception as e:
#         print('buggy JSON object', e)
# with open('30minRun.json','w') as file:
#     json.dump(combined_objects,file,indent=1)

buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 