# Extract data from web URLs

### Step 1: install necessary packages 

In [1]:
#! pip install langchain
# ! pip install langchain-core
# ! pip install langchain-community
# ! pip install groq 
#! pip install gephistreamer

### Step 2: Create a list of the web URLs to extract data from

In [2]:
# Wikipedia articles of top tech leaders, built from each article's page slug
url_list = [
    f'https://en.wikipedia.org/wiki/{page_slug}'
    for page_slug in (
        'Elon_Musk',
        'Mark_Zuckerberg',
        'Bill_Gates',
        'Jeff_Bezos',
        'Steve_Jobs',
        'Sam_Altman',
        'Larry_Ellison',
        'Larry_Page',
        'Sundar_Pichai',
        'Satya_Nadella',
    )
]

### Step 3: Define a function to clean the data extracted from the web URLs

In [3]:
# define a function to clean the extracted web URL data
import re #for regular expression 

def clean_text(text):
    """Strip markup, URLs and punctuation from text and normalise its whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string: HTML tags and http(s) URLs removed, every
        character other than ASCII letters, digits and whitespace removed,
        and each run of whitespace collapsed to a single space with no
        leading or trailing space.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]*?>', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove special characters. Whitespace of every kind (newlines, tabs, ...)
    # is kept here on purpose: deleting it would glue together words that were
    # only separated by a line break, e.g. "Elon\nMusk" -> "ElonMusk".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # split() with no argument splits on any whitespace run and ignores leading
    # and trailing whitespace, so this both collapses and trims in one step.
    return ' '.join(text.split())

### Step 4: Define a function to extract the data from the web URLs using the LangChain framework

In [4]:
# extract the data from the URLs
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader

def extract_data_from_URL(url, chunk_size=3000, chunk_overlap=100):
    """Download one web page, clean its text and split it into chunks.

    Args:
        url: Address of the page to load.
        chunk_size: Value passed to RecursiveCharacterTextSplitter as
            ``chunk_size``. Defaults to 3000.
        chunk_overlap: Value passed to RecursiveCharacterTextSplitter as
            ``chunk_overlap``. Defaults to 100.

    Returns:
        The list of Document chunks produced by the splitter.

    Side effects:
        Prints the number of chunks produced.
    """
    loader=WebBaseLoader([url])
    # A single URL is loaded, so pop() takes the content of that one document
    data=loader.load().pop().page_content
    data=clean_text(data)
    # Re-wrap the cleaned text so the splitter receives a Document
    documents=[Document(page_content=data)]
    splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    smaller_doc=splitter.split_documents(documents)
    print(len(smaller_doc))
    return smaller_doc

USER_AGENT environment variable not set, consider setting it to identify your requests.


### Step 5: GROQ API 

In [7]:
from groq import Groq 
import os 

# Groq client; the API key is read from the GROQ_API_KEY environment variable
client = Groq(api_key=os.environ.get('GROQ_API_KEY'))

In [8]:
# System prompt sent with every chat-completion request.
# It asks the model to reply with nothing but a JSON list of objects, each
# holding the keys "node_1", "node_2" and "edge"; later cells parse the
# replies with json.loads and read exactly those three keys.

system=""" You are a network graph maker tasked with analyzing the relationships involving top entrepreneurs. Your job is to process the provided context chunk (delimited by ```) and extract an ontology of terms that represent key entrepreneurs, their associated entities, and all kinds of relationships present in the context.

**Guidelines for Extraction:**

1. **Identify Key Entrepreneurs and Related Terms**:
   - Extract key entrepreneurs and related concepts such as:
     - Companies, organizations, or industries they are associated with.
     - Collaborators, partners, rivals, or competitors.
     - Key innovations, achievements, or milestones.
     - Locations, events, or time periods relevant to their actions.

2. **Identify Relationships**:
   - Extract all types of relationships between entrepreneurs and other entities (or between entities themselves).
   - Relationships can include:
     - Professional roles or associations.
     - Business partnerships, collaborations, or rivalries.
     - Innovations or contributions to industries.
     - Personal connections or influences.
     - Historical events or shared milestones.

3. **Define Relationships**:
   - Clearly specify the nature of each relationship in simple and concise terms.
   - Relationships should convey meaningful connections relevant to the context.

**Response Format**:
- Provide your output **strictly as a list of JSON objects**. No additional text, descriptions, or comments are allowed.
- Each object should include the following fields:
  - `"node_1"`: The first entity in the relationship (can be a person, organization, or concept).
  - `"node_2"`: The second entity in the relationship.
  - `"edge"`: A concise sentence describing the relationship between `node_1` and `node_2`.

**Example Output**:
[
   {
       "node_1": "Elon Musk",
       "node_2": "SpaceX",
       "edge": "Elon Musk founded SpaceX to revolutionize space exploration."
   },
   {
       "node_1": "Steve Jobs",
       "node_2": "Apple Inc.",
       "edge": "Steve Jobs co-founded Apple Inc., a leading tech company."
   },
   {
       "node_1": "Mark Zuckerberg",
       "node_2": "Sheryl Sandberg",
       "edge": "Sheryl Sandberg worked closely with Mark Zuckerberg as COO of Facebook."
   },
   {
       "node_1": "Jeff Bezos",
       "node_2": "Blue Origin",
       "edge": "Jeff Bezos founded Blue Origin to focus on space exploration."
   }
]

**Important Note**:
- Always respond exclusively in JSON format. Any deviation from the JSON structure or inclusion of additional text will not be accepted.

Please provide the context containing information about entrepreneurs and their relationships for analysis.

""" 

### Step 6: Extract the nodes and edges from the Wikipedia links using LLM models
1. Cycle through the LLM models while extracting the data (Groq's free tier has a rate limit for each model, so we switch to the next model when a limit is hit).
2. Loop through the URLs to extract the page text (using the `extract_data_from_URL` function defined earlier).
3. For each chunk of extracted text, use the LLM to get the nodes and edges in JSON format.

In [9]:
import requests
from datetime import datetime
from itertools import cycle

# Raw LLM replies (one string per successfully processed chunk)
results=[]
models = [
    'gemma-7b-it',
    'llama-3.1-8b-instant',
    'llama3-groq-8b-8192-tool-use-preview',
    'llama3-8b-8192',
    'llama-3.2-1b-preview'
]
# Upper bound on the number of chunks sent to the LLM for a single URL
MAX_CHUNKS_PER_URL = 50

model_cycle = cycle(models)  # Create an infinite cycle of models
model_name = next(model_cycle)  # Start with the first model
start_time=datetime.now()
for url in url_list:
    # A page that cannot be loaded or split is reported and skipped;
    # the remaining URLs are still processed.
    try:
        smaller_doc=extract_data_from_URL(url)
    except Exception as e:
        print('Exception',e)
        continue

    for doc in smaller_doc[:MAX_CHUNKS_PER_URL]:
        # Retry the SAME chunk on the next model when a rate limit is hit,
        # giving every model at most one attempt so the loop always ends.
        # Handling the error per chunk (rather than per URL) keeps one
        # failure from discarding the rest of the page.
        for _attempt in range(len(models)):
            try:
                chat_completion = client.chat.completions.create(
                    messages=[
                        {
                            "role": "system",
                            "content": system
                        },
                        {
                            "role": "user",
                            # The system prompt says the context chunk is
                            # delimited by ```, so send it that way.
                            "content": f"```\n{doc.page_content}\n```",
                        }
                    ],
                    model=model_name,
                )
            except Exception as e:
                print('Exception',e)
                # str(e) is safe for any exception; e.args[0] could be
                # missing or not a string and would raise inside this handler.
                if 'rate_limit_exceeded' in str(e):
                    print('Rate limit exceeded for model:', model_name)
                    model_name = next(model_cycle)  # Switch to the next model
                    print('Switching to model:', model_name)
                    continue
                # Any other error: give up on this chunk only
                break
            results.append(chat_completion.choices[0].message.content)
            break

end_time=datetime.now()
print(f'collected {len(results)} responses')
print(f'extracted information in {end_time-start_time}')

80


KeyboardInterrupt: 

### Step 7: Work with the JSON results from the LLM
- The next time we run the notebook we don't need to call the LLM again to capture the nodes and edges, so we store the results in a JSON file.
- In some cases the LLM returns a malformed JSON object; those responses are excluded.
- All valid JSON objects are stored in the file `Nodes_and_edges.json`.

In [22]:
print(f'extracted information in {end_time-start_time}')
import json 

# Keys every relationship object must carry; the Gephi step reads all three
_REQUIRED_KEYS = ('node_1', 'node_2', 'edge')

def _parse_llm_response(raw):
    """Turn one raw LLM reply into a list of valid relationship dicts.

    Args:
        raw: The reply text returned by the model.

    Returns:
        A list of dicts that each contain "node_1", "node_2" and "edge".

    Raises:
        ValueError: If ``raw`` is not a string, or the decoded JSON is neither
            a list nor a single object. (json.JSONDecodeError, raised when no
            JSON can be decoded, is a subclass of ValueError.)
    """
    if not isinstance(raw, str):
        raise ValueError(f'expected a string response, got {type(raw).__name__}')
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # The reply may surround the JSON with extra text or code fences;
        # retry on the outermost [...] span before giving up.
        start, end = raw.find('['), raw.rfind(']')
        if start == -1 or end <= start:
            raise
        parsed = json.loads(raw[start:end + 1])
    # A single object is accepted as a one-element list. Without this,
    # list.extend() on a dict would add its KEYS (plain strings) as entries.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        raise ValueError(f'expected a JSON list, got {type(parsed).__name__}')
    valid = [
        item for item in parsed
        if isinstance(item, dict) and all(key in item for key in _REQUIRED_KEYS)
    ]
    if len(valid) != len(parsed):
        print('buggy JSON object', f'dropped {len(parsed) - len(valid)} malformed entries')
    return valid

combined_nodes_and_edges=[]
for res in results:
    try: 
        combined_nodes_and_edges.extend(_parse_llm_response(res)) #convert the string result from LLM to JSON 
    except Exception as e:
        print('buggy JSON object', e)
with open('Nodes_and_edges.json','w') as file:
    json.dump(combined_nodes_and_edges,file,indent=1)

extracted information in 0:17:03.936263
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 12 column 3 (char 404)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 0)
buggy JSON object Expecting value: line 1 column 1 (char 

### Step 8: Set up Gephi and start the server

### Step 9: Send the JSON data to Gephi
- Connect to the Gephi server using `gephistreamer`
- Read the saved JSON file
- Push the data to Gephi

In [26]:
from gephistreamer import graph
from gephistreamer import streamer
# Open a stream to the Gephi server running on this machine;
# nodes and edges added to `stream` are sent to the named workspace.
stream = streamer.Streamer(
    streamer.GephiWS(
        hostname="localhost",
        port=8080,
        workspace="workspace1",
    )
)

In [28]:
# Read back the relationships that were saved to Nodes_and_edges.json
with open('Nodes_and_edges.json', 'r') as saved_json:
    results = json.loads(saved_json.read())

In [30]:
# Send every stored relationship to Gephi as two nodes joined by one edge.
# An entry that cannot be sent is printed and skipped.
for record in results:
    try:
        source_node = graph.Node(record['node_1'], custom_property=1)
        target_node = graph.Node(record['node_2'], custom_property=2)
        stream.add_node(source_node, target_node)
        # The relationship sentence travels with the edge as a custom property
        stream.add_edge(
            graph.Edge(source_node, target_node, custom_property=record['edge'])
        )
    except Exception as err:
        print('buggy JSON object', err, record)