In [1]:
from firecrawl import FirecrawlApp
from openai import OpenAI
from dotenv import load_dotenv
import os 
import json 
import pandas as pd 
from datetime import datetime

In [17]:
def scrape_data(url):
    """Scrape a single page through Firecrawl and return its markdown.

    Environment variables are loaded via ``load_dotenv()`` on every call and
    the Firecrawl API key is read from ``FIRECRAWL_KEY``.

    Args:
        url: Address of the page to scrape.

    Returns:
        The value stored under the ``'markdown'`` key of the Firecrawl result.

    Raises:
        KeyError: If the Firecrawl result has no ``'markdown'`` key.
    """
    load_dotenv()
    firecrawl = FirecrawlApp(api_key=os.getenv('FIRECRAWL_KEY'))

    # Request option passed straight through to Firecrawl
    # (presumably strips navigation/boilerplate -- confirm against the Firecrawl docs).
    scrape_options = {'pageOptions': {'onlyMainContent': True}}
    result = firecrawl.scrape_url(url, scrape_options)

    # Guard clause: fail loudly when the expected payload is missing.
    if 'markdown' not in result:
        raise KeyError("The key 'markdown' does not exist in the scraped data.")
    return result['markdown']

In [26]:
def save_raw_data(raw_data, timestamp, output_folder='scraping_output'):
    """Write the scraped markdown to ``<output_folder>/rawData_<timestamp>.md``.

    Args:
        raw_data: Text to write to the file.
        timestamp: String embedded in the file name.
        output_folder: Destination directory; created if it does not exist.

    Returns:
        None. The destination path is reported on stdout.
    """
    os.makedirs(output_folder, exist_ok=True)

    destination = os.path.join(output_folder, f'rawData_{timestamp}.md')
    with open(destination, 'w', encoding='utf-8') as markdown_file:
        markdown_file.write(raw_data)

    print(f"Raw data saved to {destination}")

In [45]:
def format_data(data, fields=None):
    """Ask the OpenAI chat API to extract `fields` from `data` as JSON.

    Environment variables are loaded via ``load_dotenv()`` on every call and
    the API key is read from ``OPENAI_KEY``. The request uses JSON mode
    (``response_format={"type": "json_object"}``).

    Args:
        data: Page text that is interpolated into the user prompt.
        fields: Names of the pieces of information to extract. When ``None``,
            a default list of real-estate listing fields is used.

    Returns:
        The object produced by ``json.loads`` on the model's reply.

    Raises:
        ValueError: If the response has no choices, the first choice carries
            no text content, or the reply cannot be decoded as JSON.
    """
    load_dotenv()
    # Instantiate the OpenAI client
    client = OpenAI(api_key=os.getenv('OPENAI_KEY'))

    # Assign default fields if not provided
    if fields is None:
        fields = ["Address", "Real Estate Agency", "Price", "Beds", "Baths", "Sqft", "Home Type", "Listing Age", "Picture of home URL", "Listing URL"]

    # Define system message content (plain string: it has no placeholders)
    system_message = """You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
                        from the given text and convert it into a pure JSON format. The JSON should contain only the structured data extracted from the text, 
                        with no additional commentary, explanations, or extraneous information. 
                        You could encounter cases where you can't find the data of the fields you have to extract or the data will be in a foreign language.
                        Please process the following text and provide the output in pure JSON format with no words before or after the JSON:"""

    # Define user message content
    user_message = f"Extract the following information from the provided text:\nPage content:\n\n{data}\n\nInformation to extract: {fields}"

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        response_format={ "type": "json_object" },
        messages=[
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": user_message
            }
        ]
    )

    # Check if the response contains the expected data
    if not (response and response.choices):
        raise ValueError("The OpenAI API response did not contain the expected choices data.")

    # The message content is optional in the API schema; calling .strip() on
    # None would raise AttributeError instead of the ValueError callers expect.
    content = response.choices[0].message.content
    if content is None:
        raise ValueError("The OpenAI API response did not contain any message content.")

    formatted_data = content.strip()
    print(f"Formatted data received from API: {formatted_data}")

    try:
        return json.loads(formatted_data)
    except json.JSONDecodeError as e:
        print(f"JSON decoding error: {e}")
        print(f"Formatted data that caused the error: {formatted_data}")
        # Chain the original error so the decode position stays in the traceback.
        raise ValueError("The formatted data could not be decoded into JSON.") from e
    

def save_formatted_data(formatted_data, timestamp, output_folder='output'):
    """Save extracted data as JSON and as an Excel sheet.

    Two files are written into `output_folder` (created if missing):
    ``sorted_data_<timestamp>.json`` holding `formatted_data` unchanged, and
    ``sorted_data_<timestamp>.xlsx`` holding a tabular view of it.

    Args:
        formatted_data: JSON-serialisable object, typically a dict or a list
            of dicts.
        timestamp: String embedded in both file names.
        output_folder: Destination directory.

    Returns:
        None. Both destination paths are reported on stdout.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Save the formatted data as JSON with timestamp in filename
    output_path = os.path.join(output_folder, f'sorted_data_{timestamp}.json')

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(formatted_data, f, indent=4)
    print(f"Formatted data saved to {output_path}")

    records = formatted_data

    # A dict with exactly one key is treated as a wrapper around the real
    # payload, but only when that payload is itself a container: unwrapping
    # a bare scalar would leave nothing a DataFrame can be built from.
    if isinstance(records, dict) and len(records) == 1:
        inner = next(iter(records.values()))
        if isinstance(inner, (list, dict)):
            records = inner

    # A single record (dict) becomes a one-row table. This must happen before
    # the DataFrame is built: pandas rejects a dict of plain scalars.
    if isinstance(records, dict):
        records = [records]

    df = pd.DataFrame(records)

    # Save the DataFrame to an Excel file
    excel_output_path = os.path.join(output_folder, f'sorted_data_{timestamp}.xlsx')
    df.to_excel(excel_output_path, index=False)
    print(f"Formatted data saved to Excel at {excel_output_path}")


In [None]:
# End-to-end run for one listing page: scrape -> save raw markdown ->
# extract fields via OpenAI -> save JSON/Excel.
# NOTE: `timestamp`, `raw_data` and `formatted_data` are notebook-level names;
# later cells reassign and read `raw_data` and `timestamp`.
url = 'https://www.trulia.com/CA/San_Francisco/'

try:
    # One timestamp is generated up front and shared by both save calls,
    # so the raw and formatted files carry the same suffix.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Scrape data (raises KeyError when the result has no 'markdown' key)
    raw_data = scrape_data(url)
    
    # Save raw data (default folder: 'scraping_output')
    save_raw_data(raw_data, timestamp)
    
    # Format data using the default field list of format_data
    formatted_data = format_data(raw_data)
    
    # Save formatted data (default folder: 'output')
    save_formatted_data(formatted_data, timestamp)
except Exception as e:
    # Any failure in the steps above is reported as a single line; the
    # traceback is not shown and the remaining steps are skipped.
    print(f"An error occurred: {e}")

In [22]:
# Scrape the Estimize landing page. This rebinds the notebook-level
# `raw_data`, which the cells below print, save and pass to the LLM chain.
raw_data = scrape_data('https://www.estimize.com/')

In [28]:
# Show the scraped markdown in full, then persist it under a fresh timestamp.
print(raw_data)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
save_raw_data(raw_data, timestamp, output_folder='scraping_output')

[![Logo](https://ucarecdn.com/64603bc8-776c-4912-bb3b-25887fcc958a/)](/)

*   *   [Login](/users/sign_in)
        
    

*   *   [Login](/users/sign_in)
        
    

The Most Accurate Earnings Estimates Dataset on the Planet

Estimize crowdsources earnings and macroeconomic estimates from over 120,000 contributors across the globe. We welcome anyone to contribute in return for free access to our data and analytics tools, or you can simply pay now to subscribe. 

Learn more below

*   [Free Contributor Account](/users/sign_up)
    
*   [Premium 21-day Free Trial](/premium)
    

![](https://www.estimize.com/images/macbook-mockup-shadow.png)

*   70% Win Rate
    
    The Estimize Consensus has been closer to the company's actual reported results 70% of the time compared to legacy sell-side only estimate data sets.
    
*   2x Deeper
    
    More than 2x the number of estimates per earnings release on average, wider estimate dispersion and 3x the average number of revisions per estima

In [47]:
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage 


# Generic instruction: reply with JSON only, no surrounding commentary.
# Both prompts below are used as ChatPromptTemplate messages, so literal
# braces must be doubled ({{ and }}) to survive template formatting.
system_message_1 = """

            You are an intelligent text extraction and conversion assistant. Your task is to extract structured information 
            from the given text and convert it into a pure JSON format. 
            The JSON should contain only the structured data extracted from the text, with no additional commentary, explanations, or extraneous information. 
            You could encounter cases where you can't find the data of the fields you have to extract.
            Please process the following text and provide the output in pure JSON format with no words before or after the JSON:
            """
            
# Task-specific instruction: which facts to extract and the output schema.
# The example object must itself be well-formed JSON (note the comma between
# the "product_offering" and "partners" members), since the prompt asks the
# model to answer in correct JSON.
system_message_2 = """          
            Extract the following information from the text extracted from a webpage of a company:

            ## 1. Product offering:
               - What service or product does the company provide?
               - What features does the product or service have?

            ## 2. Client or partnership:
               - Who are the partners or clients of the company?
               - What are the clients or partners use this product for?

            Output in JSON format:
            {{
                "product_offering": {{
                    "product_1": "concise features description of the product or service",
                    "product_2": "concise features description of the product or service",
                    ...
                }},
                "partners": {{
                    "partner_name_1": "description of the usecase",
                    "partner_name_2": "description of the usecase",
                    ...
                }}
            }}

            Here are the rules that you need to adhere:
            ## Rules:
               - The aim is to achieve simplicity and clarity in the extracted text.
               - Make sure to answer in the correct JSON format.
               - If no information is provided for any of the fields, return nothing of that field.
               - DO NOT HALLUCINATE.
            """
            

# Load .env explicitly: os.getenv('OPENAI_KEY') below would otherwise only
# work if an earlier cell's function call had already loaded it.
load_dotenv()

# Two system messages (generic JSON rules + task schema) followed by the
# page text and a final reminder about the output format.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_message_1),
        ("system", system_message_2),
        ("human", "Use the given text to extract information: {input}"),
        ("human", "Tip: Make sure to answer in the correct JSON format"),
    ]
)

llm = ChatOpenAI(openai_api_key=os.getenv('OPENAI_KEY'),
                 temperature = 0, 
                 model_name = "gpt-4o")

# Prompt -> model -> JSON parser pipeline.
llm_chain = prompt | llm | SimpleJsonOutputParser()

# `raw_data` is the markdown scraped in an earlier cell.
response = llm_chain.invoke({'input': raw_data})

print(response)



{'product_offering': {'Earnings Estimates Dataset': 'Crowdsources earnings and macroeconomic estimates from over 120,000 contributors. Features include a 70% win rate, 2x deeper estimates, and over 10 years of data.', 'Estimize Platform': 'Collects opinions from a wide range of contributors using advanced behavioral and statistical algorithms. Features include anonymous contribution, give-to-get model, quality control, smart consensus, alerts, screening, and Excel files.'}, 'partners': {'University of Pennsylvania': 'Research on Estimize-covered firms meeting or beating analyst earnings expectations.', 'Temple University': 'Research on improving consensus forecast accuracy using the wisdom of crowds.', 'University of Kentucky': 'Research on reducing consensus bias and increasing consensus accuracy.', 'George Washington University': 'Research on providing a less biased and more accurate view of market expectations.', 'McKinley Capital Research': 'Research on unique and valuable pre-earn

In [51]:
# Save the response dictionary to a JSON file.
# The encoding is given explicitly, matching the other writers in this
# notebook, so the file does not depend on the platform default.
output_file = "output.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(response, f, indent=4)

print(f"Output saved to {output_file}")

Output saved to output.json
