In [1]:
import os
import re

import openai
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Initialize OpenAI API
# The key is read from the OPENAI_API_KEY environment variable so that the
# secret is never stored in (or shared with) the notebook itself.
# NOTE: any key that was ever saved in plain text in a notebook must be
# treated as compromised and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")

def fetch_website_content(url, max_chars=8000, timeout=30):
    """Download a page and return its visible text, trimmed for use in a prompt.

    Script, style, header, footer and nav elements are removed first. The text
    is then taken from the first of ``<main>``, ``<article>`` or
    ``<div class="content">`` that exists, or from the whole document when
    none of them is present.

    Args:
        url: Address of the page to fetch.
        max_chars: Maximum number of characters of cleaned text to return.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The cleaned text (at most ``max_chars`` characters), or ``None`` when
        the request or the parsing failed; the error is printed in that case.
    """
    try:
        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of analysing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script, style, header, footer, and navigation elements
        for element in soup(["script", "style", "header", "footer", "nav"]):
            element.decompose()

        # Find the main content
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')

        # Fall back to the whole document when no main container was found
        source = main_content if main_content else soup
        text = source.get_text(separator='\n', strip=True)

        # Further clean up the text: drop surrounding whitespace and empty lines
        lines = (line.strip() for line in text.splitlines())
        text = '\n'.join(line for line in lines if line)

        return text[:max_chars]
    except Exception as e:
        # Best effort by design: callers test the result for None
        print(f"Error fetching website content: {e}")
        return None

def analyze_content(content, url, model="gpt-4-1106-preview", temperature=0.1):
    """Ask the chat model to categorise a source from its scraped page text.

    A fixed system prompt requests six numbered fields (Sector/Area,
    Sub-Sector, Source name, Description, Years, Tags); the user message
    carries the URL and the scraped text.

    Args:
        content: Text extracted from the page.
        url: Address the text was scraped from (quoted in the user message).
        model: Name of the chat model to call.
        temperature: Sampling temperature passed to the API.

    Returns:
        The text of the first completion choice, or ``""`` when the API
        returned no text for it.
    """
    system_prompt = {
        "role": "system",
        "content": """
        You are an expert data analyst specialized in content analysis and keyword extraction for business research. Your task is to analyze the given content and extract relevant information to categorize the source. The content given is parsed from websites, It may have irrelevant stuff that you need to ignore. Your audience is business researchers who will use this information for market analysis and reports.

        Provide your answers in the following format:

        1. Sector/Area: (Main industry or area of focus)
        2. Sub-Sector: (More specific category within the Sector/Area)
        3. Source name: (Name of the organization or database providing the information)
        4. Description: (2-3 sentences summarizing what kind of data or information the source provides)        
        5. Years: (The range of years the data covers, in the format "YYYY; YYYY; YYYY" for specific years or "YYYY-YYYY" for ranges) If not specified, use "Not specified." ONLY.
        6. Tags: (At least 30 relevant keywords and phrases, separated by semicolons. Include a mix of general and specific terms, synonyms, related terms, and both short-tail and long-tail keywords. Consider different user intents such as informational, navigational, and transactional.)

        Guidelines:
        - For Sector/Area and Sub-Sector, be specific but not overly narrow.
        - The Description should be informative but concise, focusing on the type of data available.
        - Tags should be comprehensive and relevant for search and categorization purposes.
        - For Years, if specific years are mentioned, list them individually. If a range is given, use the range format.
        - Avoid including irrelevant information or speculation.
        """
    }

    conversation = [
        system_prompt,
        {"role": "user", "content": f"Analyze the following content from {url} and provide the requested information:\n\n{content}"}
    ]

    response = openai.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=conversation
    )

    # The message content can be None; return a string so that the result
    # can always be handed to a text parser.
    return response.choices[0].message.content or ""

def parse_llm_response(response):
    """Turn the numbered ``N. Field: value`` answer of the LLM into a dict.

    Every line containing a colon becomes one entry: the text before the
    first colon, minus a leading ``N.`` number, is the key and the rest of
    the line is the value. Lines without a colon are ignored.

    The ``Years`` entry is normalised afterwards:
      * a range keeps its text, with en/em dashes rewritten as ``-``;
      * otherwise the 4-digit years found are joined with ``"; "``;
      * a value without any 4-digit year (e.g. ``"Not specified."``) is
        kept unchanged instead of being replaced by an empty string.

    Args:
        response: Raw text returned by the model. ``None`` or an empty
            string yields an empty dict.

    Returns:
        dict mapping each field name to its stripped string value.
    """
    if not response:
        return {}

    result = {}
    for line in response.split('\n'):
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        key = key.split('.', 1)[-1].strip()  # Remove numbering and strip
        result[key] = value.strip()

    # Process Years field
    if 'Years' in result:
        # Ranges are often written with an en/em dash ("2019 – 2023"); unify
        # them to a hyphen so they are recognised as ranges below.
        years = re.sub(r'\s*[\u2013\u2014]\s*', '-', result['Years'])
        if '-' in years:
            # If it's a range, keep it as is
            result['Years'] = years
        else:
            # If it's a list of years, ensure they are separated by semicolons
            found = re.findall(r'\d{4}', years)
            if found:
                result['Years'] = '; '.join(found)
            # No year at all: leave the model's wording untouched

    return result

def check_link_exists(df, link):
    """Tell whether ``link`` already appears in the ``Link`` column of ``df``."""
    is_same_link = df['Link'].eq(link)
    return bool(is_same_link.any())

def process_link(df, link):
    """Run the full pipeline for one URL: duplicate check, scrape, analyse, parse.

    Args:
        df: DataFrame of already known sources; its ``Link`` column is checked.
        link: URL of the candidate source.

    Returns:
        ``{"error": ...}`` when the link is already known or the page could
        not be fetched; otherwise the parsed LLM fields plus the URL under
        the ``'link'`` key.
    """
    # Guard 1: nothing to do for a link that is already in the database
    if check_link_exists(df, link):
        return {"error": "Link already exists in the database"}

    # Guard 2: no (or empty) page text means there is nothing to analyse
    page_text = fetch_website_content(link)
    if not page_text:
        return {"error": "Failed to fetch website content"}

    record = parse_llm_response(analyze_content(page_text, link))
    # NOTE(review): the key is lowercase 'link' while the DataFrame column is 'Link'
    record['link'] = link
    return record

In [3]:
# Load the existing source database (Excel workbook) into a DataFrame.
# NOTE(review): absolute, user-specific path — this cell only runs on a machine
# that has this exact file location; consider a configurable path.
df = pd.read_excel(r'C:\Users\Youssef Moutaouakkil\Desktop\Github\BCG_Search_Website\bcg-knowledge-search-tool\Backend\Database.xlsx')

# Show the column names; 'Link' is the column read by check_link_exists().
df.columns

Index(['PA classification', 'Sector/Area', 'Sub-Sector', 'Source name',
       'Description', 'Type (General DB, specialized, ...)', 'Free/Paid?',
       'Geography', 'Regional data', 'Country data',
       'Frequency cover harmonized for all geos ? ', 'Frequency ', 'Years',
       'Tags', 'Format ', 'Reliability score (1-10) ', 'Link'],
      dtype='object')

In [2]:
# Let's test the functions one by one, using this link: https://www.oica.net/category/sales-statistics/

# Fetch website content
url = "https://www.oica.net/category/sales-statistics/"
content = fetch_website_content(url)

# Show the cleaned text that will be sent to the LLM
print(content)

www.oica.net
>
Sales Statistics
Global Sales Statistics 2019 – 2023
SALES OF NEW VEHICLES 2019-2023
/
Passengers Cars
/
Commercial vehicles
/
All vehicles
Overview
All the data available at OICA are included here.
For more details, please contact the respective countries or the individual OICA member associations directly
.
The OICA secretariat does not have any further data.
These data are gathered in cooperation with
Ward’s
(for the American Continent) and
Fourin
(for the Asian Continent).
Definitions


In [3]:
# Ask the LLM to categorise the scraped text; the raw answer is kept in `text`.
text = analyze_content(content, url)

In [4]:
# Show the raw LLM answer before it is parsed.
print(text)

1. Sector/Area: Automotive Industry
2. Sub-Sector: Automotive Sales Statistics
3. Source name: International Organization of Motor Vehicle Manufacturers (OICA)
4. Description: OICA provides global sales statistics for new vehicles, including passenger cars, commercial vehicles, and the total of all vehicles. The data is collected in cooperation with Ward's for the American continent and Fourin for the Asian continent.
5. Years: 2019-2023
6. Tags: OICA; global vehicle sales; automotive sales data; passenger cars sales statistics; commercial vehicles sales statistics; vehicle sales trends; automotive industry analysis; new vehicle sales; automotive market research; Ward's automotive data; Fourin automotive data; automotive sales reports; vehicle sales data; car sales statistics; truck sales statistics; automotive sales figures; market analysis automotive; international vehicle sales; automotive sales tracking; sales of new vehicles; automotive sector statistics; vehicle industry sales da

In [5]:
# Parse the raw answer into a field -> value dict (displayed as the cell output).
parse_llm_response(text)

{'Sector/Area': 'Automotive Industry',
 'Sub-Sector': 'Automotive Sales Statistics',
 'Source name': 'International Organization of Motor Vehicle Manufacturers (OICA)',
 'Description': "OICA provides global sales statistics for new vehicles, including passenger cars, commercial vehicles, and the total of all vehicles. The data is collected in cooperation with Ward's for the American continent and Fourin for the Asian continent.",
 'Years': '2019-2023',
 'Tags': "OICA; global vehicle sales; automotive sales data; passenger cars sales statistics; commercial vehicles sales statistics; vehicle sales trends; automotive industry analysis; new vehicle sales; automotive market research; Ward's automotive data; Fourin automotive data; automotive sales reports; vehicle sales data; car sales statistics; truck sales statistics; automotive sales figures; market analysis automotive; international vehicle sales; automotive sales tracking; sales of new vehicles; automotive sector statistics; vehicle i

In [4]:
# Check the duplicate-link guard of process_link() against the Excel database.
# NOTE(review): same absolute, user-specific path as the earlier load cell.
df = pd.read_excel(r"C:\Users\Youssef Moutaouakkil\Desktop\Github\BCG_Search_Website\bcg-knowledge-search-tool\Backend\Database.xlsx")
link = "https://www.oica.net/category/sales-statistics/"


process_link(df, link)

{'error': 'Link already exists in the database'}

In [7]:
link = "https://ec.europa.eu/eurostat/web/structural-business-statistics/database"

# Look at what the scraper extracts from this page before sending it to the LLM.
# Note: this overwrites the `content` variable fetched earlier for the OICA page.
content = fetch_website_content(link)
print(content)

Database - Eurostat
Structural business statistics
Database
Site Map
Data navigation tree
Hidden
Close
Your feedback was sent
Thank you for your feedback.
Thank you for the information. We will investigate the issue.
Was this page useful?
Yes
✓
No
If you do not wish to provide more detailed feedback, please just click on the “Submit” button to send your response.
Click this radio box.
What type of issue would you like to report?
(Optional)
Select type
There is a technical problem with this page
I cannot find the information I am looking for
Other
Show / hide list
Enter your name
Please describe the issue
(Optional)
Please do not include any personal information
300
/300 characters remaining
I accept the terms & conditions.
Submit


In [8]:
# Categorise the scraped Eurostat text and show the raw LLM answer.
text_2 = analyze_content(content, link)

print(text_2)

1. Sector/Area: Economics and Statistics
2. Sub-Sector: Structural Business Statistics
3. Source name: Eurostat
4. Description: Eurostat's database provides comprehensive statistics on the structure, conduct, and performance of businesses across the European Union. It includes data on business demographics, financial operations, and sectoral performance, which can be used for market analysis and economic research.
5. Years: Not specified.
6. Tags: Eurostat; Structural Business Statistics; European Union; Business Demographics; Financial Operations; Sectoral Performance; Market Analysis; Economic Research; Business Data; EU Statistics; Business Conduct; Business Performance; Statistical Database; Business Sector Data; Economic Data; Business Research; Business Statistics; Eurostat Database; Economic Analysis; Industry Analysis; Business Operations; Company Statistics; Business Environment; European Market Data; Business Trends; Economic Indicators; Business Analysis; Statistical Analysi

In [None]:
import requests

# Fetch this page directly (without fetch_website_content) to see what the raw response looks like: "https://ec.europa.eu/eurostat/web/structural-business-statistics/database"

url = "https://ec.europa.eu/eurostat/web/structural-business-statistics/database"
response = requests.get(url)
content = response.text

# NOTE(review): this prints the complete HTML document; consider printing only a slice.
print(content)

In [21]:
import nest_asyncio
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

# Patch asyncio to allow nested use of asyncio.run(), which is needed when an
# event loop is already running (as in a notebook kernel).
nest_asyncio.apply()

async def fetch_website_playwright(url, max_chars=10000):
    """Render a page in headless Chromium and return its visible text.

    The page is loaded with Playwright so that content produced by
    JavaScript is included. The rendered HTML is then stripped of script
    and style elements and reduced to non-empty text lines.

    Args:
        url: Address of the page to render.
        max_chars: Maximum number of characters of cleaned text to return.

    Returns:
        The cleaned text (at most ``max_chars`` characters), or ``None`` when
        rendering or parsing failed; the error is printed in that case.
    """
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            try:
                page = await browser.new_page()
                await page.goto(url)

                # Wait for the content to load
                await page.wait_for_load_state('networkidle')

                # Get the full HTML content
                content = await page.content()
            finally:
                # Close the browser even when navigation or loading fails
                await browser.close()

        # Parse the content with BeautifulSoup
        soup = BeautifulSoup(content, 'html.parser')

        # Remove script and style elements
        for element in soup(["script", "style"]):
            element.decompose()

        # Get text from the body; a document without <body> has soup.body set
        # to None, so fall back to the whole document in that case
        root = soup.body if soup.body is not None else soup
        text = root.get_text(separator='\n', strip=True)

        # Further clean up the text: drop surrounding whitespace and empty lines
        lines = (line.strip() for line in text.splitlines())
        text = '\n'.join(line for line in lines if line)

        return text[:max_chars]
    except Exception as e:
        # Some exceptions (e.g. NotImplementedError) carry no message, so the
        # exception type is printed as well to keep the failure diagnosable.
        print(f"Error fetching website content: {type(e).__name__}: {e}")
        return None

# Example usage
# NOTE(review): the recorded output of this cell shows the call failing with a
# NotImplementedError raised from asyncio.create_subprocess_exec while
# Playwright starts its driver; presumably the event loop in use on this
# Windows setup does not support subprocesses — confirm before relying on it.
url = "https://ec.europa.eu/eurostat/web/structural-business-statistics/database"
content = asyncio.run(fetch_website_playwright(url))

print(content)

Task exception was never retrieved
future: <Task finished name='Task-2' coro=<Connection.run() done, defined at c:\Users\Youssef Moutaouakkil\.conda\envs\notebooksBaisc\Lib\site-packages\playwright\_impl\_connection.py:265> exception=NotImplementedError()>
Traceback (most recent call last):
  File "c:\Users\Youssef Moutaouakkil\.conda\envs\notebooksBaisc\Lib\asyncio\tasks.py", line 314, in __step_run_and_handle_result
    result = coro.send(None)
             ^^^^^^^^^^^^^^^
  File "c:\Users\Youssef Moutaouakkil\.conda\envs\notebooksBaisc\Lib\site-packages\playwright\_impl\_connection.py", line 272, in run
    await self._transport.connect()
  File "c:\Users\Youssef Moutaouakkil\.conda\envs\notebooksBaisc\Lib\site-packages\playwright\_impl\_transport.py", line 133, in connect
    raise exc
  File "c:\Users\Youssef Moutaouakkil\.conda\envs\notebooksBaisc\Lib\site-packages\playwright\_impl\_transport.py", line 120, in connect
    self._proc = await asyncio.create_subprocess_exec(
       

Error fetching website content: 
None


Error fetching website content: 'PlaywrightContextManager' object does not support the asynchronous context manager protocol
None


  content = await fetch_website_playwright(url)
