In [None]:
# !pip list

Name: playwright
Version: 1.52.0
Summary: A high-level API to automate web browsers
Home-page: https://github.com/Microsoft/playwright-python
Author: Microsoft Corporation
Author-email: 
License-Expression: Apache-2.0
Location: C:\Users\Kurt Salapare\Desktop\TCS_1\Research Project\research-project-source-code\venv\Lib\site-packages
Requires: greenlet, pyee
Required-by: Crawl4AI, tf-playwright-stealth


**Testing Ollama Qwen Model**

Trying simple extraction of just the body of a site

In [None]:
import ollama

def run_qwen_ollama(prompt_text):
  """
  Sends a prompt to the Qwen model in Ollama and prints the response.

  Args:
    prompt_text: The text prompt to send to the model.
  """
  try:
    # Initialize the Ollama client
    client = ollama.Client()

    # --- Using the chat API (recommended for conversational turns) ---
    print("--- Using ollama.chat ---")
    messages = [
        {'role': 'user', 'content': prompt_text},
    ]
    response_chat = client.chat(model='qwen:7b', messages=messages)
    print("Chat Response:")
    print(response_chat['message']['content'])

    print("\n" + "="*30 + "\n") # Separator

    # --- Using the generate API (simpler for single prompts) ---
    print("--- Using ollama.generate ---")
    response_generate = client.generate(model='qwen:7b', prompt=prompt_text)
    print("Generate Response:")
    print(response_generate['response'])


  except ollama.ResponseError as e:
    print(f"Error interacting with Ollama: {e}")
    print("Please ensure Ollama is running and the 'qwen:7b' model is pulled.")  
  except Exception as e:
    print(f"An unexpected error occurred: {e}")


# --- Example Usage ---
if __name__ == "__main__":
  my_prompt = """
  Look at the website attached to this url: https://venturebeat.com/ai/a-chevy-for-1-car-dealer-chatbots-show-perils-of-ai-for-customer-service/, 
  and return all the paragraphs from the website.
  """
  run_qwen_ollama(my_prompt)

--- Using ollama.chat ---
Chat Response:
As an AI language model, I cannot access external websites directly. However, based on the provided URL, I can summarize the paragraphs from the website:

1. Introduction to the topic:
   "AI and chatbots are increasingly being used in customer service for automotive dealerships. This article highlights the potential perils of using AI for customer service in this context."

2. Example: A Chevrolet for $1
   "The example given is a fictional car listing on a dealership's website. The AI chatbot would assist customers by asking questions about their preferences and budget. However, the headline 'A Chevrolet for $1' implies a seemingly unreasonable price, which could be an AI-generated error."

3. Risks and perils of AI in customer service:
   "Using AI in customer service can bring efficiency and personalized experiences. However, it also poses risks such as miscommunication or errors due to AI's reliance on algorithms and data inputs."

4. Concl

***Testing With Beautiful Soup***

Scraping with beautifulsoup python lib

In [4]:
from bs4 import BeautifulSoup as bsoup
import requests

headers = {
    'User Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
    'Referer' : 'https://hiddenlayer.com/innovation-hub/novel-universal-bypass-for-all-major-llms/'
}

page_to_scrape = requests.get("https://hiddenlayer.com/innovation-hub/novel-universal-bypass-for-all-major-llms/", headers=headers)
html_soup = bsoup(page_to_scrape.text, "html.parser")
print(html_soup)
# soup = bsoup(html_soup, "lxml") 
# div_tag = html_soup.find("div", class_="entry-content-wrapper")
# paragraphs = [p_tag.get_text() for p_tag in div_tag.find_all("p")]
# print(paragraphs)

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>



***Testing with Crawl4ai***

Webscraping with Crawl4ai

In [15]:
import asyncio
# Ensure crawl4ai is imported correctly as per previous corrections
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
import ollama

# --- Configuration ---
OLLAMA_MODEL = 'qwen:7b' # Or 'qwen:latest', 'qwen:14b', etc.
MAX_LLM_INPUT_CHARS = 15000 # Max characters for LLM input to avoid context window issues
                             # Adjust based on your Qwen model's actual context window.

# --- 1. Function to crawl the URL and get text content using crawl4ai (Remains the same) ---
async def get_webpage_content_with_crawl4ai(url: str):
    """
    Crawls a given URL using crawl4ai and returns its raw Markdown content.
    """
    browser_conf = BrowserConfig(
        browser_type="chromium", # Use "chromium" for Chrome/Chromium-based browsers
        headless=False,          # Set to True for running without a visible browser window
        verbose=True             # Set to True for more detailed logging from the browser
    )
    run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    print(f"[Crawl4AI] Starting crawl for: {url}")
    # try:
    #     print('RESULT UNDER')
    #     async with AsyncWebCrawler(config=browser_conf) as crawler:
    #         result = await crawler.arun(url=url, config=run_conf)
    #         print(result)

    #     if result.success:
    #         print(f"[Crawl4AI] Successfully crawled {url}. Returning markdown content.")
    #         return result.markdown_v2.raw_markdown
    #     else:
    #         print(f"[Crawl4AI] Crawl failed for {url}: {result.error_message}")
    #         return None
    # except Exception as e:
    #     print(f"[Crawl4AI] An unexpected error during crawl4ai execution for {url}: {e}")
    #     return None
    
    print('RESULT UNDER')
    async with AsyncWebCrawler(config=browser_conf) as crawler:
        result = await crawler.arun(url=url, config=run_conf)
        print(result)

    if result.success:
        print(f"[Crawl4AI] Successfully crawled {url}. Returning markdown content.")
        return result.markdown_v2.raw_markdown
    else:
        print(f"[Crawl4AI] Crawl failed for {url}: {result.error_message}")
        return None
    



In [4]:
# --- 2. Function to process text with Ollama/Qwen (MODIFIED for plain string output) ---
async def extract_paragraphs_with_qwen(text_content: str, model: str = OLLAMA_MODEL) -> list[str]:
    """
    Uses Ollama/Qwen to extract only coherent text paragraphs from the given content.
    Instructs Qwen to return paragraphs separated by two newlines, then splits them.
    """
    if not text_content:
        return []

    client = ollama.Client()

    # Limit the input text length to fit within the LLM's context window
    if len(text_content) > MAX_LLM_INPUT_CHARS:
        print(f"[Qwen] Truncating content to {MAX_LLM_INPUT_CHARS} characters for LLM processing.")
        text_content = text_content[:MAX_LLM_INPUT_CHARS]

    # Adjusted Prompt: Asking for newline-separated paragraphs, no JSON format
    prompt_messages = [
        {'role': 'system', 'content': 'You are an expert web content extractor. Your task is to identify and extract only the main, coherent text paragraphs from the provided web page content. Exclude all non-paragraph elements such as headers, footers, navigation links, advertisements, image captions, code blocks, or short, disconnected phrases. Return the extracted paragraphs as a single string, with each paragraph separated by exactly two newline characters (`\n\n`). Do NOT include any introductory or concluding remarks, explanations, or additional text from yourself.'},
        {'role': 'user', 'content': f"Extract all text paragraphs from the following web page content:\n\n{text_content}"}
    ]

    print(f"[Qwen] Sending content to {model} for paragraph extraction (plain text response expected)...")
    try:
        # Removed format='json' from here
        response = await asyncio.to_thread(
            lambda: client.chat(model=model, messages=prompt_messages, options={'temperature': 0.1})
        )

        qwen_output_str = response['message']['content']
        print(f"[Qwen] Received raw response from {model}. Splitting into paragraphs.")

        # Split the string by two newlines and strip whitespace from each part
        # Filter out empty strings that might result from extra newlines or no content
        paragraphs_list = [p.strip() for p in qwen_output_str.split('\n\n') if p.strip()]

        return paragraphs_list

    except ollama.ResponseError as e:
        print(f"[Qwen] Error interacting with Ollama ({model}): {e}")
        print("Please ensure Ollama is running and the model is pulled.")
        return []
    except Exception as e:
        print(f"[Qwen] An unexpected error occurred during Ollama processing: {e}")
        return []



In [None]:
# --- Main execution flow ---
async def main():
    # --- IMPORTANT: Replace with the actual URL you want to scrape ---
    # Always check the website's robots.txt and terms of service before scraping.
    # target_url = "https://www.theverge.com/2024/5/15/24157147/openai-gpt-4o-voice-mode-safety-concerns"
    # Example for a more structured site:
    target_url = "https://www.bbc.com/news/articles/cn9j1052p2yo"

    print(f"Starting web scraping and paragraph extraction for: {target_url}")

    # Step 1: Crawl the website to get its content
    web_content = await get_webpage_content_with_crawl4ai(target_url)

    if web_content:
        # Step 2: Use Qwen to extract paragraphs from the crawled content
        extracted_paragraphs = await extract_paragraphs_with_qwen(web_content, model=OLLAMA_MODEL)

        print("\n" + "="*30 + "\n")
        print("--- Final Extracted Paragraphs ---")
        if extracted_paragraphs:
            print(f"Successfully extracted {len(extracted_paragraphs)} paragraphs:")
            for i, paragraph in enumerate(extracted_paragraphs):
                print(f"--- Paragraph {i+1} ---")
                print(paragraph)
                print("-" * 20) # Simple separator
            print("\n--- Process Completed ---")
            return extracted_paragraphs
        else:
            print("Qwen processing returned no paragraphs.")
    else:
        print("Failed to get web page content with crawl4ai.")

    return []

await main()

In [None]:
import asyncio
# Corrected import statement
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json

async def main():
    # 1. Browser config
    browser_cfg = BrowserConfig(
        browser_type="chromium", # Or "chromium" for Chrome
        headless=True
    )


    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(
            url="https://www.bbc.com/news/articles/cn9j1052p2yo", # Replace with your target URL
            config=run_cfg
        )

        if result.success:
            print("Cleaned HTML length:", len(result.cleaned_html))
            if result.extracted_content:
                articles = json.loads(result.extracted_content)
                print("Extracted articles:", articles[:2])
            else:
                print("No content extracted despite success.")
        else:
            print("Error:", result.error_message)

# --- How to run it in a Jupyter Notebook cell ---
# Simply await the main() function directly
await main()

In [None]:
import nest_asyncio

nest_asyncio.apply()

import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json

async def main():
    # 1. Browser config
    browser_cfg = BrowserConfig(
        browser_type="chromium",
        headless=False,
        verbose=True
    )

    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(
            url="https://www.bbc.com/news/articles/cvgddel17kvo",
            config=run_cfg
        )

        if result.success:
            print("Cleaned HTML length:", len(result.cleaned_html))
            print(result.cleaned_html)
            print(result)
            if result.extracted_content:
                articles = json.loads(result.extracted_content)
                print("Extracted articles:", articles[:2])
        else:
            print("Error:", result.error_message)

asyncio.run(main())

Task exception was never retrieved
future: <Task finished name='Task-8' coro=<Connection.run() done, defined at c:\Users\Kurt Salapare\Desktop\TCS_1\Research Project\research-project-source-code\venv\Lib\site-packages\playwright\_impl\_connection.py:272> exception=NotImplementedError()>
Traceback (most recent call last):
  File "C:\Users\Kurt Salapare\AppData\Local\Programs\Python\Python312\Lib\asyncio\tasks.py", line 314, in __step_run_and_handle_result
    result = coro.send(None)
             ^^^^^^^^^^^^^^^
  File "c:\Users\Kurt Salapare\Desktop\TCS_1\Research Project\research-project-source-code\venv\Lib\site-packages\playwright\_impl\_connection.py", line 279, in run
    await self._transport.connect()
  File "c:\Users\Kurt Salapare\Desktop\TCS_1\Research Project\research-project-source-code\venv\Lib\site-packages\playwright\_impl\_transport.py", line 133, in connect
    raise exc
  File "c:\Users\Kurt Salapare\Desktop\TCS_1\Research Project\research-project-source-code\venv\Lib\

NotImplementedError: 