In [1]:
%load_ext autoreload
%autoreload 2 

In [2]:
import os
os.chdir('/home/gops/work/market_analysis')
import pandas as pd
from src.core.scraper import WebScraper
from src.core.content_analyzer import ContentAnalyzer
from src.core.data_processer import DataProcessor
from src.utils.rate_limiter import RateLimiter
# from src.utils.cache import AnalysisCache


In [3]:
# Initialize Components
scraper = WebScraper()
processor = DataProcessor()
analyzer = ContentAnalyzer()
rate_limiter = RateLimiter()


In [5]:
urls= processor.read_excel_to_url('input/test.xlsx')

In [6]:
sample_urls= urls[1:2]

In [7]:
processed_urls = []
for url in sample_urls:
    try:
        clean_url = processor.clean_url(url)
        processed_urls.append(clean_url)
        print(f"Processed URL: {clean_url}")
    except Exception as e:
        print(f"Error processing URL {url}: {e}")

Processed URL: https://sanjosebengalcats.com


In [8]:
scraped_data = {}
for url in processed_urls:
    try:
        rate_limiter.wait()
        data = scraper.scrape_website(url)
        scraped_data[url] = data
        print(f"Scraped data from {url}: {data['metadata']}")
    except Exception as e:
        print(f"Error scraping {url}: {e}")

Scraped data from https://sanjosebengalcats.com: {'title': 'Bengal Kittens & Cats For Sale | San Jose Bengal Cats', 'meta_description': 'Bengal Cats are the most exotic and adored cats among families and friends for their cuteness and uniqueness. It is one of the best Cats breeders in CA.'}


In [9]:
scraped_data[url]

{'content': "Email: rene@sanjosebengalcats.com Phone: (408) 478-8707 Home About Bengal cat History Are Bengal Cats Family Friendly? Asian Leopard Cat Gallery Past Litters View our Kittens Bengal Adults for Sale Bengal Kittens for sale Bringing Your Kitten Home Studs and Queens Are Bengal Cats smarter than other Cat Breeds? Studs Queens Bengals & Dogs Bengal kitten pricing Blog Contact Us Home About Bengal cat History Are Bengal Cats Family Friendly? Asian Leopard Cat Gallery Past Litters View our Kittens Bengal Adults for Sale Bengal Kittens for sale Bringing Your Kitten Home Studs and Queens Are Bengal Cats smarter than other Cat Breeds? Studs Queens Bengals & Dogs Bengal kitten pricing Blog Contact Us Bengal Kittens san jose bengal cats Prev Next About Bengals All of our Bengal kittens and Bengal cats include vacc- inations, dewormings and 2 Year genetic health guar- antee, Pedigree Paperwork and health checked by a licensed veterinarian. Click Here to See Our BENGAL KITTENS CLICK HE

In [10]:
analysis_results = {}
for url, data in scraped_data.items():
    try:
        if data['content']:
            analysis = analyzer.analyze_with_ollama(data['content'], url)
            analysis_results[url] = analysis
            print(f"Analysis for {url}: {analysis}")
        else:
            print(f"No content to analyze for {url}")
    except Exception as e:
        print(f"Error analyzing {url}: {e}")
        

Analysis for https://sanjosebengalcats.com: {'keywords': ['Bengal cat breeders', 'exotic feline companions for sale', 'intelligent domestic cats for adoption', 'affectionate Bengal kittens for sale', 'retired adult Bengal Cats for forever homes'], 'business_name': 'San Jose Bengal Cats', 'products_services': 'Bengal kittens, Bengal adults, Pedigree Paperwork, vaccinations, dewormings, 2 Year genetic health guarantee', 'target_audience': 'Individuals and families seeking unique and affectionate feline companions for their homes.', 'location': 'San Jose, United States', 'headers': [], 'emails': ['rene@sanjosebengalcats.com'], 'phones': []}


In [11]:
analysis_results

{'https://sanjosebengalcats.com': {'keywords': ['Bengal cat breeders',
   'exotic feline companions for sale',
   'intelligent domestic cats for adoption',
   'affectionate Bengal kittens for sale',
   'retired adult Bengal Cats for forever homes'],
  'business_name': 'San Jose Bengal Cats',
  'products_services': 'Bengal kittens, Bengal adults, Pedigree Paperwork, vaccinations, dewormings, 2 Year genetic health guarantee',
  'target_audience': 'Individuals and families seeking unique and affectionate feline companions for their homes.',
  'location': 'San Jose, United States',
  'headers': [],
  'emails': ['rene@sanjosebengalcats.com'],
  'phones': []}}

In [17]:
url='https://sanjosebengalcats.com'
analysis_results[url]

KeyError: 'https://sanjosebengalcats.com'

In [16]:
analyzer.analyze_with_ollama(analysis_results[url], url)

KeyError: 'https://sanjosebengalcats.com'

In [26]:
formatted_prompt

'\nAnalyze the website content as a marketing expert and extract detailed information in JSON format.\nOnly respond with the JSON, no other text.\n\nFocus on these key aspects:\n1. Keywords: Identify at least five detailed, long-tail keywords related to the product or service.\n2. Business Name: Extract the specific name of the business.\n3. Products or Services: Identify and list the products or services mentioned. The list should be simple, clear, and concise, separated by commas.\n   Example: "Bengal cats, Bengal kittens, Bengal adult cats."\n4. Target Audience: Summarize the target audience in a focused and specific statement, including their demographics and intent.\n   Example: "Individuals and families seeking rental apartments in Augusta, GA."\n\nEnsure the JSON format is valid and does not include newlines, leading spaces, or any invalid characters.\n\nRequired format:\n[\n    "keywords": [\n        "detailed keyword phrase one",\n        "detailed keyword phrase two",\n      

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 33)

In [19]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode, urljoin

class GoogleSearchScraper:
    def __init__(self, keyword, location):
        self.keyword = keyword
        self.location = location
        self.base_url = "https://www.google.com/search"
    
    def build_query(self):
        # Construct query with keyword and location
        query = f"{self.keyword} in {self.location}"
        return urlencode({'q': query})
    
    def fetch_results(self):
        query = self.build_query()
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
        url = f"{self.base_url}?{query}"
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text
    
    def parse_results(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        results = []
        for result in soup.find_all('div', class_='tF2Cxc'):  # Div for each search result
            title_tag = result.find('h3')
            link_tag = result.find('a')
            
            if title_tag and link_tag:
                title = title_tag.get_text()
                link = link_tag['href']
                results.append({'name': title, 'url': link})
        return results
    
    def run(self):
        html_content = self.fetch_results()
        return self.parse_results(html_content)



In [20]:
keyword = "best restaurants"
location = "San Francisco"
scraper = GoogleSearchScraper(keyword, location)
business_names = scraper.run()
print("Top Business Names:")
for idx, name in enumerate(business_names, start=1):
    print(f"{idx}. {name}")

Top Business Names:
