In [1]:
!pwd

/home/gops/work/market_analysis/notebooks


In [2]:
%load_ext autoreload
%autoreload 2 

In [3]:
import os 
os.chdir('/home/gops/work/market_analysis')

In [4]:
import sys
import os
import pandas as pd

from src.core.scraper import WebScraper
from src.core.content_analyzer import ContentAnalyzer
from src.core.data_processer import DataProcessor
from src.core.advanced_analytics import AdvancedAnalytics
from src.utils.rate_limiter import RateLimiter
import datetime

In [5]:
# Initialize Components: one instance of each class, built with default
# constructor arguments. These module-level instances are reused by the
# cells below (URL cleaning, scraping, content analysis, competitor lookup).
scraper = WebScraper()
processor = DataProcessor()
analyzer = ContentAnalyzer()
rate_limiter = RateLimiter()
advance_analyzer = AdvancedAnalytics()

In [6]:
urls=processor.read_excel_to_url(file_path='/home/gops/work/market_analysis/input/test.xlsx')

In [7]:
# Restrict the run to a single-entry sample: only the item at index 1.
# Slicing (rather than indexing) keeps the result iterable for the loops below.
sample_urls= urls[1:2]

In [8]:
# Pass every sampled URL through processor.clean_url and collect the results
# in processed_urls, in input order.
processed_urls = []
for url in sample_urls:
    try:
        clean_url = processor.clean_url(url)
        processed_urls.append(clean_url)
        print(f"Processed URL: {clean_url}")
    except Exception as e:
        # Best-effort: report the failing URL and keep going with the rest.
        print(f"Error processing URL {url}: {e}")

Processed URL: https://sanjosebengalcats.com


In [9]:
# Scrape each cleaned URL and store the result in scraped_data, keyed by URL.
scraped_data = {}
for url in processed_urls:
    try:
        # rate_limiter.wait() is invoked before every scrape call.
        rate_limiter.wait()
        data = scraper.scrape_website(url)
        scraped_data[url] = data
        # Only the 'metadata' entry is printed; a missing key lands in the except below.
        print(f"Scraped data from {url}: {data['metadata']}")
    except Exception as e:
        # Best-effort: a failing site is reported and skipped, not retried.
        print(f"Error scraping {url}: {e}")

Scraped data from https://sanjosebengalcats.com: {'title': 'Bengal Kittens & Cats For Sale | San Jose Bengal Cats', 'meta_description': 'Bengal Cats are the most exotic and adored cats among families and friends for their cuteness and uniqueness. It is one of the best Cats breeders in CA.'}


In [10]:
# Run analyzer.analyze_with_ollama on every scraped page that has non-empty
# 'content' and store the result in analysis_results, keyed by URL.
analysis_results = {}
for url, data in scraped_data.items():
    try:
        # Pages whose 'content' is empty/falsy are reported and left out of the results.
        if data['content']:
            analysis = analyzer.analyze_with_ollama(data['content'], url)
            analysis_results[url] = analysis
            print(f"Analysis for {url}: {analysis}")
        else:
            print(f"No content to analyze for {url}")
    except Exception as e:
        # Best-effort: report the failure and continue with the next URL.
        print(f"Error analyzing {url}: {e}")
# Bare last expression so the notebook also renders the full dict.
analysis_results

Analysis for https://sanjosebengalcats.com: {'keywords': ['bengal cats for sale san jose', 'san jose bengal cat breeders', 'exotic feline companion near me', 'bengal kittens prices new york los angeles miami', 'intelligent domestic cats breed'], 'business_name': 'San Jose Bengal Cats', 'products_services': 'Bengal kittens, Bengal adult cats, pedigree paperwork, health checked by licensed veterinarian', 'target_audience': 'Individuals and families seeking exotic feline companions, specifically those in New York, Los Angeles, Miami, and beyond, with a focus on first-time owners and Bengal enthusiasts.', 'location': 'San Jose', 'headers': [], 'emails': ['rene@sanjosebengalcats.com'], 'phones': []}


{'https://sanjosebengalcats.com': {'keywords': ['bengal cats for sale san jose',
   'san jose bengal cat breeders',
   'exotic feline companion near me',
   'bengal kittens prices new york los angeles miami',
   'intelligent domestic cats breed'],
  'business_name': 'San Jose Bengal Cats',
  'products_services': 'Bengal kittens, Bengal adult cats, pedigree paperwork, health checked by licensed veterinarian',
  'target_audience': 'Individuals and families seeking exotic feline companions, specifically those in New York, Los Angeles, Miami, and beyond, with a focus on first-time owners and Bengal enthusiasts.',
  'location': 'San Jose',
  'headers': [],
  'emails': ['rene@sanjosebengalcats.com'],
  'phones': []}}

In [11]:
# For each analysed site, look up competitors using its first keyword and its location.
for url, analysis_data in analysis_results.items():
    try:
        # Despite the plural name, only the first entry of the 'keywords' list is used;
        # an empty list raises IndexError, which is handled by the except below.
        keywords = analysis_data['keywords'][0]
        location = analysis_data['location']
        print(f"Keywords for {url}: {keywords} | Location: {location}")
        # `result` is overwritten on every iteration; only the last site's value survives the loop.
        result=advance_analyzer.find_top_competitors(keywords, location,origin_url=url,pages=1)
        print(result)
    except Exception as e:
        # NOTE(review): this also catches failures raised inside find_top_competitors,
        # not just the keyword/location lookups the message mentions.
        print(f"Error extracting keywords for {url}: {e}")

Keywords for https://sanjosebengalcats.com: bengal cats for sale san jose | Location: San Jose
['https://support.google.com', 'https://www.google.com', 'https://www.google.com', 'https://www.google.co.in', 'https://accounts.google.com']


In [22]:
from urllib.parse import urlparse

def clean_and_filter_urls_test(urls, origin_url):
    """Reduce raw result links to at most five unique competitor base URLs.

    Each link is collapsed to ``scheme://host``. A link is dropped when:

    * it has no host (relative hrefs, fragments, ``mailto:`` links, ...),
    * its host contains one of the excluded names (google, facebook, yelp,
      instagram),
    * its base URL was already collected, or
    * its base URL is the origin site itself.

    The origin is reduced to ``scheme://host`` before comparing, so passing
    ``https://example.com/`` or ``https://example.com/some/page`` excludes
    ``https://example.com`` as intended. Hosts are compared case-insensitively.

    Args:
        urls: Iterable of URL strings, in ranking order.
        origin_url: URL of the site whose competitors are being collected;
            may be empty or None to disable the origin filter.

    Returns:
        List of up to five base URLs, in first-seen order.
    """
    excluded_domains = {"google", "facebook", "yelp", "instagram"}
    max_results = 5

    # Normalize the origin the same way candidates are normalized; a raw string
    # comparison would never match an origin that carries a trailing slash or path.
    origin_base = None
    if origin_url:
        origin_parsed = urlparse(origin_url)
        if origin_parsed.netloc:
            origin_base = f"{origin_parsed.scheme}://{origin_parsed.netloc}".lower()
        else:
            # No scheme given (e.g. "example.com/"): fall back to the trimmed string.
            origin_base = origin_url.rstrip("/").lower()

    seen = set()
    base_urls = []

    for url in urls:
        parsed = urlparse(url)
        host = parsed.netloc.lower()

        # Links without a host would otherwise yield the meaningless base "://".
        if not host:
            continue

        # Skip URLs with excluded domains (substring match on the host).
        if any(domain in host for domain in excluded_domains):
            continue

        base_url = f"{parsed.scheme}://{parsed.netloc}"
        key = base_url.lower()
        if key in seen or key == origin_base:
            continue

        seen.add(key)
        base_urls.append(base_url)

        # Stop as soon as enough unique competitors have been collected.
        if len(base_urls) == max_results:
            break

    return base_urls


In [14]:
all_links=advance_analyzer.fetch_google_results('Bengal cats for sale', 'US', pages=1)

In [23]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import json
import time
clean_and_filter_urls_test(all_links,'https://sanjosebengalcats.com/')

['https://wildnsweetbengals.com',
 'https://www.thebengalcats.com',
 'https://www.royalbengalcattery.com',
 'https://www.belleamibengals.com',
 'https://kittens-bengal.com']

In [26]:
advance_analyzer.check_gmb_setup('https://www.golgix.com/')

True

In [27]:
# NOTE(review): the recorded output of this call is the message
# "name 're' is not defined", which suggests the module behind AdvancedAnalytics
# (imported from src.core.advanced_analytics) is missing `import re` — confirm there.
advance_analyzer.count_non_indexed_pages('https://www.golgix.com/')

Error counting non-indexed pages for https://www.golgix.com/: name 're' is not defined


In [28]:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import json
import time
import re
def count_non_indexed_pages(url):
    """Return the result count Google reports for a ``site:<domain>`` query.

    Despite the name, the value parsed here is the number shown in the
    ``result-stats`` element of the results page for ``site:<domain>``,
    i.e. the reported number of matching results for that domain.

    Relies on the module-level ``headers`` dict for the User-Agent.

    Args:
        url: URL of the site to query. Its host part is used; when the string
            has no scheme (and therefore no parsable host), the string itself
            is used as the ``site:`` operand.

    Returns:
        0 when the page carries the "did not match any documents" notice,
        the integer parsed from "About N results" when present, and None
        when neither is found or the request/parsing fails (the error is
        printed, not raised).
    """
    try:
        # Fall back to the raw string so a scheme-less input does not become "site:".
        domain = urlparse(url).netloc or url
        response = requests.get(
            "https://www.google.com/search",
            # Let requests percent-encode the query instead of pasting it into the URL.
            params={"q": f"site:{domain}"},
            headers=headers,
            # Without a timeout, requests can wait indefinitely on a stalled connection.
            timeout=10,
        )
        # Surface HTTP errors (e.g. blocking/rate limiting) instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Check if there are zero results
        no_results_text = soup.find("div", class_="card-section")
        if no_results_text and "did not match any documents" in no_results_text.text:
            return 0

        # Count results (example logic, may need refinement)
        result_stats = soup.find("div", id="result-stats")
        if result_stats:
            match = re.search(r"About ([\d,]+) results", result_stats.text)
            if match:
                # Strip thousands separators before converting, e.g. "1,234" -> 1234.
                return int(match.group(1).replace(",", ""))

    except Exception as e:
        print(f"Error counting non-indexed pages for {url}: {e}")

    return None

In [30]:
count_non_indexed_pages('https://www.thebengalcats.com')

114

In [12]:
# NOTE(review): `search_competitors` is not defined or imported anywhere in the
# visible notebook; the recorded output of this cell is a NameError. Define or
# import it before this cell, or remove the cell.
business = "Starbucks"
product = "coffee"
competitors = search_competitors(business, product)
competitors

NameError: name 'search_competitors' is not defined

In [6]:
# Build the Google search URL for the product query.
from urllib.parse import quote_plus

product="Bengal kittens for sale in united states"
query = f"{product}"
# Collapse runs of whitespace, then percent-encode: quote_plus turns spaces into
# '+' and escapes characters such as '&', '#' or '+' that would otherwise
# corrupt the query string.
google_search_url = f"https://www.google.com/search?q={quote_plus(' '.join(query.split()))}"
# Browser-like User-Agent sent with the search requests in later cells.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

In [29]:
# Fetch the results page built above and parse it into `soup` for the cells below.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(google_search_url, headers=headers, timeout=10)
# Fail loudly on HTTP errors rather than parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
        

In [30]:
# Collect the text of every anchor (<a>) element on the parsed results page;
# this includes navigation links as well as result titles.
competitors = [anchor.text for anchor in soup.select('a')]
competitors

['here',
 'Skip to main content',
 'Accessibility help',
 'Accessibility feedback',
 '',
 '',
 '',
 'Sign in',
 'All',
 'Shopping',
 'Images',
 'Videos',
 'Web',
 'News',
 'Maps',
 'Wild & Sweet Bengals | Bengal Cat Breeder in North Americawildnsweetbengals.comhttps://wildnsweetbengals.com › ...',
 'Bengal Kittens Available',
 'Snow Bengal',
 'Silver Bengal',
 'Charcoal Bengal',
 'Bengal Kittens for salethebengalcats.comhttps://www.thebengalcats.com',
 'Belle Ami Bengals: Gorgeous Bengal CatsBelle Ami Bengalshttps://www.belleamibengals.com',
 'Bengal Cats & Kittens for Sale',
 'Snow Bengal Kittens For Sale',
 'Bengal pricing',
 'Bengal Kittens and Cats - Royal Bengal Cattery - Bengal Cats ...Royal Bengal Catteryhttps://www.royalbengalcattery.com',
 'Bengal cat breeders Philadelphia | Bengal cat saleBengal Kittens Philadelphiahttps://kittens-bengal.com',
 'Bengal Kittens & Cats for Sale Near Me | Wild & Sweet Bengalswildnsweetbengals.comhttps://wildnsweetbengals.com › bengal-kittens-for

In [7]:
# Keep the href of every anchor that has one containing 'http'
# (anchors without an href, or with an empty one, are dropped).
links = [
    href
    for href in (anchor.get('href') for anchor in soup.select('a'))
    if href and 'http' in href
]
links

NameError: name 'soup' is not defined

In [10]:
# Collect every absolute-looking link from the first result pages of a Google search.
base_url = "https://www.google.com/search"
product = "Bengal kittens"
location = "United States"
queries = [f"{product} in {location}"]
page_numbers = [0, 1]  # Zero-based result pages to fetch (0 = first page, 1 = second page, ...)
results_per_page = 10  # `start` is a result offset: start=0, start=10, start=20 for pages 1, 2, 3

all_links = []
for page in page_numbers:
    # Convert the page index into the result offset; sending the page index
    # itself (start=1) would re-fetch almost the same results as start=0.
    start = page * results_per_page
    # Join with spaces and let requests encode them; a literal '+' passed via
    # `params` would be percent-encoded and sent as a plus sign, not a space.
    params = {'q': ' '.join(queries), 'start': start}
    # The timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(base_url, params=params, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Keep every href that contains 'http'; filtering/deduplication happens later.
    for result in soup.select('a'):
        href = result.get('href')
        if href and 'http' in href:
            all_links.append(href)


In [12]:
from urllib.parse import urlparse


def clean_and_filter_urls(urls):
    """Reduce raw result links to at most five unique, non-Google base URLs.

    Each link is collapsed to ``scheme://host``. Links whose host contains
    "google" are ignored, as are links without a host (for example relative
    hrefs such as ``/url?q=http...``), which would otherwise produce the
    meaningless base URL ``"://"`` and use up one of the five slots.

    Args:
        urls: Iterable of URL strings, in ranking order.

    Returns:
        List of up to five base URLs, in first-seen order.
    """
    max_results = 5
    unique_urls = set()  # Base URLs already collected
    base_urls = []       # Final cleaned base URLs, in first-seen order

    for url in urls:
        parsed = urlparse(url)

        # Nothing to extract from a link that has no host part.
        if not parsed.netloc:
            continue

        # Ignore Google links
        if 'google' in parsed.netloc:
            continue

        base_url = f"{parsed.scheme}://{parsed.netloc}"
        if base_url in unique_urls:
            continue

        unique_urls.add(base_url)
        base_urls.append(base_url)

        # Stop as soon as enough unique URLs have been collected.
        if len(base_urls) == max_results:
            break

    return base_urls




In [13]:
# Get cleaned and filtered URLs
unique_base_urls = clean_and_filter_urls(all_links)
print(unique_base_urls)

['https://www.royalbengalcattery.com', 'https://wildnsweetbengals.com', 'https://www.sakurabengals.com', 'https://kittens-bengal.com', 'https://bengalsofbama.com']


In [11]:
all_links

['https://support.google.com/websearch/answer/181196?hl=en-IN',
 'https://www.google.com/webhp?hl=en&sa=X&ved=0ahUKEwjp9tPQxNeKAxWsTGcHHVrqKfcQPAgI',
 'https://www.google.com/webhp?hl=en&ictx=0&sa=X&ved=0ahUKEwjp9tPQxNeKAxWsTGcHHVrqKfcQpYkNCAo',
 'https://www.google.co.in/intl/en/about/products?tab=wh',
 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/search%3Fq%3DBengal%2Bkittens%2Bin%2BUnited%2BStates%26start%3D0&ec=GAZAAQ',
 'https://maps.google.com/maps?sca_esv=d46ad9e90e5d3acb&output=search&q=Bengal+kittens+in+United+States&source=lnms&fbs=AEQNm0DvD4UMlvdpwktgGj2ZHhIXAIHy0lF5HBdT5py_0SmcDRj-ZcG8sN4MPTI25WFYis4wl2w2HABIwzHTNHgs0XyvwBQ326rTyYqJYbg_1X6pmt2OKPouyxEdwTCL9hsaApO3FRLbqtkj9tDfgnXKaUvRe7AGx_xdZ-OvZau2lFn-u1kTzfVBNQnpFLBtYqyr8ThTATosP7fshT1I2Y9qykLFrVsXeA&entry=mc&ved=1t:200715&ictx=111',
 'https://www.royalbengalcattery.com/',
 'https://www.royalbengalcattery.com/',
 'https://wildnsweetbengals.com/en/',
 'https://wildnsweetbenga