In [3]:
import os
import json
from datetime import datetime
from src.scrapper import WebScraper
from src.data_processer import DataProcessor
from src.content_analyzer import ContentAnalyzer

In [1]:
def setup_directories():
    """Create necessary directories if they don't exist"""
    dirs = ['output', 'output/scraping', 'output/analysis']
    for dir in dirs:
        os.makedirs(dir, exist_ok=True)

In [4]:
setup_directories()
        
# Initialize components
web_scraper = WebScraper()
data_processor = DataProcessor()
content_analyzer = ContentAnalyzer()

In [5]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
scraping_output = f"output/scraping/scraped_data_{timestamp}.json"
analysis_output = f"output/analysis/analysis_results_{timestamp}.json"

In [19]:
print("\n1. Reading URLs from Excel...")
excel_path = "File54.xlsx"  # Adjust path as needed
urls = data_processor.read_excel_to_url(excel_path)
print(f"Found {len(urls)} URLs")


1. Reading URLs from Excel...
Found 90001 URLs


In [20]:
## sampling 
urls = urls[:100]

In [21]:
print("\n2. Starting web scraping...")
scraping_results = {}

for i, url in enumerate(urls, 1):
  try:
      print(f"\nProcessing [{i}/{len(urls)}]: {url}")
      
      # Clean URL
      clean_url = data_processor.clean_url(url)
      print(f"Cleaned URL: {clean_url}")
      
      # Scrape website
      scraped_data = web_scraper.scrape_website(clean_url)
      
      # Store result
      scraping_results[url] = {
          'status': 'success',
          'cleaned_url': clean_url,
          'data': scraped_data,
          'timestamp': datetime.now().isoformat()
      }
      print("✓ Scraping successful")
      
  except Exception as e:
      print(f"✗ Error: {str(e)}")
      scraping_results[url] = {
          'status': 'error',
          'error': str(e),
          'timestamp': datetime.now().isoformat()
      }


2. Starting web scraping...

Processing [1/100]: http://Sanjosebengalcats.com
Cleaned URL: https://Sanjosebengalcats.com
✓ Scraping successful

Processing [2/100]: peachmarketplace.com
Cleaned URL: https://peachmarketplace.com
✓ Scraping successful

Processing [3/100]: peachmcintyre.com
Cleaned URL: https://peachmcintyre.com
✓ Scraping successful

Processing [4/100]: peachmedical.com
Cleaned URL: https://peachmedical.com
✓ Scraping successful

Processing [5/100]: peachmeleggings.com
Cleaned URL: https://peachmeleggings.com
✓ Scraping successful

Processing [6/100]: peachmode.com
Cleaned URL: https://peachmode.com
✓ Scraping successful

Processing [7/100]: peachmodern.com
Cleaned URL: https://peachmodern.com
✓ Scraping successful

Processing [8/100]: peach-momo.com
Cleaned URL: https://peach-momo.com
✓ Scraping successful

Processing [9/100]: peachmountain.com
Cleaned URL: https://peachmountain.com
✓ Scraping successful

Processing [10/100]: peachmusic.com
Cleaned URL: https://peachmus

In [22]:
print("\nSaving scraping results...")
with open(scraping_output, 'w', encoding='utf-8') as f:
    json.dump(scraping_results, f, indent=2, ensure_ascii=False)


Saving scraping results...


In [24]:
print("\n3. Starting content analysis...")
analysis_results = content_analyzer.process_scraped_data(
    scraping_output,
    analysis_output
)


3. Starting content analysis...

Processing [1/97]: http://Sanjosebengalcats.com
✓ Analysis successful

Processing [2/97]: peachmarketplace.com
✓ Analysis successful

Processing [3/97]: peachmcintyre.com
✓ Analysis successful

Processing [4/97]: peachmedical.com
✓ Analysis successful

Processing [5/97]: peachmeleggings.com
✓ Analysis successful

Processing [6/97]: peachmode.com
✓ Analysis successful

Processing [7/97]: peachmodern.com
✓ Analysis successful

Processing [8/97]: peach-momo.com
✓ Analysis successful

Processing [9/97]: peachmountain.com
✓ Analysis successful

Processing [10/97]: peachmusic.com
✗ Skipped - No valid content

Processing [11/97]: peachmyk.com
✗ Skipped - No valid content

Processing [12/97]: peachofmind.com
✓ Analysis successful

Processing [13/97]: peachonaleash.com
✓ Analysis successful

Processing [14/97]: peachorchardapartments.com
✓ Analysis successful

Processing [15/97]: peachpalettedesign.com
✓ Analysis successful

Processing [16/97]: peachpaperdesign

In [26]:
successful_scrapes = len([r for r in scraping_results.values() if r['status'] == 'success'])
successful_analyses = len([r for r in analysis_results.values() if r['status'] == 'success'])
        

In [27]:
print("\n=== Final Summary ===")
print(f"Total URLs processed: {len(urls)}")
print("\nScraping Results:")
print(f"- Successful: {successful_scrapes}")
print(f"- Failed: {len(urls) - successful_scrapes}")
print(f"- Output file: {scraping_output}")

print("\nAnalysis Results:")
print(f"- Successful: {successful_analyses}")
print(f"- Failed/Skipped: {len(urls) - successful_analyses}")
print(f"- Output file: {analysis_output}")


=== Final Summary ===
Total URLs processed: 100

Scraping Results:
- Successful: 84
- Failed: 16
- Output file: output/scraping/scraped_data_20241224_154243.json

Analysis Results:
- Successful: 84
- Failed/Skipped: 16
- Output file: output/analysis/analysis_results_20241224_154243.json


In [36]:
for url,data in analysis_results.items():
  print(url , data)

http://Sanjosebengalcats.com {'status': 'success', 'analysis': {'keywords': ['Bengal cats', 'San Jose Bengal Cats', 'Bengal kittens for sale', 'Bengal adults for sale', 'Are Bengal Cats smarter than other Cat Breeds?', 'Bengal kitten pricing'], 'business_name': 'San Jose Bengal Cats', 'products_services': 'Bengal kittens and Bengal cats for sale, Bengal breeding program, health checked by a licensed veterinarian, Pedigree Paperwork', 'target_audience': 'New York, Los Angeles, Miami, and beyond!', 'emails': ['rene@sanjosebengalcats.com'], 'phones': []}, 'timestamp': '2024-12-24T16:14:00.057634'}
peachmarketplace.com {'status': 'success', 'analysis': {'keywords': ['leopard', 'pumpkin', ' Halloween', 'embroidery', 'accessories', 't-shirts', 'long sleeve', 'quality', 'reviews'], 'business_name': 'Peach Marketplace', 'products_services': 'Home accessories, holiday collection, embroidery, clearance, full catalog, reviews.', 'emails': [], 'phones': []}, 'timestamp': '2024-12-24T16:14:01.61260

In [35]:
import pandas as pd
pd.DataFrame(analysis_results).T.to_csv('analysis_results.csv')

In [37]:
# store status , keywords , etc in diffrece columms 
res=pd.DataFrame(analysis_results)
res

Unnamed: 0,http://Sanjosebengalcats.com,peachmarketplace.com,peachmcintyre.com,peachmedical.com,peachmeleggings.com,peachmode.com,peachmodern.com,peach-momo.com,peachmountain.com,peachmusic.com,...,peachtreehearing.com,peachtreehoops.com,peachtreehotelgroup.com,peachtreeinvitational.com,peachtreekidney.com,peachtreekidsnantucket.com,peachtreelanehoa.com,peachtreelifeinsurance.com,peachtreelighting.com,peachtreemedicalcenter.com
status,success,success,success,success,success,success,success,success,success,skipped,...,success,success,success,success,success,success,success,success,success,success
analysis,"{'keywords': ['Bengal cats', 'San Jose Bengal ...","{'keywords': ['leopard', 'pumpkin', ' Hallowee...","{'keywords': ['social media content creator', ...","{'keywords': ['Pfizer', 'COVID-19', 'flu', 'ho...","{'keywords': ['peachmeleggings', 'leggings', '...","{'keywords': ['sarees', 'fabric', 'cotton', 's...","{'keywords': ['furniture', 'art', 'collectible...","{'articles': [{'title': 'Best Now', 'descripti...","{'keywords': ['tanks', 'fortifications', 'ship...",,...,"{'keywords': ['hearing loss', 'tinnitus treatm...","{'keywords': ['Atlanta Hawks', 'NBA Draft', 'N...","{'keywords': ['commercial real estate', 'hotel...","{'keywords': ['Peachtree Men's Invitational', ...","{'keywords': ['kidney', 'hypertension', 'diabe...","{'keywords': ['Nantucket', 'baby gifts', 'clot...","{'keywords': ['Peach Tree Lane HOA', 'communit...","{'keywords': ['life insurance', 'compare quote...","{'keywords': ['commercial lighting', 'products...","{'keywords': ['high quality care', 'Peachtree ..."
timestamp,2024-12-24T16:14:00.057634,2024-12-24T16:14:01.612604,2024-12-24T16:14:03.021023,2024-12-24T16:14:04.486476,2024-12-24T16:14:06.078729,2024-12-24T16:14:08.237915,2024-12-24T16:14:09.341041,2024-12-24T16:14:33.396954,2024-12-24T16:14:34.800705,2024-12-24T16:14:34.800766,...,2024-12-24T16:16:20.481493,2024-12-24T16:16:21.796476,2024-12-24T16:16:23.888615,2024-12-24T16:16:25.964528,2024-12-24T16:16:27.517621,2024-12-24T16:16:28.801890,2024-12-24T16:16:29.792112,2024-12-24T16:16:31.165828,2024-12-24T16:16:32.236269,2024-12-24T16:16:33.927366
error,,,,,,,,,,No valid content to analyze,...,,,,,,,,,,


In [None]:
data = []
for url, result in analysis_results.items():
    row = {
        'url': url,
        'status': result['status'],
        'timestamp': result.get('timestamp', '')
    }
    
    # Add analysis data if successful
    if result['status'] == 'success':
        analysis = result['analysis']
        row.update({
            'business_name': analysis.get('business_name', ''),
            'products_services': analysis.get('products_services', ''),
            'target_audience': analysis.get('target_audience', ''),
            'keywords': '|'.join(analysis.get('keywords', [])),
            'emails': '|'.join(analysis.get('emails', [])),
            'phones': '|'.join(analysis.get('phones', []))
        })
    else:
        row.update({
            'error': result.get('error', ''),
            'business_name': '',
            'products_services': '',
            'target_audience': '',
            'keywords': '',
            'emails': '',
            'phones': ''
        })
    
    data.append(row)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(data)



In [42]:
df.to_csv('output/analysis_results_new.csv', index=False)

In [41]:
df

Unnamed: 0,url,status,timestamp,business_name,products_services,target_audience,keywords,emails,phones,error
0,http://Sanjosebengalcats.com,success,2024-12-24T16:14:00.057634,San Jose Bengal Cats,"Bengal kittens and Bengal cats for sale, Benga...","New York, Los Angeles, Miami, and beyond!",Bengal cats|San Jose Bengal Cats|Bengal kitten...,rene@sanjosebengalcats.com,,
1,peachmarketplace.com,success,2024-12-24T16:14:01.612604,Peach Marketplace,"Home accessories, holiday collection, embroide...",,leopard|pumpkin| Halloween|embroidery|accessor...,,,
2,peachmcintyre.com,success,2024-12-24T16:14:03.021023,Peach McIntyre Inc.,"Social Media Content Creator, Home Shop E-Guid...",,social media content creator|big bag big bank|...,Inc.813.797.1396support@peachmcintyre.com|supp...,,
3,peachmedical.com,success,2024-12-24T16:14:04.486476,Peach Medical,Providing industry-leading fulfillment technol...,,Pfizer|COVID-19|flu|home test|rapid test,,,
4,peachmeleggings.com,success,2024-12-24T16:14:06.078729,Peach Me Leggings,"Women's leggings, shorts, flare pants, and set...",,peachmeleggings|leggings|shorts|flare pants|sets,,,
...,...,...,...,...,...,...,...,...,...,...
92,peachtreekidsnantucket.com,success,2024-12-24T16:16:28.801890,Peachtree Kids,"clothing, shoes, and accessories for infants a...",,Nantucket|baby gifts|clothing|outerwear,hello@peachtreekidsnantucket.com,,
93,peachtreelanehoa.com,success,2024-12-24T16:16:29.792112,Peach Tree Lane Homeowners Association,"maintenance, management, and security services...",residents of Peach Tree Lane and surrounding a...,Peach Tree Lane HOA|community living|neighborh...,,,
94,peachtreelifeinsurance.com,success,2024-12-24T16:16:31.165828,Peachtree Life Insurance,Life insurance quotes and policies,Individuals across the nation seeking the best...,life insurance|compare quotes|best rates|indep...,,,
95,peachtreelighting.com,success,2024-12-24T16:16:32.236269,Peachtree Lighting,Our products are sold to the commercial lighti...,,commercial lighting|products sold|network of p...,,,
