In [17]:
import requests
from bs4 import BeautifulSoup

def scrape_grow_truncate_text(url, timeout=10):
    """Return the text of every 'grow truncate' div inside the page's second 'myContainer' div.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            a timeout the call can block indefinitely on an unresponsive server.

    Returns:
        A list of stripped text strings, one per matching div. An empty list is
        returned (after printing a message) when the request fails, the status
        code is not 200, or the page has fewer than two 'myContainer' divs.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures are reported the same way as HTTP failures:
        # print a message and hand back an empty result instead of raising.
        print(f"Failed to fetch the page: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch the page, status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all divs with class 'myContainer'
    containers = soup.find_all('div', class_='myContainer')

    # Ensure there is at least a second 'myContainer'
    if len(containers) < 2:
        print("The second specified container was not found on the page.")
        return []

    second_container = containers[1]  # Get the second div

    # NOTE(review): a multi-word class_ value is compared against the whole
    # class attribute string, so this depends on the site's exact markup.
    grow_truncate_divs = second_container.find_all('div', class_='grow truncate')

    # Extract text from each 'grow truncate' div and store in a list
    return [div.get_text(strip=True) for div in grow_truncate_divs]

# Example usage: download the topics index and print each topic name on its
# own line. `texts` is kept at notebook scope; a later cell builds URL slugs
# from it.
url = "https://betalist.com/topics"
texts = scrape_grow_truncate_text(url)
for text in texts:
    print(text)



3D
3D Printing
3D Technology
Accounting
Active Lifestyle
Ad Targeting
Adventure Travel
Advertising
Advertising Exchanges
Advertising Networks
Advertising Platforms
Advice
Agriculture
Algorithms
All Markets
All Students
Analytics
Android
Angel Investing
Animal Feed
App Discovery
Apple Watch
Application Performance Monitoring
Application Platforms
Apps
App Stores
Architecture
Art
Artificial Intelligence
Artists Globally
Audio
Augmented Reality
Automotive
B2B
Babies
Baby Accessories
Banking
Banking Tech
Batteries
Beauty
Bicycles
Big Data
Big Data Analytics
Billing
Biometrics
Biotechnology
Bitcoin
Blockchain
Blogging Platforms
Boating Industry
Books
Bots
Brand Marketing
Bridging Online and Offline
Broadcasting
Browser Extensions
Building Products
Business Analytics
Business Development
Business Information Systems
Business Intelligence
Business Productivity
Business Services
Business Travelers
Career Management
Career Planning
Cars
Casual Games
Cause Marketing
Celebrity
Charities
Charity
C

In [49]:
# Turn each scraped topic name into a URL slug: lower-case it and replace
# spaces with hyphens.
categories = texts
lowercase_strings = [name.lower().replace(" ", "-") for name in categories]


In [57]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_category_details(url, category, timeout=10):
    """Collect the startup links listed on one category page.

    Args:
        url: Address of the category page to download.
        category: Category label; used only in the failure messages.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            a timeout the call can block indefinitely on an unresponsive server.

    Returns:
        A list of ``{"text": ..., "href": ...}`` dicts, one per matching link
        inside the 'infinite-startups' div. The list is empty when the request
        fails, the status code is not 200, or that div is absent.
    """
    details = []
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network-level failures like HTTP failures instead of raising,
        # so one unreachable page does not abort a long scraping run.
        print(f"Failed to fetch the page for {category}: {exc}")
        return details

    if response.status_code != 200:
        print(f"Failed to fetch the page for {category}, status code: {response.status_code}")
        return details

    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find('div', class_='infinite-startups')
    if not container:
        return details

    # NOTE(review): a multi-word class_ value is compared against the whole
    # class attribute string, so this depends on the site's exact markup.
    links = container.find_all('a', class_="block whitespace-nowrap text-ellipsis overflow-hidden font-medium")
    for link in links:
        # For each link, store both its text and the href attribute
        details.append({
            "text": link.get_text(strip=True),
            "href": link.get('href')  # Extract the href attribute
        })
    return details

def scrape_all_categories_and_save_each(base_url, categories, filename="output_sites.json"):
    """Scrape every category page and keep a JSON file of the results up to date.

    For each category the URL is ``base_url`` followed by the category with
    spaces replaced by hyphens and lower-cased. The accumulated results are
    rewritten to ``filename`` after every category, so a run that stops partway
    still leaves the categories processed so far on disk.

    Args:
        base_url: URL prefix that the formatted category is appended to.
        categories: Iterable of category names.
        filename: Path of the JSON file to write.

    Returns:
        None. The output is the JSON file: a list of
        ``{"category": ..., "startups": [...]}`` objects.
    """
    data = []

    def _write_snapshot():
        # Overwrite the file with everything collected so far.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4)

    for category in categories:
        # Build the URL slug: spaces become hyphens, everything lower-case.
        formatted_category = category.replace(' ', '-').lower()
        full_url = f"{base_url}{formatted_category}"
        print(full_url)
        category_details = scrape_category_details(full_url, category)
        # Store the category with its corresponding startups' details
        data.append({"category": category, "startups": category_details})
        # Save to JSON file after processing each category
        _write_snapshot()

    if not data:
        # No category was processed, so nothing has been written yet. Write an
        # empty list so the message below is true and the file exists.
        _write_snapshot()
    print(f"Data saved to {filename}")

# Requires `lowercase_strings` (the hyphenated, lower-case topic slugs) to have
# been built by an earlier cell; this cell fails with NameError on a fresh
# kernel otherwise.
# The slugs double as the category labels stored in the output file.
categories = lowercase_strings  # Use your actual categories here
base_url = "https://betalist.com/topics/"

# Run the scraping and saving process: one request per category, results
# written to the function's default output file ('output_sites.json').
scrape_all_categories_and_save_each(base_url, categories)


https://betalist.com/topics/3d
https://betalist.com/topics/3d-printing
https://betalist.com/topics/3d-technology
https://betalist.com/topics/accounting
https://betalist.com/topics/active-lifestyle
https://betalist.com/topics/ad-targeting
https://betalist.com/topics/adventure-travel
https://betalist.com/topics/advertising
https://betalist.com/topics/advertising-exchanges
https://betalist.com/topics/advertising-networks
https://betalist.com/topics/advertising-platforms
https://betalist.com/topics/advice
https://betalist.com/topics/agriculture
https://betalist.com/topics/algorithms
https://betalist.com/topics/all-markets
https://betalist.com/topics/all-students
https://betalist.com/topics/analytics
https://betalist.com/topics/android
https://betalist.com/topics/angel-investing
https://betalist.com/topics/animal-feed
https://betalist.com/topics/app-discovery
https://betalist.com/topics/apple-watch
https://betalist.com/topics/application-performance-monitoring
https://betalist.com/topics/ap

In [53]:
import json
import os

# Split a JSON list of categories into several smaller JSON files, each
# holding a fixed number of categories, inside a dedicated folder.

# Path to the 'output.json' file
# NOTE(review): the category-scraping cell writes 'output_sites.json' by
# default, not 'output.json' -- confirm this is the intended input file.
input_file_path = 'output.json'  # Adjust this to your file's location

# Path to the 'all_startups' folder where you want to save the split JSON files
folder_path = 'all_startups'  # Adjust this to your desired location

# Ensure the 'all_startups' folder exists. exist_ok=True makes this a single
# call that is safe to re-run, with no gap between checking and creating.
os.makedirs(folder_path, exist_ok=True)

# Load the JSON data from the file
with open(input_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Define the number of categories per file
categories_per_file = 30

# Split the data into consecutive chunks of `categories_per_file` categories;
# the last chunk may be shorter.
chunks = [data[i:i + categories_per_file] for i in range(0, len(data), categories_per_file)]

# Save each chunk to a separate JSON file within the 'all_startups' folder.
# File names are numbered from 1: output_part_1.json, output_part_2.json, ...
for index, chunk in enumerate(chunks):
    filename = os.path.join(folder_path, f'output_part_{index + 1}.json')
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(chunk, file, ensure_ascii=False, indent=4)

print("Files saved successfully.")


Files saved successfully.


In [58]:
import requests
from bs4 import BeautifulSoup
import json

def scrape_details_from_url(url, timeout=10):
    """Fetch one page and return its title and description text.

    The title is the text of the first ``<h2>``. The description is the text
    of every ``<p>`` inside the 'main content' div, joined with single spaces.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            a timeout the call can block indefinitely on an unresponsive server.

    Returns:
        A ``(title, description)`` tuple of strings. 'Title Not Found' and/or
        'Description Not Found' stand in for whatever could not be extracted,
        including when the request fails or the status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network-level failures like HTTP failures instead of raising,
        # so one unreachable page does not abort a long scraping run.
        print(f"Failed to fetch the page for URL: {url}, error: {exc}")
        return 'Title Not Found', 'Description Not Found'

    if response.status_code != 200:
        print(f"Failed to fetch the page for URL: {url}, status code: {response.status_code}")
        return 'Title Not Found', 'Description Not Found'

    soup = BeautifulSoup(response.content, 'html.parser')

    # Look the heading up once and reuse the result.
    heading = soup.find('h2')
    title = heading.get_text(strip=True) if heading else 'Title Not Found'

    # NOTE(review): a multi-word class_ value is compared against the whole
    # class attribute string, so this depends on the site's exact markup.
    main_content_div = soup.find('div', class_='main content')
    if main_content_div:
        description = ' '.join(p.get_text(strip=True) for p in main_content_div.find_all('p'))
    else:
        description = 'Description Not Found'
    return title, description

def process_dataset_and_save_each_iteration(input_dataset, output_filename):
    """Scrape the detail page of every startup in the dataset and save the results.

    Each startup's ``href`` is appended to "https://betalist.com" to form the
    page URL. The accumulated results are rewritten to ``output_filename``
    after every startup, so a run that stops partway still leaves the startups
    processed so far on disk.

    Args:
        input_dataset: Iterable of ``{"category": ..., "startups": [...]}``
            dicts, where each startup is a dict with "text" and "href" keys.
        output_filename: Path of the JSON file to write.

    Returns:
        None. The output is the JSON file: a list of dicts with the keys
        "text", "href", "category", "title" and "description".
    """
    base_url = "https://betalist.com"
    scraped_data = []

    def _write_snapshot():
        # Overwrite the file with everything collected so far.
        with open(output_filename, 'w', encoding='utf-8') as file:
            json.dump(scraped_data, file, ensure_ascii=False, indent=4)

    for category_item in input_dataset:
        category = category_item["category"]
        for startup in category_item["startups"]:
            text = startup["text"]
            href = startup["href"]
            full_url = f"{base_url}{href}"
            title, description = scrape_details_from_url(full_url)

            # Append the current startup's scraped data
            scraped_data.append({
                "text": text,
                "href": href,
                "category": category,
                "title": title,
                "description": description
            })

            # Save to JSON file after processing each startup
            _write_snapshot()

    if not scraped_data:
        # No startup was processed, so nothing has been written yet. Write an
        # empty list so the message below is true and the file exists.
        _write_snapshot()

    print(f"All data saved to {output_filename}")

# Load the per-category startup links from 'output_sites.json', the default
# output file of scrape_all_categories_and_save_each.
with open('output_sites.json', 'r', encoding='utf-8') as file:
    input_dataset = json.load(file)

# Scrape every startup's detail page; results are rewritten to this file after
# each startup.
output_filename = 'scraped_website_details.json'
process_dataset_and_save_each_iteration(input_dataset, output_filename)


Failed to fetch the page for URL: https://betalist.com/startups/codulab, status code: 404
Failed to fetch the page for URL: https://betalist.com/startups/wpflexiblepay, status code: 404
Failed to fetch the page for URL: https://betalist.com/startups/linkgage, status code: 404
Failed to fetch the page for URL: https://betalist.com/startups/let-s-remote, status code: 404
All data saved to scraped_website_details.json
