In [1]:
import logging
import pandas as pd
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

logging.basicConfig(level=logging.INFO)


class ProductHuntScraper:
    """Scrape the products listed in the first section of the Product Hunt homepage.

    Pages are rendered in a headless Edge browser through Selenium, scrolled a
    few times so lazily loaded content is present, and then parsed with
    BeautifulSoup. The CSS class names used below are build-generated by the
    site and are expected to change; when a selector stops matching, the
    affected field is reported as missing instead of aborting the whole run.
    """

    _logger = logging.getLogger(__name__)

    # Columns of the frame returned by get_latest_product_details, in order.
    _COLUMNS = ['Product', 'Product Description', 'Website', 'Created By',
                'Followers', 'Upvotes', 'Comments']

    def __init__(self):
        self.edge_options = Options()
        self.edge_options.add_argument("--headless")
        self.base_url = "https://www.producthunt.com"
        # Tags that carry no useful text and are stripped from every parsed page.
        self.unwanted_tags = ['script', 'style', 'button', 'input', 'img', 'video', 'head', 'svg']

    def _get_soup(self, url, scroll_downs=5, scroll_pause=2):
        """Load ``url`` in a fresh headless browser and return its parsed HTML.

        Args:
            url: Absolute URL to load.
            scroll_downs: How many times END is sent to the page body to
                trigger lazy loading.
            scroll_pause: Seconds to wait after each scroll.

        Returns:
            A BeautifulSoup tree with every tag listed in ``self.unwanted_tags``
            removed.
        """
        # The context manager quits the browser even if loading fails.
        with webdriver.Edge(options=self.edge_options) as driver:
            driver.get(url)
            for _ in range(scroll_downs):
                driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
                time.sleep(scroll_pause)
            html_content = driver.page_source

        soup = BeautifulSoup(html_content, 'html.parser')
        for tag in self.unwanted_tags:
            for unwanted_tag in soup.find_all(tag):
                unwanted_tag.decompose()
        return soup

    @staticmethod
    def _find_product_path(soup):
        """Return the first ``/products/<slug>`` href found in ``soup``, or None."""
        for link in soup.find_all("a", href=True):
            match = re.match(r'^/products/[^/]+$', link['href'])
            if match:
                return match.group()
        return None

    @staticmethod
    def _find_product_site_url(soup):
        """Return the href of the "visit" button in ``soup``, or None if absent."""
        product_webpage_tag = soup.find(
            "a",
            {"class": "styles_button__7X8Df", "data-test": "product-header-visit-button"},
        )
        if product_webpage_tag is None:
            return None
        return product_webpage_tag.get('href')

    @staticmethod
    def _split_makers(makers):
        """Split a maker list such as ``"A, B and C"`` into individual names.

        Handles ``"A"``, ``"A and B"``, ``"A, B and C"`` and the Oxford-comma
        form ``"A, B, and C"``.
        """
        parts = re.split(r',\s*and\s+|\s+and\s+|,\s*', makers)
        return [name.strip() for name in parts if name.strip()]

    @classmethod
    def _parse_about_text(cls, data):
        """Extract followers, makers, upvotes and comments from the about text.

        Any field whose pattern is not found is returned as ``None``
        (``created_by`` as an empty list) rather than raising.
        """
        followers = re.search(r'(\d+(\.\d+)?[Kk]?)followers', data)
        makers = re.search(r'Made by (.+?)\.', data)
        # Both counters may contain thousands separators ("1,234").
        upvotes = re.search(r'Upvotes(\d[\d,]*)', data)
        comments = re.search(r'Comments(\d[\d,]*)', data)

        return {
            # Kept as the raw string (e.g. "775" or "1.2K"), as shown on the page.
            'followers': followers.group(1) if followers else None,
            'created_by': cls._split_makers(makers.group(1)) if makers else [],
            'upvotes': int(upvotes.group(1).replace(',', '')) if upvotes else None,
            'comments': int(comments.group(1).replace(',', '')) if comments else None,
        }

    def _parse_product_details(self, soup):
        """Parse the ``#about`` element of an already loaded post page."""
        about_content = soup.find(id="about")
        if about_content is None:
            self._logger.warning("No element with id 'about' found; details unavailable")
            return self._parse_about_text("")
        return self._parse_about_text(about_content.text)

    def extract_product_url(self, url):
        """Load ``url`` and return the first ``/products/<slug>`` path on it.

        Returns:
            The site-relative path (e.g. ``/products/fedica``) or ``None`` when
            the page contains no such link.
        """
        return self._find_product_path(self._get_soup(url))

    def extract_product_site_url(self, url):
        """Load a product page and return the external website it links to.

        Returns:
            The href of the "visit" button, or ``None`` if the button is missing.
        """
        return self._find_product_site_url(self._get_soup(url))

    @staticmethod
    def clean_url(url):
        """Strip the ``http(s)://`` scheme and any query string from ``url``.

        ``None`` is passed through unchanged so that missing websites do not
        raise.
        """
        if url is None:
            return None
        cleaned = re.sub(r'^https?://', '', url)
        cleaned = re.sub(r'\?.*$', '', cleaned)
        return cleaned

    def get_product_details(self, url):
        """Load a post page and return its followers, makers, upvotes and comments.

        Returns:
            A dict with keys ``followers`` (str or None), ``created_by``
            (list of str), ``upvotes`` (int or None) and ``comments``
            (int or None).
        """
        return self._parse_product_details(self._get_soup(url))

    def get_latest_product_details(self):
        """Scrape every product in the first homepage section into a DataFrame.

        Each post page is loaded once and reused for both the detail fields and
        the product-page link; the product page is then loaded to find the
        external website.

        Returns:
            A pandas DataFrame with the columns Product, Product Description,
            Website, Created By, Followers, Upvotes and Comments. The frame is
            empty (but has those columns) when the homepage section is missing.
        """
        soup = self._get_soup(self.base_url)
        section = soup.find('div', attrs={'data-test': 'homepage-section-0'})
        if section is None:
            self._logger.warning("Homepage section 'homepage-section-0' not found")
            return pd.DataFrame(columns=self._COLUMNS)

        records = []
        for item in section.find_all("div", class_="styles_item__Dk_nz"):
            product_name_tag = item.find("a", class_="styles_title__HzPeb")
            product_description_tag = item.find("a", class_="styles_tagline__Dwvza")
            if not (product_name_tag and product_description_tag):
                continue

            post_path = product_name_tag.get('href')
            if not post_path:
                continue
            post_url = self.base_url + post_path
            self._logger.info("Scraping post page: %s", post_url)

            # One browser load serves both the details and the product link.
            post_soup = self._get_soup(post_url)
            product_details = self._parse_product_details(post_soup)
            product_path = self._find_product_path(post_soup)

            product_site_url = None
            if product_path:
                product_site_url = self.extract_product_site_url(self.base_url + product_path)
            else:
                self._logger.warning("No product page link found on %s", post_url)

            records.append({
                'Product': product_name_tag.text.strip(),
                'Product Description': product_description_tag.text.strip(),
                'Website': self.clean_url(product_site_url),
                'Created By': product_details['created_by'],
                'Followers': product_details['followers'],
                'Upvotes': product_details['upvotes'],
                'Comments': product_details['comments'],
            })

        return pd.DataFrame(records, columns=self._COLUMNS)

In [2]:
# Run the full homepage scrape and report how long it took.
scraper = ProductHuntScraper()

start = time.perf_counter()
df = scraper.get_latest_product_details()
finish = time.perf_counter()

elapsed_seconds = round(finish - start, 2)
print(f"\nFinished in {elapsed_seconds} second(s)")


Scraping Product Hunt...

Soup Created!!!...


Scraping homepage-section-0...


Scraping item:  0

Extracting product url:  https://www.producthunt.com/posts/fedica

Extracting product site url:  https://www.producthunt.com/products/fedica

Product site url:  https://fedica.com?ref=producthunt

Scraping item:  1

Extracting product url:  https://www.producthunt.com/posts/freelogo-dev

Extracting product site url:  https://www.producthunt.com/products/freelogo-dev

Product site url:  https://www.freelogo.dev?ref=producthunt

Scraping item:  2

Extracting product url:  https://www.producthunt.com/posts/salesos-with-ai-customer-persona

Extracting product site url:  https://www.producthunt.com/products/salesos-startos-sales-marketing-system

Product site url:  https://ajinkyabhat.com/salesos?ref=producthunt

Scraping item:  3

Extracting product url:  https://www.producthunt.com/posts/ultimate-etsy-guide-for-digital-products

Extracting product site url:  https://www.producthunt.com/prod

In [3]:
# Show the freshly scraped products (one row per homepage item).
df

Unnamed: 0,Product,Product Description,Website,Created By,Followers,Upvotes,Comments
0,Fedica,The one AI social media powerhouse,fedica.com,"[Samir Al-Battran, Erin Heywood, Dmitry Ustime...",775,544,167
1,FreeLogo.dev,"Free logo generator, no bullshit, takes seconds",www.freelogo.dev,"[Miłosz Jankiewicz, Łukasz Cybulski]",271,253,56
2,SalesOS with AI Customer Persona,Nail your target customer and fix your sales f...,ajinkyabhat.com/salesos,[Ajinkya Bhat | Notion X Startups],256,211,82
3,Ultimate Etsy Guide for Digital Products,Etsy success: your complete handbook,jordilabs.gumroad.com/l/utimate-etsy-guide,[Jordi Rodriguez],202,180,70
4,ChatGPT Prompts for AI,Use ChatGPT to become an AI guru,davidecamera.gumroad.com/l/ChatGPT_AI,[Davide Camera],215,167,29
5,playCSS,Improve your CSS skills by playing daily chall...,playcss.app,[Frank Eno],145,143,33
6,Meal Planner,Automatically track your calories and plan hea...,heykelseys.gumroad.com/l/Book,[Kelsey S],115,118,39
7,PetSitter Kit,Streamline pet care business success,www.laiew.com/petsitter,[leo],119,108,26
8,HabitScore,"Focus on your habits, not on your streaks",apps.apple.com/us/app/habitscore-habit-tracker...,[Sven Navez],91,92,11
9,U-Eyes,A Chrome Extension for simulating and capture ...,eyeballer.dev,[Han Tran],82,84,9


In [5]:
# Load the rows saved by earlier runs so the new scrape can be merged into them.
ph_df = pd.read_csv(
    filepath_or_buffer="../datasets/producthunt_company_details.csv",
)

Unnamed: 0,Product,Product Description,Website,Created By,Followers,Upvotes,Comments
0,Heyscribe,Manage and monetize your creative work-in-process,heyscribe.com,"['Artem Samarsky', 'Anastasia Morgan']",447,392,136
1,◯˚GitStart,Pull requests as a service,gitstart.com,"['Edward So', 'Miracle Ogunlade', 'Emily Luo',...",310,292,60
2,myReach,"Your second brain, powered by AI – ChatGPT for...",myreach.io,"['Christopher Payne', 'Alex Calle', 'Agnes Fel...",229,214,66
3,dyrector.io,Open-source SelfOps platform for containers.,app.dyrectorio.com,['Orbán Levente'],296,160,43
4,X2Image,Convert tweets to stylish images,x2image.app,['Ömer Taban'],157,157,61


In [6]:
# Merge the fresh scrape with the previously saved rows. The merged frame is
# later written back over the file the saved rows came from, so without
# de-duplication every re-run would append the same products again. The
# freshly scraped frame comes first so keep="first" retains the newest counts.
updated_df = (
    pd.concat([df, ph_df], ignore_index=True)
    .drop_duplicates(subset=["Product", "Website"], keep="first")
    .reset_index(drop=True)
)

In [7]:
# Sanity check: (rows, columns) of the merged, previously saved and new frames.
tuple(frame.shape for frame in (updated_df, ph_df, df))

((159, 7), (133, 7), (26, 7))

In [8]:
# Persist the merged dataset, overwriting the file it was loaded from.
updated_df.to_csv(
    path_or_buf="../datasets/producthunt_company_details.csv",
    index=False,
)