In [None]:
import requests
import pandas as pd
import time  # Import the time module for sleep

# Define the API URL OF THE WEBSITE (AS OBSERVED IN CHROME'S DEVELOPMENT TOOL)
url = "https://zapier.com/explore-api"

# Define the headers
# NOTE(review): the API key and CSRF token below are hard-coded in the source.
# They look like values copied from a browser session -- consider loading them
# from environment variables instead of keeping them in the notebook.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",
    "Content-Type": "application/json",
    "x-api-key": "YnhcscfPMog8CmM-28oHy_AyCEMJP!r36m3ZqDMf",
    "x-csrftoken": "nQATqivRqac5Fpk5aW5W4lOLxITQ4YsdzBFreMy8KsDoIB3rmcSVgPtJVpBUS1Dl"
}

# Paging parameters
PAGE_SIZE = 22        # value sent as `limit`; also the step between offsets
MAX_OFFSET = 7713     # safety cap on `offset` (exclusive upper bound of the range)
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever
REQUEST_DELAY = 2     # seconds to wait between requests (Ethical Scraping)

# GraphQL query (payload parameters extracted from the Network tab in Chrome's
# developer tool). It does not change between pages, so it is built once.
CATEGORY_APPS_QUERY = """
        query CategoryAppsBFFQuery($categorySlug: String = "all", $limit: Int = 10, $offset: Int = 0, $orderBy: AppSortOrder, $filterBy: String) {
          appCategory: appCategoryWithSlug(slug: $categorySlug) {
            id
            title
            apps(
              orderBy: $orderBy
              limit: $limit
              offset: $offset
              additionalCategorySlug: $filterBy
            ) {
              results {
                name
                profileUrl
                __typename
              }
              count
              hasNextPage
              __typename
            }
            __typename
          }
        }
        """

# DataFrame is preferred for easy comparative checking (easy to track errors).
# It is created with its columns up front so later cells can always read
# all_apps['name'] / all_apps['profileUrl'], even if no page was downloaded.
all_apps = pd.DataFrame(columns=['name', 'profileUrl'])

# One small DataFrame per page; concatenated once after the loop instead of
# re-copying the accumulated frame on every iteration.
page_frames = []

for offset in range(0, MAX_OFFSET, PAGE_SIZE):
    # Define the payload with dynamic offset and constant limit
    data = {
        "operationName": "CategoryAppsBFFQuery",
        "query": CATEGORY_APPS_QUERY,
        "variables": {
            "categorySlug": "all",
            "limit": PAGE_SIZE,
            "offset": offset,
            "orderBy": "POPULARITY",
            "filterBy": ""
        }
    }

    # Send the POST request; a network error skips this page instead of
    # aborting the whole run.
    try:
        response = requests.post(url, headers=headers, json=data, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print(f"Request failed with error: {e} for offset {offset}")
        time.sleep(REQUEST_DELAY)
        continue

    # Check the response status
    if response.status_code == 200:
        print(f"Request successful for offset {offset}")
        results = response.json()

        # Walk data -> appCategory -> apps defensively: any level may be
        # missing or null when the API reports an error in a 200 response.
        apps_page = ((results.get('data') or {}).get('appCategory') or {}).get('apps')
        if not apps_page:
            print(f"Unexpected response shape for offset {offset}; skipping this page")
        else:
            apps = apps_page.get('results') or []
            if apps:
                # Keep only the app name and profile URL
                page_frames.append(pd.DataFrame(apps)[['name', 'profileUrl']])
            # Stop at the end of the listing: an empty page, or the API
            # explicitly reporting that no further page exists.
            if not apps or apps_page.get('hasNextPage') is False:
                break
    else:
        print(f"Request failed with status code: {response.status_code} for offset {offset}")

    # Sleep to rate limit requests (Ethical Scraping)
    time.sleep(REQUEST_DELAY)

if page_frames:
    all_apps = pd.concat(page_frames, ignore_index=True)

# Display the entire DataFrame
print(all_apps)

In [None]:
# LET'S NOW USE BEAUTIFUL SOUP TO SCRAPE THE TARGET CONTENT FROM THE profileUrl PAGES GENERATED ABOVE
from bs4 import BeautifulSoup

# Profile page URLs taken from the `profileUrl` column of the DataFrame built
# in the previous cell.
profile_url_list = all_apps['profileUrl'].to_list()

# Module-level accumulators used by the scraping helpers below:
#   app_description - extracted description text, one entry per page
#   app_url         - extracted "Learn more" link targets
#   failed_urls     - URLs that could not be downloaded
app_description, app_url, failed_urls = [], [], []

# Number of profile pages handled per batch
batch_size = 100

# GET LIST OF BEAUTIFULSOUP OBJECTS
def getpage(url_list, max_attempts=10, timeout=10, delay=2):
    """Download every URL in *url_list* and parse it with BeautifulSoup.

    Args:
        url_list: Iterable of page URLs (one batch).
        max_attempts: Maximum number of requests made per URL.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
        delay: Seconds to pause after each successful download, to avoid
            exhausting the server.

    Returns:
        A list with one entry per input URL, in order: the parsed
        ``BeautifulSoup`` object, or ``None`` when the page could not be
        downloaded. URLs that failed are also appended to the module-level
        ``failed_urls`` list.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
    }
    all_bs = []  # List to hold all BeautifulSoup objects
    for url in url_list:  # Use the batch-specific URL list
        attempts = 0
        success = False  # Flag to track successful requests

        while attempts < max_attempts:
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)
            except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
                attempts += 1
                if attempts >= max_attempts:
                    # No retry will follow, so do not waste time backing off.
                    print(f"Connection error occurred: {e}. No attempts left.")
                    break
                # Exponential backoff, capped at one minute
                wait_time = min(60, 2 ** attempts)
                print(f"Connection error occurred: {e}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            except requests.exceptions.RequestException as e:
                attempts += 1  # Increment attempts for any failed request
                failed_response = getattr(e, "response", None)
                status = getattr(failed_response, "status_code", None)
                # A 4xx answer (other than 408 Request Timeout / 429 Too Many
                # Requests) will not change on retry; repeating it only adds
                # load on the server.
                if status is not None and 400 <= status < 500 and status not in (408, 429):
                    print(f"Request failed: {e}. Client error, not retrying.")
                    break
                if attempts >= max_attempts:
                    print(f"Request failed: {e}. No attempts left.")
                    break
                print(f"Request failed: {e}. Retrying in 1 second...")
                time.sleep(1)
            else:
                all_bs.append(BeautifulSoup(response.text, 'html.parser'))
                success = True  # Mark success
                time.sleep(delay)  # Ethical scraping, no exhausting the server
                break  # Break after a successful request

        if not success:  # If the request was not successful after all attempts
            print(f"Failed to scrape {url} after {attempts} attempts.")
            failed_urls.append(url)
            # Placeholder keeps the result aligned with url_list
            all_bs.append(None)

    return all_bs

# EXTRACT CONTENT
def getcontent(bs_list):
    """Extract the description and "Learn more" link from each parsed page.

    For every entry of *bs_list* exactly one value is appended to the
    module-level ``app_description`` list and exactly one to ``app_url``, so
    both lists stay aligned with the input. ``"not given"`` is used when the
    page is ``None`` (download failed) or the element is not found.

    Args:
        bs_list: List of ``BeautifulSoup`` objects or ``None`` placeholders.
    """
    for bs in bs_list:
        if bs is None:
            app_description.append("not given")
            app_url.append("not given")
            continue

        # Extracting app description
        app_details = bs.find('div', {'class': 'css-8gold7-AppDetails__appDescription'})
        app_description.append(app_details.get_text() if app_details else "not given")

        # Extracting the URL for "Learn more". Only the first matching anchor
        # is kept: appending every match would put several URLs in app_url
        # for one page and break its alignment with app_description.
        learn_more_url = next(
            (link['href'] for link in bs.find_all('a', href=True)
             if link.find(string="Learn more")),
            "not given",
        )
        app_url.append(learn_more_url)

# PROCESS IN BATCHES (This approach helps to save the RAM and track record easily)
def process_batches(profile_url_list, batch_size,
                    output_dir=r"E:\PYTHON- DATA SCIENCE\Data hub",
                    app_names=None):
    """Scrape the profile pages batch by batch and save each batch to Excel.

    Processing in batches keeps memory use low and makes progress easy to
    track. Each batch is downloaded with ``getpage``, parsed with
    ``getcontent`` (which fills the module-level ``app_description`` and
    ``app_url`` lists) and written to
    ``ZapierScrapingResults_batch_<n>.xlsx`` inside *output_dir*.

    Args:
        profile_url_list: List of profile page URLs to scrape.
        batch_size: Number of URLs per batch (positive integer).
        output_dir: Directory the batch files are written to; created if it
            does not exist.
        app_names: App names aligned with *profile_url_list*. Defaults to the
            ``name`` column of the module-level ``all_apps`` DataFrame.
    """
    import os  # local import: `os` is not imported before this cell

    if app_names is None:
        app_names = all_apps['name'].tolist()

    total = len(profile_url_list)
    for start_idx in range(0, total, batch_size):
        # Clamp so the logged range is correct for the final, shorter batch
        end_idx = min(start_idx + batch_size, total)
        batch_number = start_idx // batch_size + 1
        profile_url_batch = profile_url_list[start_idx:end_idx]
        app_name_batch = app_names[start_idx:end_idx]

        print(f"Processing batch from {start_idx} to {end_idx}")

        # Start from empty accumulators: leftovers from an earlier,
        # interrupted run would otherwise make the length check below fail.
        app_description.clear()
        app_url.clear()

        # Get BeautifulSoup objects for the batch
        bs_list = getpage(profile_url_batch)

        # Extract content from the batch
        getcontent(bs_list)

        # ARRANGING THE DATA FOR BETTER PRINTING
        if len(app_description) == len(app_name_batch) == len(app_url):
            ZapierScrapingResults = pd.DataFrame({
                "App Name": app_name_batch,
                "Description": list(app_description),
                "URL": list(app_url)
            })
            # Save each batch to Excel
            os.makedirs(output_dir, exist_ok=True)
            batch_file = os.path.join(output_dir, f"ZapierScrapingResults_batch_{batch_number}.xlsx")
            ZapierScrapingResults.to_excel(batch_file, index=False)
            print(f"Batch {batch_number} saved successfully.")
        else:
            print(
                f"Length of lists does not match to create the DataFrame for batch {batch_number} "
                f"(names={len(app_name_batch)}, descriptions={len(app_description)}, "
                f"urls={len(app_url)}); batch not saved."
            )

        # Clear lists after handling the batch
        app_description.clear()
        app_url.clear()
        print(f"Memory cleared after batch {batch_number}.")

# RUN THE BATCH PROCESSING
# Scrapes every URL in profile_url_list, batch_size URLs at a time; each
# batch is written to its own Excel file by process_batches.
process_batches(profile_url_list, batch_size)


In [None]:
# THE FINAL STEP IS TO COMBINE ALL EXCEL FILES GENERATED ABOVE INTO A SINGLE EXCEL FILE
import glob
import os

# Path where batch files downloaded above are saved
path = r"E:\PYTHON- DATA SCIENCE\Data hub"  # Replace this with your own storage path

# Pattern matching the per-batch files written by the previous cell
file_pattern = os.path.join(path, "ZapierScrapingResults_batch_*.xlsx")


def _batch_sort_key(file):
    """Order batch files by their numeric suffix (batch_2 before batch_10)."""
    suffix = os.path.splitext(os.path.basename(file))[0].rsplit("_", 1)[-1]
    # Files with a non-numeric suffix sort after the numbered ones, by name
    return (0, int(suffix), "") if suffix.isdecimal() else (1, 0, suffix)


# glob returns matches in arbitrary order; sort them so the combined file
# keeps the rows in batch order.
all_files = sorted(glob.glob(file_pattern), key=_batch_sort_key)

if all_files:
    # Read and concatenate all files
    df_list = [pd.read_excel(file) for file in all_files]  # Read each file into a DataFrame
    combined_df = pd.concat(df_list, ignore_index=True)  # Concatenate all DataFrames

    # Save the combined DataFrame to a new Excel file
    combined_df.to_excel(os.path.join(path, "Combined_ZapierScrapingResults.xlsx"), index=False)

    print(f"All files have been successfully concatenated and saved in {path}.")
else:
    # pd.concat raises ValueError on an empty list, so report the problem instead
    print(f"No batch files matching {file_pattern} were found; nothing to combine.")
