In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import csv
import pandas as pd

# Page the crawl starts from.
base_url = "https://famapp.in/"

# Path suffixes a link must end with to be followed. get_links() accepts each
# entry as "/<p>", "/<p>/" or "/<p>.html"; the empty string therefore lets any
# URL that ends in "/" through.
paths = [
    "press", "blog", "", "blog/",
    "contact", "parent", "terms", "about",
    "partner", "privacy", "faqs", "careers",
    "friends",
]

# Shared crawl state, mutated by get_links() and crawl() respectively.
visited_urls = set()  # every matching link seen so far
saved_urls = set()    # every link already written to the CSV

In [5]:
def get_links(url, timeout=10):
    """Fetch ``url`` and return the new links on it that should be crawled.

    A link qualifies when its absolute form ends with ``/<p>``, ``/<p>/`` or
    ``/<p>.html`` for some ``p`` in the module-level ``paths`` list, has not
    been seen before, and contains the substring ``"fam"``. Every link that
    matches a suffix is added to the module-level ``visited_urls`` set, even
    if it is then rejected by the ``"fam"`` check.

    Args:
        url: Page to download and scan for ``<a href=...>`` elements.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``. Without it a stalled server blocks forever.

    Returns:
        A list of absolute URLs, in document order. Empty when the page
        could not be fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole crawl.
        print("Skipping", url, "-", exc)
        return []

    # Built once per page rather than once per anchor; str.endswith accepts
    # a tuple of candidate suffixes.
    suffixes = tuple(
        '/' + p + tail for p in paths for tail in ('', '/', '.html')
    )

    soup = BeautifulSoup(response.content, 'html.parser')
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None or href.startswith('#'):
            continue  # no target, or a same-page fragment
        abs_url = urljoin(url, href)
        if urlparse(abs_url).scheme not in ('http', 'https'):
            continue  # mailto:, tel:, javascript: ... cannot be fetched
        if abs_url.endswith(suffixes) and abs_url not in visited_urls:
            visited_urls.add(abs_url)
            # NOTE(review): this matches "fam" anywhere in the URL, not just
            # the host name - confirm that is the intended scope filter.
            if "fam" in abs_url:
                links.append(abs_url)
    return links

def crawl(url, depth, writer):
    """Recursively follow links from ``url`` and record each new edge.

    For every link returned by ``get_links(url)`` that is not yet in the
    module-level ``saved_urls`` set, a ``[url, link]`` row is written, the
    link is added to ``saved_urls``, and the link is crawled in turn with
    ``depth - 1``.

    Args:
        url: Page to crawl.
        depth: Remaining levels to descend. Zero or a negative value stops
            immediately; the previous ``== 0`` check never fired for a
            negative depth, so the recursion was no longer depth-limited.
        writer: Object with a ``writerow`` method (e.g. a ``csv.writer``).

    Returns:
        None.
    """
    if depth <= 0:
        return
    print("Crawling:", url)
    for link in get_links(url):
        if link in saved_urls:
            continue
        writer.writerow([url, link])
        # Mark before recursing so deeper levels do not record it again.
        saved_urls.add(link)
        crawl(link, depth - 1, writer)

In [6]:
# Open the CSV file for writing. newline="" is required by the csv module so
# that it controls line endings itself (otherwise blank rows appear on
# Windows); the explicit encoding keeps non-ASCII URLs portable.
with open("famapp_crawled_links.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["From", "To"])  # Write header row
    # Each data row is one (page, link found on that page) edge.
    crawl(base_url, depth=5, writer=writer)

Crawling: https://famapp.in/
Crawling: https://famcard.me/
Crawling: https://fampay.in/
Crawling: https://fampay.in/about
Crawling: https://fampay.in/blog
Crawling: https://fampay.in/blog/tag/teens/
Crawling: https://fampay.in/blog/tag/parents/
Crawling: https://fampay.in/blog/tag/news/
Crawling: https://fampay.in/blog/tag/famsays/
Crawling: https://fampay.in/blog/tag/community/
Crawling: https://fampay.in/blog/tag/culture/
Crawling: https://fampay.in/blog/tag/growth/
Crawling: https://fampay.in/blog/tag/lifestyle/
Crawling: https://fampay.in/blog/tag/pr-news/
Crawling: https://fampay.in/blog/tag/product/
Crawling: https://fampay.in/blog/tag/tech/
Crawling: https://fampay.in/blog/how-to-clean-up-your-android-apps-dependencies-the-ultimate-guide-2/
Crawling: https://fampay.in/blog/author/fampay/
Crawling: https://fampay.in/blog/discover-fampays-engagement-service-missions-3/
Crawling: https://fampay.in/blog/implementing-an-in-memory-queue/
Crawling: https://fampay.in/blog/covid-19-vacci

In [7]:
# Reload the crawl output and drop rows that are exact duplicates.
# Assigning the result (instead of inplace=True) keeps the cell re-runnable.
df = pd.read_csv("famapp_crawled_links.csv").drop_duplicates()
# Output name fixed from the misspelled "fapapp_..." to match the input file.
df.to_csv("famapp_cleaned_crawled_links.csv", index=False)