In [44]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import csv
import pandas as pd

base_url = "https://fampay.in/"
paths = [
    "press",
    "blog",
    "",
    "blog/",
    "contact",
    "parent",
    "terms",
    "about",
    "partner",
    "privacy",
    "faqs",
    "careers",
    "friends"
]

visited_urls = set()
saved_urls = set()

In [45]:
def get_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None and not href.startswith('#'):
            abs_url = urljoin(url, href)
            if any(abs_url.endswith('/'+p) or abs_url.endswith('/'+p+'/') or abs_url.endswith('/'+p+'.html') for p in paths) and abs_url not in visited_urls:
                visited_urls.add(abs_url)
                if "fam" in abs_url:
                    links.append(abs_url)
    return links

def crawl(url, depth, writer):
    if depth == 0:
        return
    print("Crawling:", url)
    links = get_links(url)
    for link in links:
        if link not in saved_urls:
            writer.writerow([url, link])
            saved_urls.add(link)
            crawl(link, depth-1, writer)

In [46]:
# Open CSV file for writing
with open("crawled_links.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["From", "To"])  # Write header row
    crawl(base_url, depth=5, writer=writer)

Crawling: https://fampay.in/
Crawling: https://famcard.me/
Crawling: https://fampay.in/
Crawling: https://fampay.in/about
Crawling: https://fampay.in/parent
Crawling: https://fampay.in/blog
Crawling: https://fampay.in/blog/tag/teens/
Crawling: https://fampay.in/blog/decor-ideas-that-would-transform-your-room-at-home-or-hostel/
Crawling: https://wa.me/?text=D%C3%A9cor%20ideas%20that%20would%20transform%20your%20room%20at%20home%20or%20hostel%F0%9F%8E%A8%20https://fampay.in/blog/decor-ideas-that-would-transform-your-room-at-home-or-hostel/
Crawling: https://www.facebook.com/sharer/sharer.php?u=https://fampay.in/blog/decor-ideas-that-would-transform-your-room-at-home-or-hostel/
Crawling: https://twitter.com/intent/tweet?text=D%C3%A9cor%20ideas%20that%20would%20transform%20your%20room%20at%20home%20or%20hostel%F0%9F%8E%A8&url=https://fampay.in/blog/decor-ideas-that-would-transform-your-room-at-home-or-hostel/
Crawling: https://facebook.com/fampay.in/
Crawling: https://fampay.in/blog/social

In [47]:
df = pd.read_csv("crawled_links.csv")
df.drop_duplicates(inplace=True)
df.to_csv("cleaned_crawled_links.csv", index=False)