crawler.py
#!/usr/bin/env python3
"""
Title: Links Crawler
author: MKNC [https://github.com/Madhav-MKNC/]
created: 12-01-2023 01:50
"""
import os
from platform import uname
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def printScreen():
    """Clear the terminal and print the banner."""
    # startswith avoids matching 'Darwin' (macOS), which also contains 'win'
    os.system('cls' if uname().system.lower().startswith('win') else 'clear')
    print("[" + "=" * 30 + " LINKS CRAWLER " + "=" * 30 + "]\n")


class Crawler:
    """Breadth-first link crawler restricted to URLs under `restricted_domain`."""

    def __init__(self, base_url, restricted_domain='https://'):
        self.urls = [base_url]              # FIFO queue of pages still to visit
        self.urls_dict = dict()             # fast membership test for seen links
        self.links_found = []               # every unique link, in discovery order
        self.restricted_domain = restricted_domain
    def inList(self, url):
        """Return True if `url` has already been recorded."""
        return url in self.urls_dict
    def crawl_all_links(self, url):
        """Fetch `url`, extract all <a href> targets, and queue unseen in-domain links."""
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if not path:                                # <a> tag without an href
                continue
            if not path.startswith('http'):
                path = urljoin(url, path)               # resolve relative links
            if self.inList(path):
                continue
            if '#' in path:                             # skip fragment links
                continue
            if not path.startswith(self.restricted_domain):
                continue
            print("[*] Link found -", path)
            self.urls.append(path)
            self.links_found.append(path)
            self.urls_dict[path] = True
    def crawl(self):
        """Process the queue until every reachable page has been visited."""
        while self.urls:
            url = self.urls.pop(0)
            print("\n[+] Crawling:", url)
            try:
                self.crawl_all_links(url)
            except Exception as e:
                print(f'[!] Failed to crawl: {url}')
                print("[!] REASON:", e, '\n')
    def saveData(self, fname='links_found.txt'):
        """Write every discovered link to `fname`, one per line."""
        with open(fname, 'w', encoding="utf-8") as file:
            file.write("\n".join(self.links_found))
        print(f"[+] list saved in '{fname}'")


if __name__ == '__main__':
    printScreen()
    url = input('[=] Enter the base url you want to crawl: ')
    # url = "https://docs.aave.com/"
    if not url.startswith('http'):
        url = 'https://' + url

    aave = Crawler(base_url=url)
    try:
        aave.crawl()
    except KeyboardInterrupt:
        print("\n[Program Stopped]")
    aave.saveData()

    print()
    print("len(urls) =", len(aave.links_found))
    print("len(set(urls)) =", len(set(aave.links_found)))