# WEB CRAWLER USING PYTHON

### In this notebook I will describe how to build a web crawler using Python.

In [None]:
import requests
import re    
from urllib.parse import urlparse

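### To build the web crawler I am using three Python libraries:
### 1) requests (a third-party package; install it with `pip install requests`)
### 2) re (standard library)
### 3) urlparse, from the standard-library module urllib.parse
### The first step is to import all of these libraries (code written above).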

In [None]:
# Links that crawl() has already handled; crawl() checks and updates this set so that no link is processed twice.
visited = set()

### I am using a set named `visited`; its purpose is explained later in this notebook (code written above).

In [None]:
def get_page(url, timeout=5):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        The response body as ``str``, or ``''`` when the request fails.
        Failures are printed instead of raised so that a crawl can carry
        on with the next link.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError) as e:
        # Connection errors, timeouts and malformed URLs are expected when
        # following arbitrary links; report them and treat the page as empty.
        print(e)
        return ''

    body = response.content
    try:
        # Try UTF-8 first: decoding a UTF-8 page as latin-1 silently garbles
        # every non-ASCII character (e.g. in meta descriptions).
        return body.decode('utf-8')
    except UnicodeDecodeError:
        # latin-1 maps every possible byte to a character, so this fallback
        # can never fail.
        return body.decode('latin-1')

### To find all the links in a webpage we first need the source code of that page. To get it, I have defined a function called get_page.
### The main job of this function is to return the source code of the page at the given URL (code written above).

In [None]:
def get_links(url):
    """Return the set of absolute http(s) links found on the page at ``url``.

    The page is fetched with ``get_page`` and every double-quoted ``href``
    of an ``<a>`` tag is collected. Each href is resolved against ``url``,
    so relative paths such as ``page.html`` or ``../up`` become full URLs.
    Links whose scheme is not http or https (``mailto:``, ``javascript:``,
    ``tel:`` ...) are dropped, and ``#fragment`` parts are removed so the
    same page is not returned under several names.

    Args:
        url: Address of the page whose links should be extracted.

    Returns:
        A ``set`` of absolute URL strings; empty when the page could not
        be fetched or contains no links.
    """
    # Imported locally so this function does not depend on an extra
    # import being added to the notebook's import cell.
    from urllib.parse import urljoin

    html = get_page(url)
    # Raw string: ``\s`` inside a normal string literal is an invalid escape.
    hrefs = re.findall(r'''<a\s+(?:[^>]*?\s+)?href="([^"]*)"''', html)

    links = set()
    for href in hrefs:
        try:
            # urljoin handles relative, root-relative and absolute hrefs alike.
            target = urlparse(urljoin(url, href.strip()))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); nothing to crawl.
            continue
        if target.scheme not in ('http', 'https') or not target.netloc:
            continue
        links.add(target._replace(fragment='').geturl())
    return links

### Now that we have the source code of the webpage, it's time to get all the links from it. To extract them I have defined a function called get_links (code written above).
### The main job of this function is to return a set containing all the links found in that source code.

In [None]:
def extract_info(url):
    """Return the named ``<meta>`` tags of the page at ``url`` as a dict.

    The page is fetched with ``get_page`` and parsed with the standard
    library HTML parser, so the attribute order inside a tag, the quote
    style and apostrophes inside a value do not matter.

    Args:
        url: Address of the page to inspect.

    Returns:
        A dict mapping each meta ``name`` (lower-cased, so ``Description``
        and ``description`` are the same key) to its ``content``. When a
        name occurs more than once the last occurrence wins. The dict is
        empty when the page could not be fetched or has no such tags.
    """
    # Imported locally so this function does not depend on an extra
    # import being added to the notebook's import cell.
    from html.parser import HTMLParser

    class _MetaCollector(HTMLParser):
        """Collects name/content pairs from <meta> start tags."""

        def __init__(self):
            super().__init__()
            self.meta = {}

        def handle_starttag(self, tag, attrs):
            if tag != 'meta':
                return
            attributes = dict(attrs)
            name = attributes.get('name')
            content = attributes.get('content')
            # Skip tags such as <meta charset=...> that carry no name/content pair.
            if name and content is not None:
                self.meta[name.strip().lower()] = content

    collector = _MetaCollector()
    collector.feed(get_page(url))
    collector.close()
    return collector.meta

### When you search for anything on Google, you may have noticed that Google shows each webpage link together with a description of the page's content. To do the same, I have defined a function called extract_info (code written above).
### The main job of this function is to return a dictionary containing the description and keywords of a webpage.

In [None]:
def crawl(url, max_pages=None):
    """Crawl outward from ``url`` and print every newly discovered link.

    For each link that is not yet in the module-level ``visited`` set,
    the link is recorded, its meta description and keywords are printed,
    and the links on that page are followed in turn (depth first).

    The traversal uses an explicit stack instead of recursion, so long
    chains of links cannot exceed Python's recursion limit.

    Args:
        url: Page whose links are the starting point of the crawl.
        max_pages: Optional upper bound on the number of links reported
            by this call. ``None`` (the default) means no limit.

    Returns:
        None. Results are printed and ``visited`` is updated in place.
    """
    if max_pages is not None and max_pages <= 0:
        return

    reported = 0
    # Each entry is an iterator over the links of a page that is still
    # being worked through; the top of the stack is the deepest page.
    pending = [iter(get_links(url))]
    while pending:
        for link in pending[-1]:
            if link in visited:
                continue
            visited.add(link)
            info = extract_info(link)

            print(
                f"Link: {link}    \n"
                f"Description: {info.get('description')}    \n"
                f"Keywords: {info.get('keywords')}    \n"
                "            "
            )
            reported += 1
            if max_pages is not None and reported >= max_pages:
                return
            # Descend into the new page before continuing with its siblings.
            pending.append(iter(get_links(link)))
            break
        else:
            # Every link of the current page has been handled; go back up.
            pending.pop()

### Now that we can extract all the links and their descriptions, it's time to display them in a readable way. To do this I have defined a function called crawl (code written above).
### It records every link it visits in `visited` (the set defined at the start of this notebook) and prints each link along with its info. Because it checks `visited` before handling a link, it never processes the same link twice.

## FULL CODE

In [None]:
import requests
import re    
from urllib.parse import urlparse

# Links that crawl() has already handled; crawl() checks and updates this set so that no link is processed twice.
visited = set()

def get_page(url, timeout=5):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        The response body as ``str``, or ``''`` when the request fails.
        Failures are printed instead of raised so that a crawl can carry
        on with the next link.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError) as e:
        # Connection errors, timeouts and malformed URLs are expected when
        # following arbitrary links; report them and treat the page as empty.
        print(e)
        return ''

    body = response.content
    try:
        # Try UTF-8 first: decoding a UTF-8 page as latin-1 silently garbles
        # every non-ASCII character (e.g. in meta descriptions).
        return body.decode('utf-8')
    except UnicodeDecodeError:
        # latin-1 maps every possible byte to a character, so this fallback
        # can never fail.
        return body.decode('latin-1')

def get_links(url):
    """Return the set of absolute http(s) links found on the page at ``url``.

    The page is fetched with ``get_page`` and every double-quoted ``href``
    of an ``<a>`` tag is collected. Each href is resolved against ``url``,
    so relative paths such as ``page.html`` or ``../up`` become full URLs.
    Links whose scheme is not http or https (``mailto:``, ``javascript:``,
    ``tel:`` ...) are dropped, and ``#fragment`` parts are removed so the
    same page is not returned under several names.

    Args:
        url: Address of the page whose links should be extracted.

    Returns:
        A ``set`` of absolute URL strings; empty when the page could not
        be fetched or contains no links.
    """
    # Imported locally so this function does not depend on an extra
    # import being added to the notebook's import cell.
    from urllib.parse import urljoin

    html = get_page(url)
    # Raw string: ``\s`` inside a normal string literal is an invalid escape.
    hrefs = re.findall(r'''<a\s+(?:[^>]*?\s+)?href="([^"]*)"''', html)

    links = set()
    for href in hrefs:
        try:
            # urljoin handles relative, root-relative and absolute hrefs alike.
            target = urlparse(urljoin(url, href.strip()))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); nothing to crawl.
            continue
        if target.scheme not in ('http', 'https') or not target.netloc:
            continue
        links.add(target._replace(fragment='').geturl())
    return links

def extract_info(url):
    """Return the named ``<meta>`` tags of the page at ``url`` as a dict.

    The page is fetched with ``get_page`` and parsed with the standard
    library HTML parser, so the attribute order inside a tag, the quote
    style and apostrophes inside a value do not matter.

    Args:
        url: Address of the page to inspect.

    Returns:
        A dict mapping each meta ``name`` (lower-cased, so ``Description``
        and ``description`` are the same key) to its ``content``. When a
        name occurs more than once the last occurrence wins. The dict is
        empty when the page could not be fetched or has no such tags.
    """
    # Imported locally so this function does not depend on an extra
    # import being added to the notebook's import cell.
    from html.parser import HTMLParser

    class _MetaCollector(HTMLParser):
        """Collects name/content pairs from <meta> start tags."""

        def __init__(self):
            super().__init__()
            self.meta = {}

        def handle_starttag(self, tag, attrs):
            if tag != 'meta':
                return
            attributes = dict(attrs)
            name = attributes.get('name')
            content = attributes.get('content')
            # Skip tags such as <meta charset=...> that carry no name/content pair.
            if name and content is not None:
                self.meta[name.strip().lower()] = content

    collector = _MetaCollector()
    collector.feed(get_page(url))
    collector.close()
    return collector.meta

def crawl(url, max_pages=None):
    """Crawl outward from ``url`` and print every newly discovered link.

    For each link that is not yet in the module-level ``visited`` set,
    the link is recorded, its meta description and keywords are printed,
    and the links on that page are followed in turn (depth first).

    The traversal uses an explicit stack instead of recursion, so long
    chains of links cannot exceed Python's recursion limit.

    Args:
        url: Page whose links are the starting point of the crawl.
        max_pages: Optional upper bound on the number of links reported
            by this call. ``None`` (the default) means no limit.

    Returns:
        None. Results are printed and ``visited`` is updated in place.
    """
    if max_pages is not None and max_pages <= 0:
        return

    reported = 0
    # Each entry is an iterator over the links of a page that is still
    # being worked through; the top of the stack is the deepest page.
    pending = [iter(get_links(url))]
    while pending:
        for link in pending[-1]:
            if link in visited:
                continue
            visited.add(link)
            info = extract_info(link)

            print(
                f"Link: {link}    \n"
                f"Description: {info.get('description')}    \n"
                f"Keywords: {info.get('keywords')}    \n"
                "            "
            )
            reported += 1
            if max_pages is not None and reported >= max_pages:
                return
            # Descend into the new page before continuing with its siblings.
            pending.append(iter(get_links(link)))
            break
        else:
            # Every link of the current page has been handled; go back up.
            pending.pop()

## TESTING THE CRAWLER (RUN THE CODE BELOW TO TEST THE WEB CRAWLER)

In [None]:
# Start the crawl from this URL; crawl() prints one entry for every newly discovered link.
crawl('https://www.crawler-test.com/')

# Note: The code above crawls webpages using your own IP address.
# Stay connected if you want to learn how to make a web crawler using a proxy server and Python.