# Crawler Project

### Firstly, we define some methods that will be used in the "Crawler" cell

In [None]:
def create_metadata(soup):
    """Collect a page's descriptive <meta> values into a dictionary.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A dict with the keys 'Title', 'Description', 'Language' and
        'Keywords'. Each value is the ``content`` attribute of the matching
        <meta> tag, or None when the tag is missing or has no ``content``.
    """
    def content_of(tag):
        # A <meta> tag can match the search and still carry no ``content``
        # attribute; .get() yields None there instead of raising KeyError.
        return tag.get('content') if tag is not None else None

    return {
        'Title': content_of(soup.find("meta", property="og:title")),
        'Description': content_of(soup.find("meta", property="og:description")),
        'Language': content_of(soup.find("meta", property="og:locale")),
        'Keywords': content_of(soup.find("meta", attrs={'name': "keywords"})),
    }

### We install this [library](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) because it helps us extract data from web pages

In [None]:
!pip3 install beautifulsoup4

In [None]:
#Crawler workflow
import requests, re
from bs4 import BeautifulSoup         
   
visited = set()
to_visit = ['https://neo4j.com']
crawled_host = 'https://neo4j.com'
results = []

# This limit can be changed depending on how many nodes we want in the results.
# With 100, the cell took about 90 seconds to run, but that depends on the
# internet connection.
queue_limit = 100

# Seconds to wait for each response, so that one stalled server cannot hang
# the whole crawl (requests waits indefinitely when no timeout is given).
request_timeout = 10


while to_visit and len(results) < queue_limit:

    # pop() takes the most recently queued URL, so the crawl goes depth-first
    crawled_url = to_visit.pop()

    if crawled_url in visited:
        continue
    # Marked before the request so that a failing URL is not retried
    visited.add(crawled_url)

    try:
        r = requests.get(crawled_url, timeout=request_timeout)
    except requests.RequestException as e:
        # A network failure on one page should not abort the whole crawl
        print(e)
        continue

    if r.status_code != 200:
        print(r.status_code)
        continue

    # Naming the parser keeps the parse tree the same on every machine and
    # avoids bs4's "no parser was explicitly specified" warning.
    soup = BeautifulSoup(r.text, 'html.parser')
    urls = set()

    for a in soup.find_all('a', href=True):
        current_path = a['href']

        # Keep only site-relative links. Protocol-relative links
        # ('//other.host/...') also start with '/', but they point to another
        # host, so prefixing them with crawled_host would build a bogus URL.
        if not current_path.startswith('/') or current_path.startswith('//'):
            continue
        # Links containing a fragment are skipped altogether
        if '#' in current_path:
            continue

        # Drop the query string and normalise to a trailing slash
        current_url = crawled_host + current_path.split('?', 1)[0]
        if not current_url.endswith('/'):
            current_url += '/'

        # NOTE: a link is recorded only when its target is neither visited
        # nor already queued, so links to known pages are left out of `urls`.
        if current_url not in visited and current_url not in to_visit:
            urls.add(current_url)
            to_visit.append(current_url)

    # Each result is a tuple: (page url, newly found links, page metadata)
    metadata = create_metadata(soup)
    result = (crawled_url, list(urls), metadata)
    results.append(result)


In [None]:
# This is what one of those tuples looks like
results[0]

### Now, we define some methods that will be used in the "Neo4j" cells

In [None]:
def connect(uri, username, password):
    """Create a Neo4j driver for ``uri`` using basic credentials.

    Args:
        uri: Address of the database passed to ``GraphDatabase.driver``.
        username: Name of the user to authenticate as.
        password: Password of that user.

    Returns:
        The driver object, or -1 when a ClientError is raised while creating
        it (the error is printed in that case).
    """
    credentials = (username, password)
    try:
        driver = GraphDatabase.driver(uri, auth=credentials)
    except ClientError as error:
        print(error)
        return -1
    return driver

In [None]:
def create_node(tx, tup):
    """Store one crawled page and its outgoing links through ``tx``.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        tup: Sequence whose first three items are the page url, the list of
            linked urls and the metadata dict; further items are ignored.

    The query merges a ``Url`` node for the page, replaces its properties with
    the metadata (setting ``url`` again afterwards), and merges a ``Url`` node
    plus a ``CONTAINS`` relationship for every linked url.
    """
    url, subpages, metadata = tup[:3]
    query = (
        "MERGE (main:Url {url: $url}) "
        "SET main = $properties, main.url = $url "
        "FOREACH (sub IN $subpages| "
        "MERGE (u:Url {url: sub})"
        "MERGE (main) -[:CONTAINS]-> (u)) "
    )
    parameters = {"url": url, "subpages": subpages, "properties": metadata}
    tx.run(query, **parameters)

In [None]:
def add_constraint(tx):
    """Run the statement that makes the ``url`` of ``Url`` nodes unique.

    Args:
        tx: Transaction-like object exposing ``run(query)``.
    """
    statement = "CREATE CONSTRAINT ON (u:Url) ASSERT u.url IS UNIQUE"
    tx.run(statement)

In [None]:
def add_user(tx, username, password):
    """Call ``dbms.security.createUser`` for the given credentials.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        username: Name of the user to create.
        password: Password for the new user.

    The procedure's third argument is the literal ``false`` (presumably the
    "require password change" flag; confirm against the Neo4j docs). A
    ClientError raised by the call is printed instead of propagated.
    """
    statement = "CALL dbms.security.createUser($username,$password, false)"
    credentials = {"username": username, "password": password}
    try:
        tx.run(statement, **credentials)
    except ClientError as error:
        print(error)

In [None]:
def change_password(tx, password):
    """Call ``dbms.security.changePassword`` with the new password.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        password: The password to switch to.

    A ClientError raised by the call is printed instead of propagated.
    """
    statement = "CALL dbms.security.changePassword($password)"
    try:
        tx.run(statement, password=password)
    except ClientError as error:
        print(error)

In [None]:
def delete_all(tx):
    """Delete every node, together with its relationships, through ``tx``.

    Args:
        tx: Transaction-like object exposing ``run(query)``.

    A ClientError raised by the call is printed instead of propagated.
    """
    statement = "MATCH (n) DETACH DELETE n"
    try:
        tx.run(statement)
    except ClientError as error:
        print(error)

### We install this driver to work with Neo4j

In [None]:
!pip install neo4j

### At this point, you should have started the Neo4j database on your localhost

In [None]:
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError

uri = "bolt://localhost:7687"
our_username = "ads"
our_password = "ads"
message = "Response"
default = "neo4j"

# Logging with default user and password
driver = connect(uri, default, default)
# If we have already run this before, then the password is already changed;
# connect() is then expected to have returned -1 and this step is skipped
if driver != -1:
    # But if it is the first time we log as that default user,
    # we are forced to change the password
    try:
        with driver.session() as session:
            session.write_transaction(change_password, our_password) # e.g. our_password
    finally:
        # We should exit and connect again after changing the password.
        # try/finally releases the driver even when the change fails.
        driver.close()

driver = connect(uri, default, our_password)

# Add the user that will be used for the rest of operations
try:
    with driver.session() as session:
        try:
            session.write_transaction(add_user, our_username, our_password)
        except ClientError as e:
            print(e)
finally:
    # Release the driver even if an unexpected error escapes the session
    driver.close()


In [None]:
# Add a constraint that makes the url of every node unique
driver = connect(uri, our_username, our_password)
try:
    with driver.session() as session:
        session.write_transaction(add_constraint)
finally:
    # Release the driver even when the transaction raises
    driver.close()

In [None]:
# Create the nodes of the results, one write transaction per crawled page
driver = connect(uri, our_username, our_password)
try:
    with driver.session() as session:
        for res in results:
            session.write_transaction(create_node, res)
finally:
    # Release the driver even when one of the transactions raises
    driver.close()

### Now to check the results:
1. Go to http://localhost:7474/ on your browser
2. Log in with:
    1. Username: ads
    2. Password: ads
3. Run this query on the field at the top of the page: 
##### MATCH (n) RETURN n

### After this, we are going to create a simpler list with results, in order to show that our script works when there are connections in both directions between two pages

In [None]:
# Delete all the nodes of the results
driver = connect(uri, our_username, our_password)
try:
    with driver.session() as session:
        session.write_transaction(delete_all)
finally:
    # Release the driver even when the transaction raises
    driver.close()

In [None]:
# Hand-written results, shaped like the crawler's (url, subpages, metadata)
# tuples, to try connections in both ways: url2 links to url4 and url4 links
# back to url2.
fake_results = [
    ('url1', ['url2', 'url3'], {'Title': 'title1'}),
    ('url2', ['url4', 'url5'], {'Title': 'title2'}),
    ('url4', ['url2', 'url5'], {'Title': 'title4'}),
]

In [None]:
fake_results

In [None]:
# Create the nodes of the fake results, one write transaction per entry
driver = connect(uri, our_username, our_password)
try:
    with driver.session() as session:
        for res in fake_results:
            session.write_transaction(create_node, res)
finally:
    # Release the driver even when one of the transactions raises
    driver.close()

### Now to check the results:
1. Go again to http://localhost:7474/ on your browser
2. If you are not logged in, do it like before
3. Run this query on the field at the top of the page: 
##### MATCH (n) RETURN n