In [1]:
#Get Neo4J official Python driver
!pip install neo4j



In [1]:
def connect(uri, username, password):
    try:
        return GraphDatabase.driver(uri, auth=(username, password))
    except ClientError as e:
        print(e)
        return -1

In [46]:
def create_node(tx, url, subpages, metadata):
    tx.run("MERGE (main:Url)"
            "ON CREATE SET main = $properties, main.url = $url "
           , url=url, properties=metadata)
    for subpage in subpages:
        tx.run("CREATE (s:Url {url: $url})", url=subpage)

In [39]:
def add_constraint(tx):
    tx.run("CREATE CONSTRAINT ON (u:Url)ASSERT u.url IS UNIQUE") 

In [3]:
def add_user(tx, username, password):
    try:
        result = tx.run("CALL dbms.security.createUser($username,$password, false)", username=username, password=password)
    except ClientError as e:
        print(e)

In [4]:
def change_password(tx, password):
    try:
        result = tx.run("CALL dbms.security.changePassword($password)", password=password)
    except ClientError as e:
        print(e)

In [5]:
def create_metadata(soup):
    title = soup.find("meta",  property="og:title")
    desc = soup.find("meta",  property="og:description")
    language = soup.find("meta",  property="og:locale")
    keywords = soup.find("meta", attrs = {'name':"keywords"})
    metadata = {
        'Title': title['content'] if title else None,
        'Description': desc['content']if desc else None,
        'Language': language['content'] if language else None,
        'Keywords': keywords['content'] if keywords else None}
    return metadata

In [6]:
#Get Requests
!pip3 install beautifulsoup4

[33mYou are using pip version 18.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [7]:
#Simple get example
import requests

r = requests.get('https://www.agh.edu.pl/')
r.status_code
r.headers['content-type']
r.encoding
# r.text holds html file

'utf-8'

In [51]:
#Crawler workflow
import requests, re
from bs4 import BeautifulSoup

queue_limit = 10
visited = set()
to_visit = ['https://neo4j.com/']
crawled_host = 'https://neo4j.com'

results = []

while len(to_visit) > 0 and len(results) < queue_limit :

    crawled_url = to_visit.pop()
    urls = set()

    if crawled_url not in visited:
        r = requests.get(crawled_url)
        visited.add(crawled_url)

        if r.status_code == 200:
            soup = BeautifulSoup(r.text)
            
            for a in soup.find_all('a', href=True):
                current_path = a['href']
            
                if current_path.startswith('/') and not '#' in current_path:
                    current_url = crawled_host + current_path
                
                    if current_url not in visited:
                        urls.add(current_url)
                        to_visit.append(current_url)
                        # print("Found the URL:", current_url)
            
            metadata = create_metadata(soup)
            result = (crawled_url, list(urls), metadata)
            results.append(result)
        else:
            print(r.status_code)

In [52]:
results[0]

('https://neo4j.com/',
 ['https://neo4j.com/startup-program/',
  'https://neo4j.com/developer/data-modeling/',
  'https://neo4j.com/terms/',
  'https://neo4j.com/blog/',
  'https://neo4j.com/whitepapers/rdbms-developers-graph-databases-ebook/?ref=home-2',
  'https://neo4j.com/sandbox-v2/?ref=hcard',
  'https://neo4j.com/graphacademy/',
  'https://neo4j.com/licensing/',
  'https://neo4j.com/industries/financial-services/',
  'https://neo4j.com/startup-program/?ref=developers',
  'https://neo4j.com/subscriptions/',
  'https://neo4j.com/whitepapers/sustainable-competitive-advantage-graph-databases/?ref=home',
  'https://neo4j.com/download/',
  'https://neo4j.com/events',
  'https://neo4j.com/graphacademy/neo4j-certification/',
  'https://neo4j.com/use-cases/social-network/',
  'https://neo4j.com/download-center/',
  'https://neo4j.com/industries/retail/',
  'https://neo4j.com/developer/graph-db-vs-rdbms/',
  'https://neo4j.com/community/',
  'https://neo4j.com/docs/',
  'https://neo4j.com

In [None]:
#Neo4J connection test (Neo4j must be running on your localhost)
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError

uri = "bolt://localhost:7687"
our_username = "ads"
our_password = "ads"
message = "Response"
default = "neo4j"

# Logging with default user and password
driver = connect(uri, default, default)
# If we have already run this before, then the password is already changed
if driver != -1:
    ''' But if it is the first time we log as that default user, 
        we are forced to change the password '''
    with driver.session() as session:
        session.write_transaction(change_password, our_password) # e.g. our_password
    # We should exit and connect again after changing the password
    driver.close()

driver = connect(uri, default, our_password)

# Add the user that will be used for the rest of operations
with driver.session() as session:
    try: 
        session.write_transaction(add_user, our_username, our_password)
    except ClientError as e:
        print(e)
driver.close()


In [40]:
driver = connect(uri, our_username, our_password)
with driver.session() as session:
    session.write_transaction(add_constraint)
driver.close()

Failed to write data to connection Address(host='localhost', port=7687) (Address(host='127.0.0.1', port=7687)); ("0; 'Underlying socket connection gone (_ssl.c:2259)'")
Failed to write data to connection Address(host='localhost', port=7687) (Address(host='127.0.0.1', port=7687)); ("0; 'Underlying socket connection gone (_ssl.c:2259)'")


In [56]:
driver = connect(uri, our_username, our_password)
with driver.session() as session:
    session.write_transaction(create_node, results[0][0], results[0][1][:5], results[0][2])
driver.close()

ConstraintError: Node(1) already exists with label `Url` and property `url` = 'https://neo4j.com/startup-program/'

In [14]:
'''
driver = connect(uri, our_username, our_password)
with driver.session() as session:
    greeting = session.write_transaction(create_and_return_greeting, message)
    print(greeting) 
driver.close() '''

'''cqlCreate = "CREATE (neo4j:crawled_url{name:{}})".format(results[0][0])
driver = connect(uri, our_username, our_password)
with driver.session() as session:
    session.run(cqlCreate)'''

'cqlCreate = "CREATE (neo4j:crawled_url{name:{}})".format(results[0][0])\ndriver = connect(uri, our_username, our_password)\nwith driver.session() as session:\n    session.run(cqlCreate)'