In [4]:
#Get Neo4J official Python driver
!pip install neo4j

Collecting neo4j
[33m  Cache entry deserialization failed, entry ignored[0m
[33m  Cache entry deserialization failed, entry ignored[0m
  Downloading https://files.pythonhosted.org/packages/01/c0/5143e27477d5b7db2f9f9b784762ec65b56fbe42fe618e5adfac1ed7bb57/neo4j-1.7.4.tar.gz
[31mCould not import setuptools which is required to install from a source distribution.
Please install setuptools.[0m
[33mYou are using pip version 9.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
def connect(uri, username, password):
    """Open a Neo4j driver for ``uri`` using basic ``(username, password)`` auth.

    Returns:
        Whatever ``GraphDatabase.driver`` returns on success, or the sentinel
        ``-1`` when a ``ClientError`` is raised (the error is printed first).
    """
    credentials = (username, password)
    try:
        driver = GraphDatabase.driver(uri, auth=credentials)
    except ClientError as error:
        # Callers compare the return value against -1 to detect this case.
        print(error)
        return -1
    return driver

In [None]:
def create_and_return_greeting(tx, message):
    """Create a ``Greeting`` node holding ``message`` and return the query's value.

    A single Cypher statement is run through ``tx``: it creates the node, sets
    its ``message`` property and returns that message concatenated with
    ', from node ' and the node id. The first field of the single result
    record is handed back to the caller.
    """
    query = (
        "CREATE (a:Greeting) "
        "SET a.message = $message "
        "RETURN a.message + ', from node ' + id(a)"
    )
    record = tx.run(query, message=message).single()
    return record[0]

In [None]:
def add_user(tx, username, password):
    """Create a database user via the ``dbms.security.createUser`` procedure.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        username: Name of the user to create.
        password: Password for the new user.

    Returns:
        None. A ``ClientError`` raised by ``tx.run`` is printed, not re-raised.
    """
    try:
        # The third procedure argument is passed as the literal `false`.
        # NOTE(review): presumably the "require password change" flag - confirm
        # against the Neo4j procedure documentation.
        tx.run("CALL dbms.security.createUser($username,$password, false)", username=username, password=password)
    except ClientError as e:
        print(e)

In [None]:
def change_password(tx, password):
    """Call the ``dbms.security.changePassword`` procedure with ``password``.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        password: The new password passed to the procedure.

    Returns:
        None. A ``ClientError`` raised by ``tx.run`` is printed, not re-raised.
    """
    try:
        tx.run("CALL dbms.security.changePassword($password)", password=password)
    except ClientError as e:
        print(e)

In [8]:
def _meta_content(tag):
    """Return the ``content`` attribute of ``tag``, or None if the tag or attribute is missing."""
    if not tag:
        return None
    # .get() instead of ['content']: a <meta> tag that matched the search but
    # carries no content attribute yields None rather than raising KeyError.
    return tag.get('content')


def create_metadata(soup):
    """Collect page metadata from the ``<meta>`` tags of a parsed document.

    Args:
        soup: Parsed document exposing ``find(name, attrs=..., **kwargs)``
            (a BeautifulSoup object in this notebook).

    Returns:
        dict with the keys ``'Title'``, ``'Description'``, ``'Language'`` and
        ``'Keywords'``, taken from the ``og:title``, ``og:description`` and
        ``og:locale`` properties and the ``keywords`` meta name. A value is
        None when the tag, or its ``content`` attribute, is absent.
    """
    title = soup.find("meta", property="og:title")
    desc = soup.find("meta", property="og:description")
    language = soup.find("meta", property="og:locale")
    keywords = soup.find("meta", attrs={'name': "keywords"})
    return {
        'Title': _meta_content(title),
        'Description': _meta_content(desc),
        'Language': _meta_content(language),
        'Keywords': _meta_content(keywords),
    }

In [None]:
#Neo4J connection test
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError

uri = "bolt://localhost:7687"
# NOTE(review): credentials are hardcoded for this local demo; load them from
# the environment (or getpass) before sharing the notebook.
our_username = "ads"
our_password = "ads"
message = "Response"
default = "neo4j"

# Step 1: log in with the default user and the default password.
# connect() returns -1 when the login raises ClientError; per the original
# note, that is the case once the password was already changed by an earlier run.
driver = connect(uri, default, default)
if driver != -1:
    # On the first login as the default user we are forced to change the password.
    try:
        with driver.session() as session:
            session.write_transaction(change_password, our_password) # e.g. our_password
    finally:
        # We must disconnect and connect again after changing the password.
        driver.close()

# Step 2: reconnect as the default user with the new password and add the
# user that will be used for the rest of the operations.
driver = connect(uri, default, our_password)
if driver == -1:
    # Fail with a clear message instead of AttributeError on the -1 sentinel.
    raise RuntimeError("Could not connect to Neo4j as the default user")
try:
    with driver.session() as session:
        try:
            session.write_transaction(add_user, our_username, our_password)
        except ClientError as e:
            print(e)
finally:
    driver.close()

# Step 3: connect as the new user and run a write transaction as a smoke test.
driver = connect(uri, our_username, our_password)
if driver == -1:
    raise RuntimeError("Could not connect to Neo4j as the application user")
try:
    with driver.session() as session:
        greeting = session.write_transaction(create_and_return_greeting, message)
        print(greeting)
finally:
    driver.close()

In [3]:
#Get Requests
!pip3 install beautifulsoup4

Collecting beautifulsoup4
  Using cached https://files.pythonhosted.org/packages/1d/5d/3260694a59df0ec52f8b4883f5d23b130bc237602a1411fa670eae12351e/beautifulsoup4-4.7.1-py3-none-any.whl
Collecting soupsieve>=1.2 (from beautifulsoup4)
  Using cached https://files.pythonhosted.org/packages/b9/a5/7ea40d0f8676bde6e464a6435a48bc5db09b1a8f4f06d41dd997b8f3c616/soupsieve-1.9.1-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.7.1 soupsieve-1.9.1
[33mYou are using pip version 8.1.1, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [9]:
#Simple get example
import requests

# Without a timeout requests.get() can wait forever on an unresponsive server.
r = requests.get('https://www.agh.edu.pl/', timeout=10)
# Only the last expression of a cell is echoed, so of the three attribute
# lookups below just r.encoding shows up in the cell output.
r.status_code
r.headers['content-type']
r.encoding
# r.text holds html file
# r.text holds html file

'utf-8'

In [11]:
#Crawler workflow
import requests, re
from bs4 import BeautifulSoup

queue_limit = 10          # stop after this many successfully crawled pages
REQUEST_TIMEOUT = 10      # seconds; without it a stalled server blocks the crawl
visited = set()
to_visit = ['https://neo4j.com/']
crawled_host = 'https://neo4j.com'

# One (url, outgoing same-host urls, metadata dict) tuple per crawled page.
results = []

while to_visit and len(results) < queue_limit:

    # list.pop() takes the most recently discovered URL (depth-first order).
    crawled_url = to_visit.pop()
    urls = []

    if crawled_url in visited:
        continue
    # Mark before fetching so a URL that fails is not retried over and over.
    visited.add(crawled_url)

    try:
        r = requests.get(crawled_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # A single unreachable page should not abort the whole crawl.
        print(e)
        continue

    if r.status_code != 200:
        print(r.status_code)
        continue

    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed, so results differ between machines (and it warns).
    soup = BeautifulSoup(r.text, 'html.parser')

    for a in soup.find_all('a', href=True):
        current_path = a['href']

        # Follow only host-relative paths. '//other.host/x' is protocol-relative
        # (a different host), and links containing '#' are skipped as before.
        is_host_relative = current_path.startswith('/') and not current_path.startswith('//')
        if not is_host_relative or '#' in current_path:
            continue

        current_url = crawled_host + current_path
        if current_url not in visited:
            urls.append(current_url)
            to_visit.append(current_url)

    metadata = create_metadata(soup)
    result = (crawled_url, urls, metadata)
    results.append(result)

print(results)
print(visited)
print(to_visit)

[('https://neo4j.com/', ['https://neo4j.com/news/', 'https://neo4j.com/product/', 'https://neo4j.com/graph-machine-learning-algorithms/', 'https://neo4j.com/use-cases/', 'https://neo4j.com/use-cases/', 'https://neo4j.com/use-cases/artificial-intelligence-analytics/', 'https://neo4j.com/use-cases/fraud-detection/', 'https://neo4j.com/use-cases/identity-and-access-management/', 'https://neo4j.com/use-cases/knowledge-graph', 'https://neo4j.com/use-cases/master-data-management/', 'https://neo4j.com/use-cases/network-and-it-operations/', 'https://neo4j.com/use-cases/real-time-recommendation-engine/', 'https://neo4j.com/use-cases/social-network/', 'https://neo4j.com/industries/retail/', 'https://neo4j.com/industries/financial-services/', 'https://neo4j.com/industries/retail/', 'https://neo4j.com/partners/oem-partner/', 'https://neo4j.com/use-cases/gdpr-compliance/', 'https://neo4j.com/business-edge/', 'https://neo4j.com/partners/', 'https://neo4j.com/partners/', 'https://neo4j.com/partners/'

In [4]:
# Fetch one page and look up the individual <meta> tags by hand.
# A timeout keeps the cell from hanging on an unresponsive server.
r = requests.get('https://www.coffeedesk.pl/blog/', timeout=10)
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers are installed on the machine.
soup = BeautifulSoup(r.text, 'html.parser')
title = soup.find("meta",  property="og:title")
desc = soup.find("meta",  property="og:description")
language = soup.find("meta",  property="og:locale")
keywords = soup.find("meta", attrs = {'name':"keywords"})
# Each lookup is None when the page has no such tag.
print (title['content'] if title else None)
print(desc['content']if desc else None)
print(language['content'] if language else None)
print(keywords['content'] if keywords else None)

CoffeeDesk - Blog - Najwięcej kawowych wieści w Polsce!
CoffeeDesk.pl to największy sklep z kawą specialty i akcesoriami w Polsce, a nasz blog doskonale uzupełnia cały asortyment o merytoryczną wiedzę kawową!
pl_PL
None


In [5]:
# Print the name / content / property attributes of every <meta> tag in `soup`;
# an attribute the tag does not have is printed as None.
META_FIELDS = (('name: ', 'name'), ('content:', 'content'), ('property: ', 'property'))
for meta_tag in soup.find_all('meta'):
    for label, attribute in META_FIELDS:
        print(label, meta_tag.get(attribute))

name:  None
content: None
property:  None
name:  viewport
content: width=device-width, initial-scale=1.0
property:  None
name:  description
content: CoffeeDesk.pl to największy sklep z kawą specialty i akcesoriami w Polsce, a nasz blog doskonale uzupełnia cały asortyment o merytoryczną wiedzę kawową!
property:  None
name:  None
content: pl_PL
property:  og:locale
name:  None
content: website
property:  og:type
name:  None
content: CoffeeDesk - Blog - Najwięcej kawowych wieści w Polsce!
property:  og:title
name:  None
content: CoffeeDesk.pl to największy sklep z kawą specialty i akcesoriami w Polsce, a nasz blog doskonale uzupełnia cały asortyment o merytoryczną wiedzę kawową!
property:  og:description
name:  None
content: https://www.coffeedesk.pl/blog/
property:  og:url
name:  None
content: Blog Coffeedesk.pl
property:  og:site_name
name:  twitter:card
content: summary
property:  None
name:  twitter:description
content: CoffeeDesk.pl to największy sklep z kawą specialty i akcesoriami 

In [6]:
# Build the metadata dict with the shared helper instead of repeating its body
# here; create_metadata() looks up the same four <meta> tags on `soup`.
metadata = create_metadata(soup)

In [7]:
# Show every <meta> element found in `soup` (echoed as the cell output).
soup.find_all('meta')

[<meta charset="utf-8"/>,
 <meta content="width=device-width, initial-scale=1.0" name="viewport">
 <link href="https://www.coffeedesk.pl/favicon-192x192.png" rel="shortcut icon"/>
 <link href="https://www.coffeedesk.pl/blog/xmlrpc.php" rel="pingback"/>
 <!-- head extras -->
 <link href="https://www.coffeedesk.pl/blog/wp-content/cache/autoptimize/css/autoptimize_f0ba74ac3ef3d6dcc8d86e9c4c8b3e19.css" media="all" rel="stylesheet" type="text/css"><title>CoffeeDesk - Blog - Najwięcej kawowych wieści w Polsce!</title>
 <link href="https://www.coffeedesk.pl/blog" hreflang="pl" rel="alternate">
 <link href="https://www.coffeedesk.com/blog/" hreflang="en" rel="alternate"/>
 <!-- This site is optimized with the Yoast SEO plugin v11.3 - https://yoast.com/wordpress/plugins/seo/ -->
 <meta content="CoffeeDesk.pl to największy sklep z kawą specialty i akcesoriami w Polsce, a nasz blog doskonale uzupełnia cały asortyment o merytoryczną wiedzę kawową!" name="description"/>
 <link href="https://www.cof