# Patents View

In [None]:
!pip install cssselect
!pip install requests_html
!pip install nest-asyncio
!pip install pypatent

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
# force_remount=True is passed so a previously mounted drive is mounted again.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import requests
import pypatent
import pandas as pd
import json
import urllib
import pandas as pd
from requests_html import HTML
from requests_html import HTMLSession
from requests_html import AsyncHTMLSession
import asyncio
import nest_asyncio
import secrets
from bs4 import BeautifulSoup
nest_asyncio.apply()

In [None]:
# PatentsView query, assembled from adjacent string literals so each part of
# the request is readable on its own line:
#   q - patents dated on/after 2006-01-01 with "hydrogen" in both abstract
#       and title, whose last-known assignee country is US
#   f - the fields to return for each patent
#   o - paging options (first page, 100 rows per page)
url = (
    'https://api.patentsview.org/patents/query'
    '?q={"_and":[{"_gte":{"patent_date":"2006-01-01"}},'
    '{"_text_any":{"patent_abstract":"hydrogen"}},'
    '{"_text_any":{"patent_title":"hydrogen"}},'
    '{"assignee_lastknown_country":"US"}]}'
    '&f=["patent_number", "patent_title", "patent_year", "patent_num_claims", "patent_abstract"]'
    '&o={"page":1,"per_page":100}'
)
response = requests.get(url, timeout=30)
# Show the HTTP status so a failed request is visible in the cell output.
print(response.status_code)

200


In [None]:
# Decode the JSON body of `response`; the bare name on the last line makes the
# notebook display the decoded object as the cell output.
data = response.json()
data

In [None]:
def get_source(url):
    """Return the source code for the provided URL.

    Args:
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html, or
            None when the request failed (the error is printed).
    """

    try:
        session = HTMLSession()
        # Bound the wait so a stalled server cannot hang the notebook cell.
        response = session.get(url, timeout=30)
        return response

    except requests.exceptions.RequestException as e:
        print(e)
        # Make the failure value explicit for callers.
        return None

def scrape_google(query):
    """Return the non-Google links found on a Google results page.

    Args:
        query (string): Search terms; URL-encoded before being sent.

    Returns:
        list: Absolute URLs from the results page, excluding links whose
            URL starts with one of the Google-owned prefixes below. Empty
            when the page could not be fetched.
    """

    query = urllib.parse.quote_plus(query)
    response = get_source("https://www.google.co.uk/search?q=" + query)

    # get_source() yields None when the request raised; report "no links"
    # instead of failing on the attribute access below.
    if response is None:
        return []

    google_domains = ('https://www.google.',
                      'https://google.',
                      'https://webcache.googleusercontent.',
                      'http://webcache.googleusercontent.',
                      'https://policies.google.',
                      'https://support.google.',
                      'https://maps.google.')

    # str.startswith accepts a tuple, so one pass filters every prefix.
    return [link for link in response.html.absolute_links
            if not link.startswith(google_domains)]

def get_results(query):
    """Fetch the Google results page for a search query.

    Args:
        query (string): Search terms; URL-encoded before being sent.

    Returns:
        response (object): Whatever get_source() returns for the search URL.
    """

    encoded_query = urllib.parse.quote_plus(query)
    search_url = "https://www.google.co.uk/search?q=" + encoded_query
    return get_source(search_url)

def parse_results(response):
    """Extract title, link and snippet from each result block on the page.

    Args:
        response (object): Response whose ``html`` attribute supports
            ``find(css_selector)``, such as the one from get_results().

    Returns:
        list: One dict per result with the keys 'title', 'link' and 'text'.
            Results without a title or a link are skipped; a result without
            a snippet gets an empty 'text'.
    """

    # NOTE(review): these selectors mirror the results-page markup at the time
    # of writing; if the markup changes they simply stop matching.
    css_identifier_result = ".tF2Cxc"
    css_identifier_title = "h3"
    css_identifier_link = ".yuRUbf a"
    css_identifier_text = ".VwiC3b"

    results = response.html.find(css_identifier_result)

    output = []

    for result in results:

        title = result.find(css_identifier_title, first=True)
        link = result.find(css_identifier_link, first=True)
        snippet = result.find(css_identifier_text, first=True)

        # find(..., first=True) gives None when nothing matches, so check
        # before dereferencing rather than aborting the whole parse.
        if title is None or link is None:
            continue

        href = link.attrs.get('href')
        if href is None:
            continue

        output.append({
            'title': title.text,
            'link': href,
            'text': snippet.text if snippet is not None else '',
        })

    return output

def google_search(query):
    """Search Google for a query and return the parsed results.

    Args:
        query (string): Search terms.

    Returns:
        list: The output of parse_results() for the fetched results page.
    """
    return parse_results(get_results(query))

In [None]:
# scrape_google("data science blogs")

# Run one search end to end; the bare name on the last line makes the
# notebook display the parsed results.
results = google_search("web scraping")
results

# CNN Search Common Crawl

In [None]:
!pip install cdx_toolkit

In [None]:
import cdx_toolkit
# Index client; source='cc' presumably selects the Common Crawl index
# (confirm against the cdx_toolkit documentation).
cdx = cdx_toolkit.CDXFetcher(source='cc')
# URL pattern handed to the index; the trailing * looks like a wildcard for
# everything under this path (confirm against the cdx_toolkit documentation).
url='https://www.reddit.com/r/dataisbeautiful/*'
# Materialize the iterator into a list: up to 1000 entries between the
# 202002 and 202006 timestamps, filtered on status 200.
objs = list(cdx.iter(url, from_ts='202002', to='202006',
                     limit=1000, filter='=status:200'))
# Display the data of every entry as the cell output.
[o.data for o in objs]

In [None]:
import requests
import argparse
import time
import json
import io
import gzip
import csv
import codecs
from bs4 import BeautifulSoup
import sys

# Target domain: passed to search_domain() and also read as a global by
# extract_external_links() to decide which links count as external.
domain = 'cnn.com'

# Crawl identifiers to query; search_domain() substitutes each one into the
# "CC-MAIN-%s-index" URL. They appear to be year-week labels (TODO confirm).
index_list = ["2014-52","2015-06","2015-11","2015-14","2015-18","2015-22","2015-27"]

#
# Searches the Common Crawl Index for a domain.
#
def search_domain(domain):
    """Search the Common Crawl index for captures of a domain.

    Every crawl named in the module-level ``index_list`` is queried and the
    JSON records it returns are collected.

    Args:
        domain (string): Domain to look up, e.g. 'cnn.com'.

    Returns:
        list: Parsed index records (dicts) from every crawl that answered
            with HTTP 200. Crawls that fail or answer with another status
            contribute nothing.
    """

    record_list = []

    print("[*] Trying target domain: %s" % domain)

    for index in index_list:

        print("[*] Trying index %s" % index)

        cc_url  = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % index
        cc_url += "url=%s&matchType=domain&output=json" % domain

        try:
            # A timeout keeps one unresponsive index from hanging the run.
            response = requests.get(cc_url, timeout=60)
        except requests.exceptions.RequestException as e:
            # One failing index should not discard the others' results.
            print(e)
            continue

        if response.status_code == 200:

            # One JSON document per line; ignore blank lines so json.loads
            # is never handed an empty payload.
            records = [line for line in response.content.splitlines()
                       if line.strip()]

            record_list.extend(json.loads(record) for record in records)

            print("[*] Added %d results." % len(records))


    print("[*] Found a total of %d hits." % len(record_list))

    return record_list

#
# Downloads a page from Common Crawl - adapted graciously from @Smerity - thanks man!
# https://gist.github.com/Smerity/56bc6f21a8adec920ebf
#
def download_page(record):
    """Download one captured page from Common Crawl and return its body.

    Args:
        record (dict): Index record providing the 'filename', 'offset' and
            'length' keys, as produced by search_domain().

    Returns:
        string: The third section of the stored record (the part after the
            WARC header and the HTTP header), or "" when the record could
            not be fetched, decompressed or split into three sections.
    """

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1

    # We'll get the file via HTTPS so we don't need to worry about S3 credentials
    prefix = 'https://data.commoncrawl.org/'

    # We can then use the Range header to ask for just this set of bytes
    resp = requests.get(prefix + record['filename'],
                        headers={'Range': 'bytes={}-{}'.format(offset, offset_end)},
                        timeout=30)

    if resp.status_code not in (200, 206):
        print(f"{resp.status_code}: {prefix + record['filename']}")
        return ""

    # The page is stored compressed (gzip) to save space. The payload is
    # binary, so it must be decompressed as bytes before any text decoding.
    try:
        raw_data = gzip.decompress(resp.content)
    except Exception as e:
        print(e)
        return ""

    # Decode leniently: pages are not guaranteed to be valid UTF-8.
    data = raw_data.decode('utf-8', errors='replace')

    # What we have now is just the WARC response, formatted:
    #   WARC header, blank line, HTTP header, blank line, body
    sections = data.strip().split('\r\n\r\n', 2)
    if len(sections) != 3:
        return ""

    return sections[2]

#
# Extract links from the HTML
#
def extract_external_links(html_content,link_list):
    """Collect links in an HTML document that point outside ``domain``.

    Args:
        html_content (string): HTML markup to scan for <a> tags.
        link_list (list): Links found so far; new links are appended to
            this list in place.

    Returns:
        list: The same ``link_list`` object, extended with every href that
            starts with "http", is not already present, and does not
            contain the module-level ``domain``.
    """

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    parser = BeautifulSoup(html_content, "html.parser")

    # Set copy of the known links for O(1) duplicate checks.
    seen = set(link_list)

    for link in parser.find_all("a"):
        href = link.attrs.get("href")

        if href is None:
            continue

        # NOTE: substring test - any URL that merely mentions the domain
        # (e.g. in a query string) is treated as internal.
        if domain in href:
            continue

        if href.startswith("http") and href not in seen:
            print("[*] Discovered external link: %s" % href)
            seen.add(href)
            link_list.append(href)

    return link_list




In [None]:
# Query every index in index_list for the target domain and keep the records.
record_list = search_domain(domain)

In [None]:
# Walk every index record, download the archived page and accumulate the
# external links it contains.
link_list   = []

for record in record_list:

    html_content = download_page(record)

    print("[*] Retrieved %d bytes for %s" % (len(html_content),record['url']))

    link_list = extract_external_links(html_content,link_list)


print("[*] Total external links discovered: %d" % len(link_list))

# The csv module expects a text-mode file opened with newline="" so it can
# control line endings itself; this replaces the legacy codecs.open call.
with open("%s-links.csv" % domain, "w", newline="", encoding="utf-8") as output:

    fields = ["URL"]

    logger = csv.DictWriter(output,fieldnames=fields)
    logger.writeheader()

    for link in link_list:
        logger.writerow({"URL":link})

# Crawler Common

In [None]:
import requests
import argparse
import time
import json
import io
import gzip
import csv
import codecs
from bs4 import BeautifulSoup
import sys


In [None]:
def search_domain(domain):
    """Look up a domain in each Common Crawl index named in ``index_list``.

    Args:
        domain (string): Domain to search for, e.g. 'cnn.com'.

    Returns:
        list: Parsed index records (dicts) gathered from every index that
            answered with HTTP 200; indexes that fail or answer with
            another status are skipped.
    """

    record_list = []

    print("[*] Trying target domain: %s" % domain)

    for index in index_list:

        print("[*] Trying index %s" % index)

        cc_url  = "http://index.commoncrawl.org/CC-MAIN-%s-index?" % index
        cc_url += "url=%s&matchType=domain&output=json" % domain

        try:
            # Without a timeout a stalled index server blocks the cell forever.
            response = requests.get(cc_url, timeout=60)
        except requests.exceptions.RequestException as e:
            # Keep going: the remaining indexes may still answer.
            print(e)
            continue

        if response.status_code == 200:

            # The body holds one JSON document per line; blank lines are
            # dropped because json.loads rejects empty input.
            records = [line for line in response.content.splitlines()
                       if line.strip()]

            record_list.extend(json.loads(record) for record in records)

            print("[*] Added %d results." % len(records))


    print("[*] Found a total of %d hits." % len(record_list))

    return record_list

In [None]:
def download_page(record):
    """Download the archived page described by an index record.

    Args:
        record (dict): Index record providing the 'filename', 'offset' and
            'length' keys, as produced by search_domain().

    Returns:
        string: The third section of the stored record (the part after the
            WARC header and the HTTP header); "" when the record does not
            split into three sections; None when the download or the
            decompression failed (the reason is printed).
    """

    offset, length = int(record['offset']), int(record['length'])
    offset_end = offset + length - 1

    prefix = 'https://data.commoncrawl.org/'

    # The offset/length belong to this record's own file, so the file name
    # must come from `record` as well, not from the global record_list.
    file_url = prefix + record['filename']

    # Range request: fetch only the bytes of this one capture.
    resp = requests.get(file_url,
                        headers={'Range': 'bytes={}-{}'.format(offset,
                                                               offset_end)},
                        timeout=30)

    if resp.status_code not in (200, 206):
        print(f"{resp.status_code}: {file_url}")
        return

    data = resp.content
    try:
        raw_data = gzip.decompress(data)
    except Exception as e:
        print(e)
        return

    # Decode to real text. str(bytes) would yield the repr ("b'...'" with
    # escaped newlines and \x sequences), which corrupts the page content.
    text = raw_data.decode('utf-8', errors='replace')

    # Layout: WARC header, blank line, HTTP header, blank line, body.
    sections = text.strip().split('\r\n\r\n', 2)
    if len(sections) != 3:
        return ""

    return sections[2]



In [None]:
def extract_external_links(html_content,link_list):
    """Append the external links found in an HTML document to ``link_list``.

    Args:
        html_content (string): HTML markup to scan for <a> tags.
        link_list (list): Accumulator of links found so far; it is
            extended in place.

    Returns:
        list: The same ``link_list`` object. A link is added when its href
            starts with "http", does not contain the module-level
            ``domain``, and is not already in the list.
    """

    # An explicit parser keeps the result independent of which optional
    # parser libraries are installed.
    parser = BeautifulSoup(html_content, "html.parser")

    # Track known links in a set so each duplicate check is O(1).
    seen = set(link_list)

    for link in parser.find_all("a"):
        href = link.attrs.get("href")

        if href is None:
            continue

        # NOTE: this is a substring test, so a URL that only mentions the
        # domain somewhere (e.g. in its query string) counts as internal.
        if domain in href:
            continue

        if href.startswith("http") and href not in seen:
            print("[*] Discovered external link: %s" % href)
            seen.add(href)
            link_list.append(href)

    return link_list


In [None]:
# Target domain: given to search_domain() and read as a global by
# extract_external_links() when deciding whether a link is external.
domain = 'cnn.com'

# Crawl identifiers that search_domain() substitutes into the
# "CC-MAIN-%s-index" URL, one request per entry.
index_list = ["2014-52","2015-06","2015-11","2015-14","2015-18","2015-22","2015-27"]

In [None]:
# Collect the index records for the target domain from every listed index.
record_list = search_domain(domain)

In [None]:
# Derive the WET file path from the first record's WARC path by swapping the
# directory segment and the file extension. Assumes record_list is non-empty.
wet_url = record_list[0]['filename'].replace('/warc/', '/wet/').replace('warc.gz', 'warc.wet.gz')

In [None]:
# Download the whole WET file (no Range header here); the timeout stops a
# stalled connection from hanging the cell.
resp = requests.get('https://data.commoncrawl.org/' + wet_url, timeout=30)
print(resp.status_code)

200


In [None]:
# Decompress the downloaded gzip payload.
# NOTE: on failure only the error is printed, so raw_data stays undefined (or
# keeps a value from an earlier run) for the cells that follow.
data = resp.content
try:
    raw_data = gzip.decompress(data)
except Exception as e:
    print(e)

In [None]:
# Split the decompressed text at the first two blank lines. The bytes are
# decoded first: str(raw_data) would yield the repr ("b'...'" with escaped
# newlines and \x sequences) rather than the actual text.
wet, header, response = raw_data.decode('utf-8', errors='replace').strip().split('\r\n\r\n', 2)

In [None]:
# Save the extracted text to Drive. The encoding is stated explicitly so the
# file does not depend on the runtime's locale default.
with open('/content/drive/MyDrive/WhisperVideo/test.txt', 'w', encoding='utf-8') as f:
    f.write(response)

In [None]:
# Trial run: harvest external links from the first index record only.
link_list = []

for idx, record in enumerate(record_list):

    # Stop as soon as the first record has been handled.
    if idx > 0:
        break

    html_content = download_page(record)

    # download_page() signals a failed download or decompression with None.
    if html_content is not None:
        print("[*] Retrieved %d bytes for %s" % (len(html_content), record['url']))
        link_list = extract_external_links(html_content, link_list)



In [None]:
# Write the collected links to "<domain>-links.csv" with a single URL column.
# The csv module expects a text-mode file opened with newline="" so it can
# control line endings itself; this replaces the legacy codecs.open call.
with open("%s-links.csv" % domain, "w", newline="", encoding="utf-8") as output:

    fields = ["URL"]

    logger = csv.DictWriter(output,fieldnames=fields)
    logger.writeheader()

    for link in link_list:
        logger.writerow({"URL":link})