In [None]:
# Give Colab access to Google Drive: mount the Drive at /content/drive/ so that
# later cells can reach files stored under it. Only works inside Google Colab
# (the `google.colab` package is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os

# Switch into the project folder on the mounted Drive. An absolute path is used
# so the cell is idempotent: with a relative path, re-running the cell after the
# working directory has already changed would raise FileNotFoundError.
os.chdir('/content/drive/MyDrive/SearchEngine')
print(os.getcwd())

/content/drive/MyDrive/SearchEngine


In [None]:
import re

def NGram(words, N=3):
  """Return every run of N consecutive items of `words`, in order.

  Args:
    words: A sliceable sequence (e.g. a list of tokens).
    N: Size of each window; must satisfy 1 <= N <= len(words).

  Returns:
    A list of `len(words) - N + 1` slices of `words`, each of length N, or
    None (after printing 'Invalid N') when N is out of range.
  """
  # N < 1 is rejected too: N == 0 would yield a list of empty slices and a
  # negative N would yield meaningless slices.
  if N < 1 or N > len(words):
    print('Invalid N')
    return None
  return [words[i : i + N] for i in range(len(words) - N + 1)]

# Small demo of the tokenisation used for page content: keep only runs of word
# characters and re-join them with single spaces.
my_str = '\n\n\n This is 8 & some Text'
# Raw string so that `\w` reaches the regex engine as written; in a normal
# string literal `\w` is an invalid escape sequence and triggers a warning.
words = re.findall(r'\w+', my_str)
print(' '.join(words))


# print(NGram(words, 5))

This is 8 some Text


In [None]:
# Main Script

"""
Web Crawling

1. Start with a seed link (see `seed_website` below)
2. Visit the seed link
3. Parse the seed for:
- Links
- Text content
4. Store the Links into our database
5. Store text into our database

Notes:
Keep three JSON files: one for visited links, one for unvisited links (used as the work queue; the next link is picked from it at random), and one for the text content of each visited page.
"""

import os, errno
import json
import requests # Module for making HTTP requests
from bs4 import BeautifulSoup
from datetime import datetime
from timeout.timeout import timeout
import random

# Paths of the JSON files that persist the crawl state between runs
visited_file = 'json_files/visited.json'
unvisited_file = 'json_files/unvisited.json'
content_file = 'json_files/content.json'

# Load each file inside a `with` block so its handle is closed as soon as the
# JSON has been parsed, instead of staying open while save() rewrites the same
# file. The handle names (visited_json, unvisited_json, content_json) remain
# bound to the now-closed file objects; calling .close() on them again is a
# harmless no-op.

# 'visited.json' -> visited (link -> number of times it was encountered)
with open(visited_file, 'r') as visited_json:
    visited = json.load(visited_json)

# 'unvisited.json' -> unvisited (links still waiting to be crawled)
with open(unvisited_file, 'r') as unvisited_json:
    unvisited = json.load(unvisited_json)

# 'content.json' -> content (link -> extracted page text)
with open(content_file, 'r') as content_json:
    content = json.load(content_json)

def save(visited, unvisited, content):
  """Persist the crawl state to disk.

  Each mapping is serialised as 4-space-indented JSON and written, in order,
  to `visited_file`, `unvisited_file` and `content_file`, overwriting whatever
  those files held before.
  """
  targets = (
    (visited_file, visited),
    (unvisited_file, unvisited),
    (content_file, content),
  )
  for path, payload in targets:
    with open(path, 'w') as handle:
      handle.write(json.dumps(payload, indent=4))

# Entry point of the crawl; queued below when there is nothing left to visit
seed_website = 'https://finance.yahoo.com'

# Substrings that disqualify a link from being crawled.
# NOTE: 'yahoo' also matches the seed URL above.
blacklist = ['tiktok', 'instagram', 'image', 'yahoo', 'imdb']

# Per-domain visit counter (domain = "<something>" of "<something>.com")
# together with the cap applied to it
domains = dict()
MAX_DOMAIN_VISITS = 25

# An empty queue means there is nothing to resume from: start at the seed
if not unvisited:
    unvisited[seed_website] = 1

# Host-name suffixes the crawler recognises
extensions = ['.com', '.org', '.net']
def getDomain(website):
    """Return the host of `website` without a leading 'www.' and without its extension.

    Examples: 'https://www.gothamist.com/a/b' -> 'gothamist',
    'https://legend-quest.fandom.com/wiki' -> 'legend-quest.fandom'.

    Only the host part of the URL is inspected, so text in the path or query
    string (another URL, a file name such as 'bar.net') cannot influence the
    result. An extension is recognised only when it ends a host label, i.e. it
    is followed by '.' or the end of the host; this keeps '.net' from matching
    inside 'www.netflix.com'.

    Args:
        website: Absolute URL string.

    Returns:
        The domain string (lower-cased, as produced by `urlsplit().hostname`),
        or None when the scheme is not http/https, the URL has no host, the
        host contains none of `extensions`, or nothing precedes the extension.
    """
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(website)
        host = parts.hostname
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. a broken IPv6 literal)
        return None

    if parts.scheme not in ('http', 'https') or not host:
        return None

    # Strip 'www.' only as a prefix, never from the middle of a name
    if host.startswith('www.'):
        host = host[len('www.'):]

    # Earliest position in the host where a known extension ends a label
    cut = None
    for ext in extensions:
        start = host.find(ext)
        while start != -1:
            end = start + len(ext)
            if end == len(host) or host[end] == '.':
                if cut is None or start < cut:
                    cut = start
                break
            start = host.find(ext, start + 1)

    # cut is None -> no extension found; cut == 0 -> nothing before the extension
    if not cut:
        return None
    return host[:cut]

# No time limit is enforced inside this function; wrap the call (e.g. in a
# timeout context manager) to bound how long a single page may take.
def crawlAndParseHTML(website):
    """Fetch one page, queue its outgoing links and store its paragraph text.

    Reads and updates the module-level `visited`, `unvisited`, `content` and
    `domains` mappings.

    Args:
        website: Absolute URL of the page to fetch.

    Returns:
        'Already Visited Link' when `website` is already in `visited` (its
        counter is incremented and nothing is fetched);
        'Capped Domain Visits' when the page's domain has already been visited
        MAX_DOMAIN_VISITS times (nothing is fetched);
        None after the page has been fetched and processed.

    Raises:
        Any exception raised by `requests.get` or by parsing propagates to the
        caller.
    """
    # A non-null entry means the link was processed before: count it and stop
    seen_count = visited.get(website)
    if seen_count is not None:
        visited[website] = seen_count + 1
        return 'Already Visited Link'

    # Derive the domain from the URL being crawled rather than relying on a
    # global that happens to be set by the caller.
    domain = getDomain(website)

    # Count this attempt; allow exactly MAX_DOMAIN_VISITS fetches per domain so
    # the "visits remaining" message below reaches 0 on the last allowed one.
    domains[domain] = domains.get(domain, 0) + 1
    if domains[domain] > MAX_DOMAIN_VISITS:
        return 'Capped Domain Visits'
    print(f'{MAX_DOMAIN_VISITS - domains[domain]} visits remaining for {domain}')

    # Make HTTP GET request on the website
    response = requests.get(website)

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (results vary between machines) and emits a warning.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Queue every acceptable link found in an anchor tag
    for anchor in soup.find_all('a', href=True):
        link = anchor['href']

        # Skip empty hrefs
        if not link:
            continue

        # Remove trailing slash
        if link[-1] == '/':
            link = link[:-1]

        # Only absolute http(s) links that are not blacklisted
        if not link.startswith('http') or any(x in link for x in blacklist):
            continue

        # Check the cap against the LINK's domain (not the current page's).
        # Links without a recognisable domain are discarded by the crawl loop
        # anyway, so they are not queued at all.
        link_domain = getDomain(link)
        if not link_domain or domains.get(link_domain, 0) >= MAX_DOMAIN_VISITS:
            continue

        unvisited[link] = 1

    # Concatenate the text of all <p> tags, each preceded by a space
    raw_text = ''.join(' ' + paragraph.get_text() for paragraph in soup.find_all('p'))

    # Content is the space-delimited words of that text; if it contains no
    # word characters at all, the raw text is stored unchanged.
    words = re.findall(r'\w+', raw_text)
    content[website] = ' '.join(words) if words else raw_text

    # Initialize that we have visited this website for the first time
    visited[website] = 1

# Start of the crawl, used for the timing report at the end
time = datetime.now()

LINKS_TO_PROCESS = 5000   # maximum number of links to crawl in this run
CHECKPOINT = 100          # save progress every CHECKPOINT crawled links
try:
    while LINKS_TO_PROCESS > 0 and len(unvisited) > 0:

        # Number of remaining links to process
        print(LINKS_TO_PROCESS)

        # Pick the next link at random from the unvisited map. `raw_link` is
        # the exact key stored in `unvisited` and is what must be popped later.
        raw_link = random.choice(list(unvisited.keys()))

        # Discard blacklisted links. The seed is exempt: it matches the
        # blacklist ('yahoo'), and dropping it would leave a fresh crawl with
        # nothing to visit.
        if raw_link != seed_website and any(x in raw_link for x in blacklist):
            unvisited.pop(raw_link, None)
            continue

        # Discard links whose domain cannot be determined
        domain = getDomain(raw_link)
        if not domain:
            unvisited.pop(raw_link, None)
            continue

        print(f'Visiting: {raw_link} | domain: {domain}')

        # Crawl the link without its trailing slash
        next_link = raw_link[:-1] if raw_link.endswith('/') else raw_link

        # Best effort: a failing or slow page is reported and skipped
        try:
            with timeout(seconds=25):
                crawlAndParseHTML(next_link)
        except Exception as e:
            print(e)

        # Remove the link from the unvisited map under the key it was stored
        # with (popping only the stripped form would leave a key ending in '/'
        # in the map forever), and under its stripped form as well.
        unvisited.pop(raw_link, None)
        unvisited.pop(next_link, None)

        # One less link to visit
        LINKS_TO_PROCESS -= 1

        # Save progress whenever a checkpoint is hit
        if LINKS_TO_PROCESS % CHECKPOINT == 0:
            save(visited, unvisited, content)
finally:
    # Close the handles the JSON files were loaded from (closing a file that
    # is already closed is a no-op) ...
    visited_json.close()
    unvisited_json.close()
    content_json.close()
    # ... and always persist the latest state, so that work done since the
    # last checkpoint survives an interruption or a run that empties the queue.
    save(visited, unvisited, content)

print(f'Processing time: {(datetime.now() - time).total_seconds()}s')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1793
Visiting: https://www.gothamist.com/2019/06/18/belvedere_castle_2019.php | domain: gothamist
24 visits remaining for gothamist
1792
Visiting: https://legend-quest.fandom.com/wiki/England | domain: legend-quest.fandom
21 visits remaining for legend-quest.fandom
1791
Visiting: https://comicvine.gamespot.com/wd/4075-936 | domain: comicvine.gamespot
24 visits remaining for comicvine.gamespot
1790
Visiting: https://www.behindthevoiceactors.com/video-games/Triangle-Strategy | domain: behindthevoiceactors
1789
Visiting: https://www.madewell.com/womens/trends/fun-favorites-under-100?prefn1=isBackroom&prefv1=false&prefn2=lilyStyle&prefv2=Headband | domain: madewell
1788
Visiting: https://www.newsy-today.com/marco-borsato-has-already-punished-you-enough-wel-nl/#respond | domain: newsy-today
1787
Visiting: https://ar-ar.facebook.com/login/?next=https%3A%2F%2Fwww.facebook.com%2FLyndaBaquero4NY | domain: ar-ar.facebook
24 visits 