# Introduction

## Websites to be consumed

## A rationale for extracting the web content 

## Content coverage of the data extracted 

## Complexity of the content layout

## Website/data copyright considerations

## Metadata supplementation and rationale for the supplementation

## Content extractor to export the important aspects of the data and/or metadata 

## Demonstration of the application of the WebCrawler (i.e. screenshots)

## Methodology of processing, cleaning, and storing harvested data for NLP tasks

## Summary and visualisation of the harvested data

Preliminary EDA is acceptable in this section as well.

# Configuration

## Modules

In [None]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
from sklearn.pipeline import Pipeline
import pandas as pd
import re
import random
import os
import numpy as np
import time

## Constants

In [None]:
# --- Crawl targets and output location ---
# Index page whose anchors seed the crawl.
MAIN_URL = 'https://www.dailymail.co.uk/news/breaking_news/index.html'
DOMAIN = 'https://www.dailymail.co.uk'
# CSV file the scraped table is written to.
SCRAPE_OUTPUT_FILE = '/content/drive/MyDrive/MA5851_A3/scrape_results.csv'

# --- HTML element / attribute names used during extraction ---
ARTICLE_TAG = 'a'
URL_HTML_TAG = 'href'
TEXT_TAG = 'p'

# --- Link filters: prefixes to keep, suffixes to drop ---
URLS_START_WITH = ['https://www.dailymail.co.uk']
URLS_NOT_END_WITH = ['#video']

# --- Substrings deleted from the extracted page text ---
REMOVE_SUBSTRINGS = [
    "]", '"', "'", ".", ",", "[", "/", ">", "<",
]

# --- Seed handed to Set_All_Seeds ---
SEED = 42

## File Storage

In [None]:
# Mount Google Drive at /content/drive; SCRAPE_OUTPUT_FILE points beneath
# this mount point, so this cell must run before the results are saved.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Functions

### Utilities

In [None]:
def Set_All_Seeds(seed):
    """Seed the pseudo-random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    exports ``PYTHONHASHSEED`` to the environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # NOTE: PYTHONHASHSEED is read when an interpreter starts, so this only
    # influences processes launched after this call, not the running one.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # A throwaway np.random.RandomState(seed) instance seeds nothing global,
    # so none is created here.

### HTML Element Selection

In [None]:
def Get_Soup(url: str, timeout: float = 30):
  """Download *url* and parse the response body with ``html.parser``.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up. Defaults to 30.

  Returns:
    A ``BeautifulSoup`` object built from the response text.

  Raises:
    requests.exceptions.RequestException: If the request fails or times out.
  """
  # requests has no default timeout; without one a single stalled connection
  # would block the whole crawl indefinitely.
  response = requests.get(url, timeout=timeout)
  return BeautifulSoup(response.text, 'html.parser')

In [None]:
def Get_Links(soup: BeautifulSoup, find_tag: str, get_tag: str):
  """Read one attribute from every ``find_tag`` element in *soup*.

  Args:
    soup: Parsed document to search.
    find_tag: Element name passed to ``soup.find_all``.
    get_tag: Attribute name looked up on each element with ``.get``.

  Returns:
    A list with one entry per matching element, in document order, holding
    whatever ``element.get(get_tag)`` returned for it.
  """
  return [element.get(get_tag) for element in soup.find_all(find_tag)]

In [None]:
def Get_Content(soup: BeautifulSoup, tag: str):
  """Gather the ``.contents`` of every *tag* element in *soup*.

  Args:
    soup: Parsed document to search.
    tag: Element name passed to ``soup.find_all``.

  Returns:
    A list with one entry per matching element, in document order; each
    entry is that element's ``.contents`` value.
  """
  return [element.contents for element in soup.find_all(tag)]

### Link Selection

In [None]:
def Select_Links_Starts_With(links: list, stem: str):
  """Return the string entries of *links* that begin with *stem*.

  Non-string entries are ignored. Order and duplicates are preserved.
  """
  return [
      candidate
      for candidate in links
      if isinstance(candidate, str) and candidate.startswith(stem)
  ]

def Select_Links_Ends_With(links: list, stem: str):
  """Return the string entries of *links* that finish with *stem*.

  Non-string entries are ignored. Order and duplicates are preserved.
  """
  return [
      candidate
      for candidate in links
      if isinstance(candidate, str) and candidate.endswith(stem)
  ]

def Remove_Links(func, links: list, stems: list):
  """Drop every link that *func* selects for any of *stems*.

  Args:
    func: Selector called as ``func(links=..., stem=...)``; returns the
      links to discard for that stem.
    links: Candidate links (entries must be hashable).
    stems: Stems handed to *func* one at a time.

  Returns:
    The surviving links, de-duplicated, in order of first appearance. When
    *stems* is empty, *links* is returned unchanged.
  """
  for s in stems:
    rejected = set(func(links = links, stem=s))
    # dict.fromkeys de-duplicates while keeping first-seen order. A plain
    # set difference would scramble the order between runs, which makes
    # positional access such as URLs[1] non-reproducible.
    links = [link for link in dict.fromkeys(links) if link not in rejected]
  return links

def Append_Links(func, links: list, stems: list):
  """Collect the links that *func* selects for any of *stems*.

  Args:
    func: Selector called as ``func(links=..., stem=...)``; returns the
      links matching that stem.
    links: Candidate links.
    stems: Stems handed to *func* one at a time.

  Returns:
    The matches for the first stem exactly as *func* returned them, followed
    by the matches of each later stem that no earlier stem already selected.
    An empty list when *stems* is empty.
  """
  results = []
  already_selected = set()
  for s in stems:
    delta = func(links = links, stem=s)
    # Previously only the first stem's matches were returned and an empty
    # stems list raised IndexError; now every stem contributes.
    results.extend(link for link in delta if link not in already_selected)
    already_selected.update(delta)
  return results

### Text Processing

In [None]:
def Remove_Sub_Strings(string: str, remove: list):
  """Delete every occurrence of each substring in *remove* from *string*.

  Args:
    string: Text to clean.
    remove: Substrings to delete, applied in the given order.

  Returns:
    The cleaned text.

  Raises:
    TypeError: If *string* is not a ``str``.
  """
  # Validate once with a real exception: an assert is stripped under
  # ``python -O`` and was skipped entirely when *remove* was empty.
  if not isinstance(string, str):
    raise TypeError(f"string must be str, got {type(string).__name__}")
  for fragment in remove:
    string = string.replace(fragment, "")
  return string

def Clean_String(s: str):
  """Strip HTML tags from *s*, then remove standalone ``FIRST`` tokens.

  Args:
    s: Text that may contain HTML markup.

  Returns:
    The text with every ``<...>`` tag removed and every ``FIRST`` token
    deleted together with the whitespace around it, so the words on either
    side of the token end up joined.
  """
  # The input used to be overwritten with the compiled pattern itself, so the
  # following re.sub always raised TypeError; apply the pattern to the text.
  s = re.sub(r'<[^>]+>', '', s)
  # Raw string avoids the invalid-escape warning for ``\s``.
  return re.sub(r'(^|\s+)FIRST($|\s+)', '', s)

def Get_Text_From_Page(url: str):
  """Fetch *url* and return the text of its ``TEXT_TAG`` elements.

  Args:
    url: Address of the page to scrape.

  Returns:
    One string: the text of each ``TEXT_TAG`` element joined by single
    spaces, with every ``REMOVE_SUBSTRINGS`` entry deleted.
  """
  paragraphs = []
  for children in Get_Content(soup = Get_Soup(url), tag=TEXT_TAG):
    # str() of the nested contents lists serialised child tags as markup, so
    # attribute text (class=..., href=...) and escaped characters such as
    # "\xa0" leaked into the output. Keep text nodes as they are and take
    # only the text of nested tags.
    pieces = [
        child if isinstance(child, str) else child.get_text()
        for child in children
    ]
    paragraphs.append("".join(pieces))
  return Remove_Sub_Strings(" ".join(paragraphs), remove = REMOVE_SUBSTRINGS)

# Execute Pipeline

In [None]:
# Seed the random generators, then start the wall-clock timer that the final
# cell uses to compute execution_time.
Set_All_Seeds(SEED)
start_time = time.time()

In [None]:
# Harvest every anchor href from the index page, keep the ones under the
# allowed prefixes, then discard those with unwanted suffixes.
index_soup = Get_Soup(MAIN_URL)
all_hrefs = Get_Links(soup = index_soup, find_tag = ARTICLE_TAG, get_tag = URL_HTML_TAG)
site_links = Append_Links(func = Select_Links_Starts_With, links = all_hrefs, stems = URLS_START_WITH)
URLs = Remove_Links(func = Select_Links_Ends_With, links = site_links, stems = URLS_NOT_END_WITH)

In [None]:
# Spot-check the text extraction on a single harvested link.
Get_Text_From_Page(URLs[1])

'By  a class=author href=homesearchhtml?s=&amp;authornamef=Tommy+Taylor rel=nofollowTommy Taylora  and  a class=author href=homesearchhtml?s=&amp;authornamef=Ronny+Reyes+For+DailymailCom rel=nofollowRonny Reyes For DailymailComa   span class=article-timestamp article-timestamp-published span class=article-timestamp-labelPublished:span time datetime=2021-11-18T19:35:17+0000 19:35 GMT 18 November 2021 time span  |  span class=article-timestamp article-timestamp-updated span class=article-timestamp-labelUpdated:span time datetime=2021-11-19T04:50:56+0000 04:50 GMT 19 November 2021 time span     89 View  br  comments  A terminated Maryland cop his suspended police accomplice and his two daughters who he kidnapped were all found dead inside a crashed vehicle in an apparent murder-suicide after a five-day manhunt on Thursday police said\\xa0\\xa0\\xa0 Robert Vicosa 42 had taken his daughters Aminah 6 and Giana 7 from their Windsor Pennsylvania home on Sunday He was accompanied by\\xa0Sgt Tin

In [None]:
# Extract the cleaned page text for every harvested URL (one request each).
bag_of_words = [Get_Text_From_Page(page_url) for page_url in URLs]


In [None]:
# Re-fetch each page and take the first child of its <title> element.
titles = [Get_Soup(page_url).find("title").contents[0] for page_url in URLs]

In [None]:
# Assemble one row per URL, drop exact duplicate rows, and save to CSV.
scraped_columns = {
    "URLS": URLs,
    "Bag Of Words": bag_of_words,
    "Title": titles,
}
output = pd.DataFrame(scraped_columns)
output = output.drop_duplicates()
output.to_csv(SCRAPE_OUTPUT_FILE)

In [None]:
# Number of rows left after duplicate removal.
len(output)


In [None]:
# Wall-clock seconds elapsed since start_time was recorded.
execution_time = time.time() - start_time
execution_time 

156.2861065864563

## Output Profiling