<a href="https://colab.research.google.com/github/Ilvecho/Web-Scraping/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrapy - web crawler

In [None]:
! pip install Scrapy

In [None]:
import numpy as np
import pandas as pd
import re
import scrapy
from scrapy.crawler import CrawlerProcess

from google.colab import files,drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Create the spider to crawl the desired URLs

In [None]:
class HrSpider(scrapy.Spider):
  name = 'hr_spider'

  def start_requests(self):
    urls = ['https://www.getimpactly.com/post/hr-compliance-checklist',
            'https://mariopeshev.com/4-ps-employee-relations-conflict-management/']

    for url in urls:
      yield scrapy.Request(url=url, callback=self.parse)

  def parse(self, response):
    output_text = ''
    # Extract the content of interest
    paragraphs = response.css('p, li::text, h2::text').extract()

    # Stitch together everything
    for content in paragraphs:
      output_text = output_text + '\n' + content

    # Do the parsing
    output_text = output_text.replace("<p>","")
    output_text = output_text.replace("</p>","")
    output_text = re.sub(r'<a href="[^"]*">', '', output_text)
    output_text = output_text.replace("</a>","")

    # Save the text in a file
    with open(r'/content/drive/MyDrive/hr_content.txt', 'w') as text_file:
      text_file.write(output_text)
      text_file.close()


In [None]:
process = CrawlerProcess()  # Look what Scrapy settings are # settings={'FEEDS': {'item.txt': {'format': 'txt'}}}
process.crawl(HrSpider)
process.start()

In [None]:
process.stop()

<DeferredList at 0x7a8ceb082fe0 current result: []>

# Beautiful Soup for HTML parsing

In [None]:
from bs4 import BeautifulSoup
import numpy as np
import re

In [None]:
def find_max_text_element(tag):
    max_text_length = 0
    max_text_element = None
    length_list = []

    for child in tag.find_all(recursive=False):
      # Find the direct children only, not nested elements
      text_length = len(child.get_text(strip=True))
      length_list.append(text_length)
      if text_length > max_text_length:
        max_text_length = text_length
        max_text_element = child

    length_list = np.array(length_list) / max_text_length
    print(length_list)
    length_list[np.where(length_list == 1)] = 0

    if (length_list > 0.10).any():

      # Keep only the meaningful elements
      for child in tag.find_all(recursive=False):
        text_length = len(child.get_text(strip=True))
        if (text_length / max_text_length) < 0.02:
          print(f"popped: {text_length} / {max_text_length} = {text_length / max_text_length}")
          child.extract()

      return max_text_element, True

    else:
      return max_text_element, False

In [None]:
def extract_main_content(html_content):
  soup = BeautifulSoup(html_content, 'html.parser')
  main_content = []

  current_tag = soup.body  # Start from the <body> tag

  # Remove unnecessary elements
  for child in current_tag.find_all(name=['script', 'template', 'figure', 'img', 'style', 'label', 'button', 'span']):
    child.extract()

  to_return = ''
  stop = False
  while not stop:
    max_text_element, stop = find_max_text_element(current_tag)

    # Move to the tag with the maximum text content
    current_tag = max_text_element

  return max_text_element.get_text()

In [None]:
# Example HTML content (replace this with your actual HTML content)
html_content = open(r'/content/drive/MyDrive/raw_body.txt', 'r')

main_content = extract_main_content(html_content)

main_content = re.sub(r'<a .*?>', '', main_content)
main_content = main_content.replace("</a>","")


with open(r'/content/drive/MyDrive/test_1.txt', 'w') as text_file:
  text_file.write(main_content)
  text_file.close()

[0.         1.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.01671429]
[0.00109786 0.01390617 1.         0.00966113]
[1.]
[1.]
[1.        0.1316051]
In if


# newspaper

God given library to scrape articles from the web

In [None]:
! pip install newspaper3k

In [29]:
from newspaper import Article
import re

# Replace the URL with the actual URL of the article you want to scrape
article_url = 'https://mariopeshev.com/4-ps-employee-relations-conflict-management/'

# Create an Article object and download the article
article = Article(article_url)
article.download()

# Parse the article content
article.parse()

# Get the output text and parse it with RegEx
output_text = article.text
# output_text = "test1 . test2! . \n\n test1 . test2!"

# Use a regular expression to find consecutive duplicate content
pattern = re.compile(r'(\b.*?\b[.!?])(?:[.\n\s]*)\1')

# Replace consecutive duplicate content with the first occurrence
result = re.sub(pattern, r'\1', output_text)

# print(result)

with open(r'/content/drive/MyDrive/test_2.txt', 'w') as text_file:
  text_file.write(result)
  text_file.close()