<a href="https://colab.research.google.com/github/Ilvecho/Web-Scraping/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scrapy - web crawler

In [None]:
! pip install Scrapy

In [None]:
import numpy as np
import pandas as pd
import re
import scrapy
from scrapy.crawler import CrawlerProcess

from google.colab import files,drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Create the spider to crawl the desired URLs

In [None]:
class HrSpider(scrapy.Spider):
  name = 'hr_spider'

  def start_requests(self):
    urls = ['https://www.getimpactly.com/post/hr-compliance-checklist',
            'https://mariopeshev.com/4-ps-employee-relations-conflict-management/']

    for url in urls:
      yield scrapy.Request(url=url, callback=self.parse)

  def parse(self, response):
    output_text = ''
    # Extract the content of interest
    paragraphs = response.css('p, li::text, h2::text').extract()

    # Stitch together everything
    for content in paragraphs:
      output_text = output_text + '\n' + content

    # Do the parsing
    output_text = output_text.replace("<p>","")
    output_text = output_text.replace("</p>","")
    output_text = re.sub(r'<a href="[^"]*">', '', output_text)
    output_text = output_text.replace("</a>","")

    # Save the text in a file
    with open(r'/content/drive/MyDrive/hr_content.txt', 'w') as text_file:
      text_file.write(output_text)
      text_file.close()


In [None]:
process = CrawlerProcess()  # Look what Scrapy settings are # settings={'FEEDS': {'item.txt': {'format': 'txt'}}}
process.crawl(HrSpider)
process.start()

In [None]:
process.stop()

<DeferredList at 0x7a8ceb082fe0 current result: []>

# Beautiful Soup for HTML parsing

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import re

In [2]:
def find_max_text_element(tag):
    max_text_length = 0
    max_text_element = None
    length_list = []

    for child in tag.find_all(recursive=False):
      # Find the direct children only, not nested elements
      text_length = len(child.get_text(strip=True))
      length_list.append(text_length)
      if text_length > max_text_length:
        max_text_length = text_length
        max_text_element = child

    length_list = np.array(length_list) / max_text_length
    # print(length_list)
    length_list[np.where(length_list == 1)] = 0

    if (length_list > 0.10).any():

      # Keep only the meaningful elements
      for child in tag.find_all(recursive=False):
        text_length = len(child.get_text(strip=True))
        if (text_length / max_text_length) < 0.02:
          # print(f"popped: {text_length} / {max_text_length} = {text_length / max_text_length}")
          child.extract()

      return max_text_element, True

    else:
      return max_text_element, False

In [3]:
def extract_main_content(html_content):
  soup = BeautifulSoup(html_content, 'html.parser')
  main_content = []

  current_tag = soup.body  # Start from the <body> tag

  # Remove unnecessary elements
  for child in current_tag.find_all(name=['script', 'template', 'figure', 'img', 'style', 'label', 'button', 'span']):
    child.extract()

  to_return = ''
  stop = False
  while not stop:
    max_text_element, stop = find_max_text_element(current_tag)

    # Move to the tag with the maximum text content
    current_tag = max_text_element

  return max_text_element.get_text()

In [None]:
# Example HTML content (replace this with your actual HTML content)
html_content = open(r'/content/drive/MyDrive/raw_body.txt', 'r')

main_content = extract_main_content(html_content)

main_content = re.sub(r'<a .*?>', '', main_content)
main_content = main_content.replace("</a>","")


with open(r'/content/drive/MyDrive/test_1.txt', 'w') as text_file:
  text_file.write(main_content)
  text_file.close()

[0.         1.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.01671429]
[0.00109786 0.01390617 1.         0.00966113]
[1.]
[1.]
[1.        0.1316051]
In if


# newspaper

God given library to scrape articles from the web

In [None]:
! pip install newspaper3k
! pip install thefuzz
! pip install requests

In [None]:
import numpy as np
from newspaper import Article
from bs4 import BeautifulSoup
import re
from thefuzz import fuzz
import requests

from google.colab import files,drive
drive.mount('/content/gdrive')

In [3]:
def find_max_text_element(tag):
    max_text_length = 0
    max_text_element = None
    length_list = []

    for child in tag.find_all(recursive=False):
      # Find the direct children only, not nested elements
      text_length = len(child.get_text(strip=True))
      length_list.append(text_length)
      if text_length > max_text_length:
        max_text_length = text_length
        max_text_element = child

    length_list = np.array(length_list) / max_text_length
    length_list[np.where(length_list == 1)] = 0

    if (length_list > 0.10).any():

      # Keep only the meaningful elements
      for child in tag.find_all(recursive=False):
        text_length = len(child.get_text(strip=True))
        if (text_length / max_text_length) < 0.02:
          child.extract()

      return max_text_element, True

    else:
      return max_text_element, False

In [4]:
def parse_url(url):
  r = requests.get(url.strip())

  soup = BeautifulSoup(r.text, 'html.parser')

  current_tag = soup.body  # Start from the <body> tag

  # Remove unnecessary elements
  for child in current_tag.find_all(name=['script', 'template', 'figure', 'img', 'style', 'label', 'button', 'span']):
    child.extract()

  stop = False
  while not stop:
    max_text_element, stop = find_max_text_element(current_tag)

    # Move to the tag with the maximum text content
    if max_text_element:
      current_tag = max_text_element
    else:
      return ''

  main_content = max_text_element.get_text()

  main_content = re.sub(r'<a .*?>', '', main_content)
  main_content = main_content.replace("</a>","")

  return main_content

In [8]:
# Using readlines()
url_file = open('/content/gdrive/MyDrive/Syllog/the_urls.txt', 'r')

count = 1
# Strips the newline character
for url in url_file.readlines():

  url = url.strip()

  if len(url) == 0:
    break

  print(f'{count}: {url}')

  if url.split('.')[-1] == 'pdf':
    with open('/content/gdrive/MyDrive/Syllog/failed_urls.txt', 'a') as file:
      file.write(url.strip() + '\n')
    count += 1
    continue

  try:
    # Create an Article object and download the article
    article = Article(url)
    article.download()

    print('Article downloaded')

    # Parse the article content
    article.parse()

    # Get the output text and parse it with RegEx
    output_text = article.text

    # Use a regular expression to find consecutive duplicate content
    pattern = re.compile(r'(\b.*?\b[.!?])(?:[.\n\s]*)\1')

    # Replace consecutive duplicate content with the first occurrence
    result = re.sub(pattern, r'\1', output_text)

    # Get the text length from the HTML and compare it with the extracted text

    score = fuzz.token_set_ratio(result, parse_url(url))
    print(score)

    if score > 80:

      with open(f'/content/gdrive/MyDrive/Syllog/transcripts/text_{count}.txt', 'w') as text_file:
        print('OK')
        text_file.write(result)
        text_file.close()
    else:
      with open('/content/gdrive/MyDrive/Syllog/failed_urls.txt', 'a') as file:
        print('ERROR')
        file.write(url.strip() + '\n')

  except:
    # If there is any issue in the above pipeline, note down the URL to be scraped manually
    with open('/content/gdrive/MyDrive/Syllog/failed_urls.txt', 'a') as file:
      file.write(url.strip() + '\n')

  print("############")
  count += 1

1: https://www.getimpactly.com/post/hr-compliance-checklist
Article downloaded
97
OK
############
2: https://www.dol.gov/agencies/oasam/centers-offices/civil-rights-center/internal/policies/DOL-Policy-Statement-on-Harassing-Conduct-in-the-Work
Article downloaded
100
OK
############
3: https://business.linkedin.com/talent-solutions/recruiting-tips/12-essential-recruiting-tips-and-tactics
Article downloaded
22
ERROR
############
4: https://www.linkedin.com/advice/1/what-some-tools-methods-you-use-conduct-job-analysis
Article downloaded
54
ERROR
############
5: https://www.coursehero.com/tutors-problems/Human-Resource-Management/26761067-Part-AIdentify-and-summarize-the-five-steps-involved-in-the-job-analy/
Article downloaded
0
ERROR
############
6: https://www.linkedin.com/pulse/top-5-talent-acquisition-strategies-2023-rchilli-inc-/
Article downloaded
96
OK
############
7: https://mashable.com/article/27-job-interview-questions-glassdoor
Article downloaded
95
OK
############
8: https://w

  length_list = np.array(length_list) / max_text_length


0
ERROR
############
66: https://www.personio.com/hr-lexicon/strategic-human-resource-management/
Article downloaded
99
OK
############
67: https://hbr.org/2022/07/its-time-to-reimagine-employee-retention
Article downloaded
59
ERROR
############
68: https://hbr.org/2021/10/how-companies-can-improve-employee-engagement-right-now
Article downloaded
100
OK
############
69: https://www.forbes.com/advisor/business/employee-retention-strategies/
Article downloaded
############
70: https://blog.workday.com/en-us/2022/secret-employee-retention-employee-engagement.html
Article downloaded
70
ERROR
############
71: https://knowledge.wharton.upenn.edu/article/motivation-up-attrition-down-employee-engagement/
Article downloaded
100
OK
############
72: https://www2.deloitte.com/us/en/insights/focus/human-capital-trends/2016/employee-engagement-and-retention.html
Article downloaded
100
OK
############
73: https://www.culturemonkey.io/employee-engagement/workplace-employee-engagement-and-retention/
Ar