<a href="https://colab.research.google.com/github/Ilvecho/Web-Scraping/blob/main/Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import os
import re
import plotly.graph_objects as go

from google.colab import userdata
from google.colab import files,drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Scrapy - web crawler

In [None]:
! pip install Scrapy

In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Create the spider to crawl the desired URLs

In [None]:
class HrSpider(scrapy.Spider):
  name = 'hr_spider'

  def start_requests(self):
    urls = ['https://www.getimpactly.com/post/hr-compliance-checklist',
            'https://mariopeshev.com/4-ps-employee-relations-conflict-management/']

    for url in urls:
      yield scrapy.Request(url=url, callback=self.parse)

  def parse(self, response):
    output_text = ''
    # Extract the content of interest
    paragraphs = response.css('p, li::text, h2::text').extract()

    # Stitch together everything
    for content in paragraphs:
      output_text = output_text + '\n' + content

    # Do the parsing
    output_text = output_text.replace("<p>","")
    output_text = output_text.replace("</p>","")
    output_text = re.sub(r'<a href="[^"]*">', '', output_text)
    output_text = output_text.replace("</a>","")

    # Save the text in a file
    with open(r'/content/drive/MyDrive/hr_content.txt', 'w') as text_file:
      text_file.write(output_text)
      text_file.close()


In [None]:
process = CrawlerProcess()  # Look what Scrapy settings are # settings={'FEEDS': {'item.txt': {'format': 'txt'}}}
process.crawl(HrSpider)
process.start()

In [None]:
process.stop()

<DeferredList at 0x7a8ceb082fe0 current result: []>

# Beautiful Soup for HTML parsing

In [None]:
from bs4 import BeautifulSoup

In [None]:
def find_max_text_element(tag):
    max_text_length = 0
    max_text_element = None
    length_list = []

    for child in tag.find_all(recursive=False):
      # Find the direct children only, not nested elements
      text_length = len(child.get_text(strip=True))
      length_list.append(text_length)
      if text_length > max_text_length:
        max_text_length = text_length
        max_text_element = child

    length_list = np.array(length_list) / max_text_length
    # print(length_list)
    length_list[np.where(length_list == 1)] = 0

    if (length_list > 0.10).any():

      # Keep only the meaningful elements
      for child in tag.find_all(recursive=False):
        text_length = len(child.get_text(strip=True))
        if (text_length / max_text_length) < 0.02:
          # print(f"popped: {text_length} / {max_text_length} = {text_length / max_text_length}")
          child.extract()

      return max_text_element, True

    else:
      return max_text_element, False

In [None]:
def extract_main_content(html_content):
  soup = BeautifulSoup(html_content, 'html.parser')
  main_content = []

  current_tag = soup.body  # Start from the <body> tag

  # Remove unnecessary elements
  for child in current_tag.find_all(name=['script', 'template', 'figure', 'img', 'style', 'label', 'button', 'span']):
    child.extract()

  to_return = ''
  stop = False
  while not stop:
    max_text_element, stop = find_max_text_element(current_tag)

    # Move to the tag with the maximum text content
    current_tag = max_text_element

  return max_text_element.get_text()

In [None]:
# Example HTML content (replace this with your actual HTML content)
html_content = open(r'/content/drive/MyDrive/raw_body.txt', 'r')

main_content = extract_main_content(html_content)

main_content = re.sub(r'<a .*?>', '', main_content)
main_content = main_content.replace("</a>","")


with open(r'/content/drive/MyDrive/test_1.txt', 'w') as text_file:
  text_file.write(main_content)
  text_file.close()

[0.         1.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.01671429]
[0.00109786 0.01390617 1.         0.00966113]
[1.]
[1.]
[1.        0.1316051]
In if


# newspaper

God given library to scrape articles from the web

In [None]:
! pip install newspaper3k
! pip install thefuzz
! pip install requests

In [None]:
from newspaper import Article
from bs4 import BeautifulSoup
from thefuzz import fuzz
import requests

In [None]:
def find_max_text_element(tag):
    max_text_length = 0
    max_text_element = None
    length_list = []

    for child in tag.find_all(recursive=False):
      # Find the direct children only, not nested elements
      text_length = len(child.get_text(strip=True))
      length_list.append(text_length)
      if text_length > max_text_length:
        max_text_length = text_length
        max_text_element = child

    length_list = np.array(length_list) / max_text_length
    length_list[np.where(length_list == 1)] = 0

    if (length_list > 0.10).any():

      # Keep only the meaningful elements
      for child in tag.find_all(recursive=False):
        text_length = len(child.get_text(strip=True))
        if (text_length / max_text_length) < 0.02:
          child.extract()

      return max_text_element, True

    else:
      return max_text_element, False

In [None]:
def parse_url(url):
  r = requests.get(url.strip())

  soup = BeautifulSoup(r.text, 'html.parser')

  current_tag = soup.body  # Start from the <body> tag

  # Remove unnecessary elements
  for child in current_tag.find_all(name=['script', 'template', 'figure', 'img', 'style', 'label', 'button', 'span']):
    child.extract()

  stop = False
  while not stop:
    max_text_element, stop = find_max_text_element(current_tag)

    # Move to the tag with the maximum text content
    if max_text_element:
      current_tag = max_text_element
    else:
      return ''

  main_content = max_text_element.get_text()

  main_content = re.sub(r'<a .*?>', '', main_content)
  main_content = main_content.replace("</a>","")

  return main_content

In [None]:
# Using readlines()
url_file = open('/content/gdrive/MyDrive/Syllog/the_urls.txt', 'r')

count = 1
# Strips the newline character
for url in url_file.readlines():

  url = url.strip()

  if len(url) == 0:
    break

  print(f'{count}: {url}')

  if url.split('.')[-1] == 'pdf':
    with open('/content/gdrive/MyDrive/Syllog/failed_urls.txt', 'a') as file:
      file.write(url.strip() + '\n')
    count += 1
    continue

  try:
    # Create an Article object and download the article
    article = Article(url)
    article.download()

    print('Article downloaded')

    # Parse the article content
    article.parse()

    # Get the output text and parse it with RegEx
    output_text = article.text

    # Use a regular expression to find consecutive duplicate content
    pattern = re.compile(r'(\b.*?\b[.!?])(?:[.\n\s]*)\1')

    # Replace consecutive duplicate content with the first occurrence
    result = re.sub(pattern, r'\1', output_text)

    # Get the text length from the HTML and compare it with the extracted text

    score = fuzz.token_set_ratio(result, parse_url(url))
    print(score)

    if score > 80:

      with open(f'/content/gdrive/MyDrive/Syllog/transcripts/text_{count}.txt', 'w') as text_file:
        print('OK')
        text_file.write(result)
        text_file.close()
    else:
      with open('/content/gdrive/MyDrive/Syllog/failed_urls.txt', 'a') as file:
        print('ERROR')
        file.write(url.strip() + '\n')

  except:
    # If there is any issue in the above pipeline, note down the URL to be scraped manually
    with open('/content/gdrive/MyDrive/Syllog/failed_urls.txt', 'a') as file:
      file.write(url.strip() + '\n')

  print("############")
  count += 1

1: https://www.getimpactly.com/post/hr-compliance-checklist
Article downloaded
97
OK
############
2: https://www.dol.gov/agencies/oasam/centers-offices/civil-rights-center/internal/policies/DOL-Policy-Statement-on-Harassing-Conduct-in-the-Work
Article downloaded
100
OK
############
3: https://business.linkedin.com/talent-solutions/recruiting-tips/12-essential-recruiting-tips-and-tactics
Article downloaded
22
ERROR
############
4: https://www.linkedin.com/advice/1/what-some-tools-methods-you-use-conduct-job-analysis
Article downloaded
54
ERROR
############
5: https://www.coursehero.com/tutors-problems/Human-Resource-Management/26761067-Part-AIdentify-and-summarize-the-five-steps-involved-in-the-job-analy/
Article downloaded
0
ERROR
############
6: https://www.linkedin.com/pulse/top-5-talent-acquisition-strategies-2023-rchilli-inc-/
Article downloaded
96
OK
############
7: https://mashable.com/article/27-job-interview-questions-glassdoor
Article downloaded
95
OK
############
8: https://w

  length_list = np.array(length_list) / max_text_length


0
ERROR
############
66: https://www.personio.com/hr-lexicon/strategic-human-resource-management/
Article downloaded
99
OK
############
67: https://hbr.org/2022/07/its-time-to-reimagine-employee-retention
Article downloaded
59
ERROR
############
68: https://hbr.org/2021/10/how-companies-can-improve-employee-engagement-right-now
Article downloaded
100
OK
############
69: https://www.forbes.com/advisor/business/employee-retention-strategies/
Article downloaded
############
70: https://blog.workday.com/en-us/2022/secret-employee-retention-employee-engagement.html
Article downloaded
70
ERROR
############
71: https://knowledge.wharton.upenn.edu/article/motivation-up-attrition-down-employee-engagement/
Article downloaded
100
OK
############
72: https://www2.deloitte.com/us/en/insights/focus/human-capital-trends/2016/employee-engagement-and-retention.html
Article downloaded
100
OK
############
73: https://www.culturemonkey.io/employee-engagement/workplace-employee-engagement-and-retention/
Ar

# Docs analysis

In [None]:
file_list = os.listdir('/content/gdrive/MyDrive/Syllog/transcripts')

tot_chars = []

for file in file_list:

  # This is the file where the full Law is reported
  if 'ITA' in file:
    continue

  num_chars = 0
  f = open('/content/gdrive/MyDrive/Syllog/transcripts/' + file, 'r')
  for line in f:
    num_chars += len(line)
  print(f'{file}: {num_chars}')
  tot_chars.append(num_chars)

In [None]:
data = np.array(tot_chars) / 3.0

In [None]:
# Create a histogram trace
histogram_trace = go.Histogram(x=data, nbinsx=100, name='Histogram')

# Create layout
layout = go.Layout(title='Distribution of # Tokens per doc', xaxis=dict(title='Values'), yaxis=dict(title='Frequency'))

# Create figure
fig = go.Figure(data=[histogram_trace], layout=layout)

# Show the plot
fig.show()

# Docs elaboration

In [2]:
!pip install -qU langchain

In [3]:
!pip install -qU openai

In [4]:
from openai import OpenAI
from langchain_core.prompts import PromptTemplate
import json

In [5]:
os.environ['OPENAI_API_KEY'] = userdata.get('OpenAI_API_Key')

In [54]:
client = OpenAI()

model_3_5_turbo_1106 = "gpt-3.5-turbo-1106"
model = model_3_5_turbo_1106

template_general_questions = """
    I provide you with the following context: '''{transcript}'''.
    You must identify the general topic that is discussed in the provided context.
    Once the general topic is identified, you need to generate five pairs of Question-Answer on the general topic.
    Since the questions are generic, the answers must be at least 2 sentences (but do not go above 6 sentences).
"""

template_specific_questions = """
    I provide you with the following context: '''{transcript}'''.
    You must identify the general topic that is discussed in the provided context.
    Once the general topic is identified, you need to identify five sub-topics covered in the provided context.
    Create *at least two* Question-Answer pair for each identified sub-topic.
    Since the question are specific to a sub-topic, the answer must be at most 4 sentences long.
    Do not mention the identified topics and sub-topics. All I want is the question-answer pairs.
"""

content = """
    You are a helpful assistant that reads documents, understand their content, and generate Question-Answer pairs.
    Your output will be used to perform supervised fine tuning of a LLM - keep it in mind when formulating both the question and the answer.
    The desired output format is the following:
    - Identify the questions with "Question:" and the answers with "Answer:"
    - each question and each answer need to be in one line only. The result of this is that each line will start either with "Question:" or with "Answer:"
    - Do not add anything else
    Avoid referring to any Named Entity in the questions, unless extremely relevant for the document content.
    Email addresses and phone numbers are not relevant for me - do not mention them at any time.
"""

Still need to define the "transcript" variable

In [55]:
transcript = open('/content/gdrive/MyDrive/Syllog/transcripts/text_61.txt', 'r').read()

In [56]:
prompt_question = PromptTemplate(
            input_variables = ["transcript"],
            template=template_general_questions #  template_specific_questions
        )

query = prompt_question.format(
    transcript = transcript,
)

response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": content},
        {"role": "user", "content": query}
    ],
    temperature=0.0,
)

flashcard_content = response.choices[0].message.content

In [57]:
flashcard_content

"Topic: Strategic Human Resource Management\n\nSub-topics:\n1. Definition and Importance of Strategic Human Resource Management\n2. Traditional HRM vs Strategic HRM\n3. Examples of Strategic Human Resource Management in Companies\n4. Creating a Strategic Human Resource Management Plan\n5. Emphasizing Data in Strategic Human Resource Management\n\nQuestion: What is the aim of Strategic Human Resource Management?\nAnswer: Strategic Human Resource Management aims to align HR practices with business objectives to achieve organizational goals through the strategic deployment of a highly committed and capable workforce.\n\nQuestion: How does Strategic Human Resource Management differ from Traditional HRM?\nAnswer: Traditional HRM was primarily reactive and task-oriented, while Strategic HRM is proactive, integrated with business strategies, and focuses on planning and taking action to ensure the organization has the workforce it needs to achieve its goals.\n\nQuestion: Can you provide exampl