# Web Scraping and Text analysis

### Import required libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from helper import load_words, count_syllables
import string
import re
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\miran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\miran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Fetching URL

In [2]:
URL = 'https://insights.blackcoffer.com/how-will-covid-19-affect-the-world-of-work-2/'
page = requests.get(URL)

In [3]:
# Extract the HTML content
html = page.text
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html, "html.parser")

### Extracting Information

In [8]:
# Extract the title of the article
title_element = soup.find('h1', {'class': 'entry-title'})
if title_element is None:
    title_element = soup.find('h1', {'class': 'tdb-title-text'})
if title_element:
    title = title_element.text.strip()
    print(title)

In [9]:
# Extract the main body content of the article
body_element = soup.find('div', {'class': 'td-post-content tagdiv-type'})
if body_element is None:
    body_element = soup.find('div', {'class': 'td_block_wrap tdb_single_content tdi_130 td-pb-border-top td_block_template_1 td-post-content tagdiv-type'})
if body_element:
    body = body_element.get_text()

### Save article to file

In [None]:
article_dir = 'ArticleText'
os.makedirs(article_dir, exist_ok=True)
article_file = os.path.join(article_dir, f"{url_id}.txt")

# Save article to text file
with open(article_file, 'w', encoding='utf-8') as f:
    f.write(body)
print(f"Article {url_id} scraped and saved.")

In [84]:
# Load stop words from list of files
stop_word_files = ['stopWords/stopWords_Auditor.txt', 'stopWords/stopWords_currencies.txt', 'stopWords/StopWords_DatesandNumbers.txt', 'stopWords/StopWords_GenericLong.txt' , 'StopWords/stopwords_Generic.txt', 'stopWords/StopWords_Geographic.txt', 'StopWords/stopwords_Names.txt']
stop_words = load_words(stop_word_files)

In [85]:
# Load positive and negative word list
positive_words = load_words(['MasterDictionary/positive-words.txt'])
negative_words = load_words(['MasterDictionary/negative-words.txt'])

In [86]:
# Tokenize the text into sentences and words
sentences = sent_tokenize(body)
tokens = word_tokenize(body)

# Filter out stop words and punctuation
filtered_tokens = [word for word in tokens if word.lower() not in stop_words and word not in string.punctuation]

### 1. Sentiment Analysis

In [87]:
positive_score = sum(1 for word in filtered_tokens if word in positive_words)
negative_score = sum(-1 for word in filtered_tokens if word in negative_words)
negative_score *= -1

# Calculate polarity and subjectivity scores
polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
subjectivity_score = (positive_score + negative_score) / (len(filtered_tokens) + 0.000001)

In [88]:
print("Positive Score:", positive_score)
print("Negative Score:", negative_score)
print("Polarity Score:", polarity_score)
print("Subjectivity Score:", subjectivity_score)

Positive Score: 54
Negative Score: 6
Polarity Score: 0.7999999866666669
Subjectivity Score: 0.07109004730913501


### 2. Readability Analysis

In [72]:
avg_sentence_length = len(tokens) / len(sentences)

complex_words = [word for word in filtered_tokens if count_syllables(word) > 2]
percent_complex_words = len(complex_words) / len(filtered_tokens)

fog_index = 0.4 * (avg_sentence_length + percent_complex_words * 100)

In [73]:
print("Average Sentence Length:", avg_sentence_length)
print("Percentage of Complex Words:", percent_complex_words * 100, "%")
print("Fog Index:", fog_index)

Average Sentence Length: 20.526315789473685
Percentage of Complex Words: 49.28909952606635 %
Fog Index: 27.926166126216017


### 3. Average Number of Words Per Sentence 

In [74]:
avg_words_per_sentence = len(tokens) / len(sentences)

print("Average No. Of Words Per Sentence:", avg_words_per_sentence)

Average No. Of Words Per Sentence: 20.526315789473685


### 4. Complex Word Count

In [75]:
complex_word_count = len(complex_words)

print("Complex Word Count:", complex_word_count)

Complex Word Count: 416


### 5. Word Count Using nltk Library

In [76]:
nltk_stop_words = set(stopwords.words('english'))
nltk_word_count = sum(1 for w in tokens if not w.lower() in nltk_stop_words and w not in string.punctuation)

print("Word count:", nltk_word_count)

Word count: 991


### 6. Syllable Count Per Word

In [77]:
total_syllables = sum(count_syllables(word) for word in filtered_tokens)
syllable_per_word = total_syllables / nltk_word_count

print("Syllable Count Per Word:", syllable_per_word)

Syllable Count Per Word: 2.134207870837538


### 7. Personal Pronouns

In [78]:
personal_pronouns = ['i', 'me', 'we', 'us', 'you', 'he', 'him', 'she', 'her', 'it', 'they', 'them']

# Reg ex used
# \b -> boundary anchor, [\w.] -> any word character (letters, digits, underscores) or a period (.))
pronoun_count = sum(1 for word in re.findall(r'\b[\w.]+\b', body) if word.lower() in personal_pronouns and word not in ['US', 'U.S.', 'U.S'])
print("Personal Pronoun Count:", pronoun_count) 

Personal Pronoun Count: 71


### 8. Average Word Length

In [79]:
sum_of_characters = sum(len(word) for word in tokens) 
avg_word_length = sum_of_characters / len(tokens)
avg_word_length

4.841538461538462