In [2]:
# Original Version
import requests
from bs4 import BeautifulSoup
import spacy
from textblob import TextBlob
import pandas as pd
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Initialize NLP model
nlp = spacy.load("en_core_web_sm")

# Get the main page data
page_data = requests.get('https://www.healthline.com/directory/topics')
soup = BeautifulSoup(page_data.text, 'html.parser')

# Extract href links from <a> tags with the specified class
web_pages_to_visit = []
for link in soup.find_all('a', class_='css-1hacg05', href=True):
    web_pages_to_visit.append(link['href'])

# Limit to the first 100 pages for efficiency
web_pages_to_visit = web_pages_to_visit[:100]

# NLP Analysis Results
nlp_results = []

# Visit each page and perform NLP analysis
for page in web_pages_to_visit:
    # Fetch page content
    page_content = requests.get(f'{page}')
    soup_page = BeautifulSoup(page_content.text, 'html.parser')

    # Extract text from specific paragraphs
    paragraphs = soup_page.find_all('p')#, class_='css-1rnzyga')
    text = ' '.join([p.get_text().lower() for p in paragraphs])

    # Tokenize and POS tagging
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)

    # Count verbs
    verb_count = sum(1 for word, tag in pos_tags if tag.startswith('VB'))

    # Count singular Common Nouns (NN) and plural Common Nouns (NNS)
    common_noun_count = sum(1 for word, tag in pos_tags if tag in ('NN', 'NNS'))

    # Count singular Proper Nouns (NNP)  and plural Proper Nouns (NNPS)
    proper_noun_count = sum(1 for word, tag in pos_tags if tag in ('NNP', 'NNPS'))

    # Perform NLP analysis for other statistics
    doc = nlp(text)
    word_count = len([token.text for token in doc])
    sentence_count = len(list(doc.sents))

    # Calculate Type-Token Ratio
    unique_words = set(tokens)
    type_token_ratio = len(unique_words) / len(tokens) if tokens else 0

    # Store results
    nlp_results.append({
        'word_count': word_count,
        'sentence_count': sentence_count,
        'verb_count': verb_count,
        'common_noun_count': common_noun_count,
        'proper_noun_count': proper_noun_count,
        'type_token_ratio': type_token_ratio,
    })

# Convert to DataFrame
df = pd.DataFrame(nlp_results)

# Save to a CSV file
df.to_csv('nlp_statistics.csv', index=False)

# Calculate and save aggregated results
aggregated_results = df.mean()
aggregated_results.to_csv('aggregated_results.csv', header=False)
print('Results saved to aggregated_results.csv')




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\JANA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JANA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


len(web_pages_to_visit 150
Aggregated Results
word_count           1712.000000
sentence_count         81.000000
verb_count            270.000000
common_noun_count     403.000000
proper_noun_count       5.500000
type_token_ratio        0.308374
dtype: float64


PermissionError: [Errno 13] Permission denied: 'aggregated_results.csv'