Calculating NLP Statistics

In [54]:
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd
import nltk
import spacy

# Make sure the NLTK stopword corpus is available locally; run_etl reads it
# through stopwords.words("english").
# NOTE(review): TextProcessor calls nltk.word_tokenize, which normally needs
# separate tokenizer data that is not downloaded here - confirm it is installed.
nltk.download('stopwords')
# Single shared spaCy pipeline, loaded once at import time and reused by the
# entity, sentiment and POS helper functions below.
nlp = spacy.load("en_core_web_sm")


class WebScraper:
    """Download a single web page and reduce it to plain text.

    Constructing the scraper performs no network access; the request is
    only issued by ``extract_article_text``.
    """

    def __init__(self, url, timeout=10):
        """Remember which page to fetch.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` waits indefinitely, so one
                stalled page would block the whole scraping run.
        """
        self.url = url
        self.timeout = timeout

    def extract_article_text(self):
        """Fetch ``self.url`` and return the text content of its HTML.

        Returns:
            The text of the parsed document, with the individual text
            fragments joined by a single space.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                the server does not answer within ``self.timeout`` seconds.
        """
        response = requests.get(self.url, timeout=self.timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        # Join fragments with a space: with the default empty separator the
        # text of adjacent tags is glued together ("<p>a</p><p>b</p>" -> "ab"),
        # which produces bogus tokens further down the pipeline.
        return soup.get_text(separator=" ")


class TextProcessor:
    """Tokenize text and keep only lowercase, alphabetic, non-stopword tokens."""

    def __init__(self, nltk_stopwords):
        """Store the stopword collection used for filtering.

        Args:
            nltk_stopwords: Container of words to discard. Tokens are
                lowercased before the membership test, so entries must be
                lowercase to match.
        """
        self.nltk_stopwords = nltk_stopwords

    def tokenize_and_clean(self, text):
        """Split *text* into tokens and return the cleaned ones.

        Args:
            text: Raw text to tokenize with ``nltk.word_tokenize``.

        Returns:
            A list of lowercased tokens, in their original order, limited to
            purely alphabetic tokens that are not stopwords.
        """
        kept = []
        for token in nltk.word_tokenize(text):
            # Drop numbers, punctuation and mixed tokens such as "e-mail".
            if not token.isalpha():
                continue
            lowered = token.lower()
            if lowered in self.nltk_stopwords:
                continue
            kept.append(lowered)
        return kept


def extract_named_entities(text):
    """Return the named entities the shared spaCy pipeline finds in *text*.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        entities appear in ``doc.ents``.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found


def perform_sentiment_analysis(text):
    """Return the ``sentiment`` attribute of the spaCy document for *text*.

    Args:
        text: Raw text to analyse.

    Returns:
        Whatever value the pipeline stored in ``doc.sentiment``.
    """
    # NOTE(review): the pipeline loaded here does not appear to include a
    # sentiment component, so this value is presumably the attribute's
    # default - confirm before relying on the score.
    return nlp(text).sentiment


def perform_pos_tagging(text):
    """Tag every token of *text* with its coarse part of speech.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(token_text, pos_tag)`` tuples, one per token of the
        spaCy document, in document order.
    """
    tagged = []
    for tok in nlp(text):
        tagged.append((tok.text, tok.pos_))
    return tagged


def run_etl(base_url, num_pages):
    """Scrape numbered pages under *base_url* and build per-page NLP tables.

    For every page number from 1 to *num_pages* the page at
    ``{base_url}/page/{n}`` is downloaded and four tables are computed from
    its text: word frequencies, named entities, sentiment score and
    part-of-speech tags. The four tables are written side by side to
    ``page_{n}_statistics.csv`` in the current working directory.

    Args:
        base_url: URL prefix of the paginated listing; a trailing slash is
            tolerated.
        num_pages: Number of pages to process, starting at page 1.

    Returns:
        A 4-tuple ``(word_freq_list, entities_list, sentiment_scores_list,
        pos_tags_list)``; each element is a list holding one DataFrame per
        processed page.
    """
    nltk_stopwords = set(stopwords.words("english"))
    # The processor keeps no per-page state, so one instance serves all pages.
    processor = TextProcessor(nltk_stopwords)
    # Avoid producing ".../tag//page/1" when the caller passes a trailing slash.
    base_url = str(base_url).rstrip("/")

    word_freq_list = []
    entities_list = []
    sentiment_scores_list = []
    pos_tags_list = []

    for page_number in range(1, num_pages + 1):
        page_url = f"{base_url}/page/{page_number}"

        scraper = WebScraper(page_url)
        article_text = scraper.extract_article_text()

        filtered_words = processor.tokenize_and_clean(article_text)

        # Word frequencies: one row per distinct cleaned token.
        top_keywords = Counter(filtered_words)
        df_word_freq = pd.DataFrame(top_keywords.items(), columns=["Words", "Frequencies"])

        # Counts are stored as strings; calculate_average_and_save_to_csv
        # converts columns back to numbers before averaging.
        df_word_freq['Frequencies'] = df_word_freq['Frequencies'].astype(str)

        # Named entities: one row per (text, label) pair.
        entities = extract_named_entities(article_text)
        df_entities = pd.DataFrame(entities, columns=["Entity", "Label"])

        # Sentiment: a single-row table holding the page's score.
        sentiment_score = perform_sentiment_analysis(article_text)
        df_sentiment_scores = pd.DataFrame([sentiment_score], columns=["Sentiment_Score"])

        # Part-of-speech tags: one row per token.
        pos_tags = perform_pos_tagging(article_text)
        df_pos_tags = pd.DataFrame(pos_tags, columns=["Token", "POS"])

        # Place the tables next to each other; they have different lengths,
        # so shorter ones are padded with missing values.
        combined_df = pd.concat([df_word_freq, df_entities, df_sentiment_scores, df_pos_tags], axis=1)

        # One CSV per page, written to the current working directory.
        combined_df.to_csv(f"page_{page_number}_statistics.csv", index=False)

        word_freq_list.append(df_word_freq)
        entities_list.append(df_entities)
        sentiment_scores_list.append(df_sentiment_scores)
        pos_tags_list.append(df_pos_tags)

    return word_freq_list, entities_list, sentiment_scores_list, pos_tags_list

def calculate_average_and_save_to_csv(dataframes_list, output_csv_path):
    """Average every column across the given DataFrames and save the result.

    The frames are stacked row-wise, every column is converted to numbers
    (values that cannot be parsed become missing), and the per-column mean
    is written as a single CSV row whose header carries the column names.

    Args:
        dataframes_list: Iterable of DataFrames to aggregate; must contain
            at least one frame.
        output_csv_path: Destination path of the CSV file.

    Raises:
        ValueError: If *dataframes_list* is empty.
    """
    frames = list(dataframes_list)
    if not frames:
        raise ValueError("dataframes_list must contain at least one DataFrame")

    # Stack all frames; columns missing from a frame are filled with NaN.
    combined_df = pd.concat(frames, ignore_index=True)

    # Text columns turn into all-NaN columns here, so their mean is NaN.
    numeric_df = combined_df.apply(pd.to_numeric, errors='coerce', downcast='float')

    # mean() yields a Series indexed by column name. Writing that Series with
    # index=False would drop the names, so reshape it into a one-row frame
    # whose columns are the original column names.
    averages = numeric_df.mean()
    averages.to_frame().T.to_csv(output_csv_path, index=False)


if __name__ == "__main__":
    # Listing to scrape and how many of its numbered pages to process.
    base_url = "https://medium.com/tag/technology"
    num_pages = 100
    # Destination of the aggregated averages.
    output_csv_path = "C:\\Users\\ual-laptop\\OneDrive - University of Arizona\\Documents\\DA Projects\\NLP Final\\aggregated_results.csv"

    # Build the per-page tables (this also writes one CSV per page).
    word_freq_list, entities_list, sentiment_scores_list, pos_tags_list = run_etl(base_url, num_pages)

    # Flatten the four per-page lists, keeping their order, and average them
    # into a single CSV file.
    all_frames = [
        frame
        for group in (word_freq_list, entities_list, sentiment_scores_list, pos_tags_list)
        for frame in group
    ]
    calculate_average_and_save_to_csv(all_frames, output_csv_path)


[nltk_data] Downloading package stopwords to C:\Users\ual-
[nltk_data]     laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
