In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re

# Read URLs from Excel file
data = pd.read_excel("Documents/InternTest/Input.xlsx")

def create_sentiment_dictionaries(stop_words):
    positive_words = set()
    negative_words = set()

    for file in ["Documents/InternTest/PosNev/positive-words.txt", "Documents/InternTest/PosNev/negative-words.txt"]:  # Adjust file paths as needed
        with open(file, "r") as f:
            words = f.read().splitlines()
            words = {word for word in words if word not in stop_words}  # Remove stop words
            if "positive" in file:
                positive_words.update(words)
            else:
                negative_words.update(words)

    return positive_words, negative_words

# Load stop words from multiple documents (adjust paths as needed)
stop_words = set()
for file in ["Documents/InternTest/StopWords/StopWords_Auditor.txt", "Documents/InternTest/StopWords/StopWords_Currencies.txt", "Documents/InternTest/StopWords/StopWords_DatesandNumbers.txt", "Documents/InternTest/StopWords/StopWords_Generic.txt", "Documents/InternTest/StopWords/StopWords_Generic.txt", "Documents/InternTest/StopWords/StopWords_Geographic.txt", "Documents/InternTest/StopWords/StopWords_Names.txt"]:  # Add more files as needed
    with open(file, "r") as f:
        stop_words.update(f.read().splitlines())

# Create sentiment dictionaries
positive_words, negative_words = create_sentiment_dictionaries(stop_words)

# Sentiment analysis function
def analyze_sentiment(text):
    blob = TextBlob(text)
    cleaned_text = " ".join(word.lower()  # Convert to lowercase for consistency
        for word in word_tokenize(text)  # Tokenize text using NLTK
        if word not in stop_words and word.isalpha()  # Remove stop words and non-alphabetic words
    )

    positive_score = sum(word in positive_words for word in cleaned_text.split())
    negative_score = sum(word in negative_words for word in cleaned_text.split())  # Count negative with -1 weight
    if negative_score < 0:
        negative_score*(-1)
    else:
        negative_score

    total_words = len(cleaned_text.split())

    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    return {
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
    }

def calculate_complex_words(words):
    complex_words = 0
    for word in words:
        if len(re.findall(r'\w+', word)) >= 3:  # Check if the word has three or more syllables
            complex_words += 1
    return complex_words

def analyze_text_complexity(text):
    # Split text into sentences
    sentences = re.split(r'[.!?]', text)
    num_sentences = len(sentences)
    
    # Split text into words
    words = text.split()
    num_words = len(words)
    
    # Calculate Average Sentence Length
    average_sentence_length = num_words / num_sentences
    
    # Calculate Percentage of Complex Words
    num_complex_words = calculate_complex_words(words)
    percentage_complex_words = (num_complex_words / num_words) * 100
    
    # Calculate Fog Index
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
    
    # return average_sentence_length, percentage_complex_words, fog_index
    return {
        "AVG SENTENCE LENGTH": average_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
        "FOG INDEX": fog_index,
    }

def calculate_average_words_per_sentence(text):
    # Split text into sentences
    sentences = re.split(r'[.!?]', text)
    
    total_words = 0
    total_sentences = len(sentences)
    
    # Calculate total number of words
    for sentence in sentences:
        words = sentence.split()
        total_words += len(words)
        
    # Calculate Average Number of Words Per Sentence
    average_words_per_sentence = total_words / total_sentences
    
    return {
        "AVG NUMBER OF WORDS PER SENTENCE": average_words_per_sentence,
    }
    

def calculate_complex_word_count(text):
    # Split text into words
    words = text.split()
    
    # Initialize count for complex words
    complex_word_count = 0
    
    # Calculate complex word count
    for word in words:
        if len(re.findall(r'\w+', word)) >= 3:  # Check if the word has three or more syllables
            complex_word_count += 1
    
    return {
        "COMPLEX WORD COUNT": complex_word_count,
    }

def calculate_cleaned_word_count(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    
    # Remove stopwords and punctuation
    cleaned_words = [
        word.lower().strip("!.,?;")  # Convert to lowercase and remove punctuation
        for word in word_tokenize(text)
        if word not in stop_words and word.isalpha()  # Remove stop words and non-alphabetic words
    ]
    
    # Count the total cleaned words
    cleaned_word_count = len(cleaned_words)
    
    return {
        "WORD COUNT": cleaned_word_count,
    }

def count_syllables(word):
    # Remove non-alphabetic characters and convert to lowercase
    word = re.sub(r'[^a-zA-Z]', '', word.lower())
    
    # Words with 3 or fewer letters are counted as one syllable
    if len(word) <= 3:
        return 1
    
    # Count vowel sequences (a group of consecutive vowels) as syllables
    vowels = "aeiouy"
    vowel_count = 0
    prev_char_is_vowel = False
    for index, char in enumerate(word):
        if char in vowels:
            if not prev_char_is_vowel:
                if index != 0 or (index == 0 and char != 'e'):
                    vowel_count += 1
            prev_char_is_vowel = True
        else:
            prev_char_is_vowel = False
    
    # Handle exceptions for words ending with "es" or "ed"
    if word.endswith('es') or word.endswith('ed'):
        if word[-3] not in vowels:
            vowel_count -= 1
    
    # Adjust for silent 'e' at end of word
    if word.endswith('e'):
        vowel_count -= 1
    
    # Ensure at least one syllable is counted
    return max(1, vowel_count)

def calculate_syllables_per_word(text):
    # Tokenize the text into words
    words = text.split()
    
    # Calculate syllable count for each word
    syllables_per_word = [count_syllables(word) for word in words]
    return {
        "SYLLABLE PER WORD": syllables_per_word,
    }

def count_personal_pronouns(text):
    # Define a regex pattern to match personal pronouns
    pronoun_pattern = r'\b(I|me|my|mine|myself|you|your|yours|yourself|he|him|his|hers|she|her|herself|it|its|itself|we|us|our|ours|ourselves|you|your|yours|yourselves|they|them|their|theirs|themselves)\b'
    
    # Find all matches of personal pronouns in the text
    pronouns = re.findall(pronoun_pattern, text, flags=re.IGNORECASE)
    
    # Remove "US" from the list of personal pronouns
    pronouns = [pronoun for pronoun in pronouns if pronoun.lower() != 'us']
    
    # Count the occurrences of personal pronouns
    pronoun_count = len(pronouns)
    
    return {
        "PERSONAL PRONOUNS": pronoun_count,
    }

def calculate_average_word_length(text):
    # Tokenize the text into words
    words = text.split()
    
    # Calculate the total number of characters in all words
    total_characters = sum(len(word) for word in words)
    
    # Calculate the total number of words
    total_words = len(words)
    
    # Calculate the average word length
    average_word_length = total_characters / total_words if total_words > 0 else 0
    
    return {
        "AVG WORD LENGTH": average_word_length,
    }

def scrape_urls(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        # Adjust these selectors based on the target website's HTML structure
        title_element = soup.find('title')  # Example selector
        text_elements = soup.find_all(class_="td-post-content tagdiv-type")  # Example selector
    
        title = title_element.text.strip() if title_element else None
        texts = "\n".join(element.text.strip() for element in text_elements) if text_elements else None
    
        # Merge the title and text content
        text = f"{title}\n\n{texts}"
        return text

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {url}, {e}")

# Extract article data
articles = []
for index, row in data.iterrows():
    url_id = row["URL_ID"]
    url = row["URL"]
    scraped_content = scrape_urls(url)
    sentiment = analyze_sentiment(scraped_content)
    complexity_scores = analyze_text_complexity(scraped_content)
    word_per_sen = calculate_average_words_per_sentence(scraped_content)
    comp_wor_cnt = calculate_complex_word_count(scraped_content)
    total_cleaned_words = calculate_cleaned_word_count(scraped_content)
    syllables_per_word = calculate_syllables_per_word(scraped_content)
    pronoun_count = count_personal_pronouns(scraped_content)
    average_word_length = calculate_average_word_length(scraped_content)
    article_data = {
        "URL_ID": url_id,
        "URL": url,
        **sentiment,  # Unpack sentiment dictionary
        **complexity_scores,
        **word_per_sen, 
        **comp_wor_cnt,
        **total_cleaned_words,
        **syllables_per_word,
        **pronoun_count,
        **average_word_length
    }
    articles.append(article_data)

print(articles)
df = pd.DataFrame(articles)
df.to_excel("Documents/InternTest/FinalOutput2.xlsx", index=False)

[{'URL_ID': 'blackassign0001', 'URL': 'https://insights.blackcoffer.com/rising-it-cities-and-its-impact-on-the-economy-environment-infrastructure-and-city-life-by-the-year-2040-2/', 'POSITIVE SCORE': 44, 'NEGATIVE SCORE': 6, 'POLARITY SCORE': 0.7599999848000003, 'SUBJECTIVITY SCORE': 0.042408820998805066, 'AVG SENTENCE LENGTH': 15.525, 'PERCENTAGE OF COMPLEX WORDS': 0.0, 'FOG INDEX': 6.210000000000001, 'AVG NUMBER OF WORDS PER SENTENCE': 15.5375, 'COMPLEX WORD COUNT': 0, 'WORD COUNT': 1179, 'SYLLABLE PER WORD': [2, 1, 2, 1, 1, 2, 1, 1, 3, 3, 4, 1, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 4, 1, 3, 1, 1, 1, 4, 1, 2, 1, 1, 1, 2, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 3, 1, 4, 1, 1, 4, 4, 1, 1, 1, 2, 2, 1, 1, 1, 4, 2, 1, 1, 2, 1, 1, 4, 2, 2, 1, 3, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 4, 4, 1, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 1, 3, 1, 1, 3, 1, 3, 1, 2, 1, 1, 1, 1, 3, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,