In [2]:
import pandas as pd
from newspaper import Article
import re
import string
from collections import Counter

def load_word_list(word_file):
    with open(word_file, 'r') as file:
        words = file.read().splitlines()
    return set(words)

def extract_main_content_and_title_from_url(url):
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.title, article.text
    except Exception as e:
        print(f"Failed to retrieve or parse the URL: {e}")
        return "", ""

def clean_text(text, stopwords):
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    cleaned_words = [word for word in words if word.lower() not in stopwords]
    cleaned_text = ' '.join(cleaned_words)
    total_words = len(cleaned_words)
    return cleaned_text, total_words

def calculate_sentiment_score(text, positive_words, negative_words):
    words = text.split()
    positive_score = sum(1 for word in words if word.lower() in positive_words)
    negative_score = sum(1 for word in words if word.lower() in negative_words)
    return positive_score, negative_score

def calculate_polarity_score(positive_score, negative_score):
    score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    return round(score,4)

def calculate_subjectivity_score(positive_score, negative_score, total_words):
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)
    return round(subjectivity_score,4)

def calculate_avg_sentence_length(text):
    sentences = re.split(r'[.!?]', text)
    sentences = [sentence for sentence in sentences if sentence.strip()]
    num_sentences = len(sentences)
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    return num_words / num_sentences if num_sentences > 0 else 0, num_sentences

def syllable_count(word):
    word = word.lower()
    vowels = "aeiou"
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

def count_complex_words(text):
    words = text.split()
    complex_words = [word for word in words if syllable_count(word) > 2]
    return len(complex_words)

def calculate_percentage_complex_words(complex_word_count, total_words):
    percentage_complex_words = complex_word_count / (total_words + 0.000001)
    return round(percentage_complex_words,4)

def calculate_fog_index(avg_sentence_length, percentage_complex_words):
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return round(fog_index,4)

def calculate_syllable_per_word(text):
    words = text.split()
    total_syllables = sum(syllable_count(word) for word in words)
    syllable_per_word = total_syllables / len(words) if words else 0
    return round(syllable_per_word,4)

def count_personal_pronouns(text):
    personal_pronouns = ["i", "we", "my", "ours","us"]
    text_lower = text.lower() 
    words = re.findall(r'\b\w+\b', text_lower)
    word_counts = Counter(words)
    specific_word_counts = {word: word_counts[word] for word in personal_pronouns}
    total_count = sum(specific_word_counts.values())
    return total_count

def calculate_average_word_length(text):
    words = text.split()
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    if total_words == 0:
        return 0  # To handle division by zero
    avg_length = total_characters / total_words
    return round(avg_length,4)

def process_urls_from_excel(excel_file, stopwords_file, positive_words_file, negative_words_file, output_excel_file):
    stopwords = load_word_list(stopwords_file)
    positive_words = load_word_list(positive_words_file)
    negative_words = load_word_list(negative_words_file)
    
    df = pd.read_excel(excel_file)
    
    necessary_columns = [
        'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
        'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
        'AVG NUMBER OF WORDS PER SENTENCE', 'COMPLEX WORD COUNT', 'WORD COUNT',
        'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVERAGE WORD COUNT']
    
    for col in necessary_columns:
        if col not in df.columns:
            df[col] = 0 if 'SCORE' not in col else 0.0
            
    for index, row in df.iterrows():
        url_id = row['URL_ID']
        url = row['URL']
        
        title, text = extract_main_content_and_title_from_url(url)
        
        cleaned_text, total_words = clean_text(text, stopwords)
        
        positive_score, negative_score = calculate_sentiment_score(cleaned_text, positive_words, negative_words)
        
        avg_sentence_length, num_sentences = calculate_avg_sentence_length(cleaned_text)
        complex_word_count = count_complex_words(cleaned_text)
        percentage_complex_words = calculate_percentage_complex_words(complex_word_count, total_words)
        fog_index = calculate_fog_index(avg_sentence_length, percentage_complex_words)
        avg_words_per_sentence = total_words / num_sentences if num_sentences > 0 else 0
        syllable_per_word = calculate_syllable_per_word(cleaned_text)
        personal_pronouns_count = count_personal_pronouns(cleaned_text)
        average_word_length = calculate_average_word_length(cleaned_text)
        
        df.at[index, 'POSITIVE SCORE'] = positive_score
        df.at[index, 'NEGATIVE SCORE'] = negative_score
        df.at[index, 'POLARITY SCORE'] = calculate_polarity_score(positive_score, negative_score)
        df.at[index, 'SUBJECTIVITY SCORE'] = calculate_subjectivity_score(positive_score, negative_score, total_words)
        df.at[index, 'AVG SENTENCE LENGTH'] = avg_sentence_length
        df.at[index, 'PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_words
        df.at[index, 'FOG INDEX'] = fog_index
        df.at[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = avg_words_per_sentence
        df.at[index, 'COMPLEX WORD COUNT'] = complex_word_count
        df.at[index, 'WORD COUNT'] = total_words
        df.at[index, 'SYLLABLE PER WORD'] = syllable_per_word
        df.at[index, 'PERSONAL PRONOUNS'] = personal_pronouns_count
        df.at[index, 'AVERAGE WORD COUNT'] = average_word_length

        print(f"Processed URL {url_id}:\nPOSITIVE SCORE={positive_score},\tNEGATIVE SCORE={negative_score},\tPOLARITY SCORE={df.at[index, 'POLARITY SCORE']},\tSUBJECTIVITY SCORE={df.at[index, 'SUBJECTIVITY SCORE']},\nAVG SENTENCE LENGTH={avg_sentence_length},\tPERCENTAGE OF COMPLEX WORDS={percentage_complex_words},\tFOG INDEX={fog_index},\nAVG NUMBER OF WORDS PER SENTENCE={avg_words_per_sentence},\tCOMPLEX WORD COUNT={complex_word_count},\tWORD COUNT={total_words},\tSYLLABLE PER WORD={syllable_per_word},\nPERSONAL PRONOUNS={personal_pronouns_count},\tAVERAGE WORD COUNT={average_word_length}")
        print("---------------------------------------------------------------------------------------------------------------")
        
    df.to_excel(output_excel_file, index=False)

excel_file = 'Input.xlsx'  # Input Excel file containing URLs
stopwords_file = 'StopWords.txt'  # File containing stopwords
positive_words_file = 'positive-words.txt'  # File containing positive words
negative_words_file = 'negative-words.txt'  # File containing negative words
output_excel_file = 'Output Data Structure.xlsx'  # Output Excel file

process_urls_from_excel(excel_file, stopwords_file, positive_words_file, negative_words_file, output_excel_file)

print("--*--*--Program Ended--*--*--")

Processed URL blackassign0001:
POSITIVE SCORE=33,	NEGATIVE SCORE=6,	POLARITY SCORE=0.6923,	SUBJECTIVITY SCORE=0.0698,
AVG SENTENCE LENGTH=559.0,	PERCENTAGE OF COMPLEX WORDS=0.2576,	FOG INDEX=223.703,
AVG NUMBER OF WORDS PER SENTENCE=559.0,	COMPLEX WORD COUNT=144,	WORD COUNT=559,	SYLLABLE PER WORD=2.0054,
PERSONAL PRONOUNS=0,	AVERAGE WORD COUNT=6.5617
---------------------------------------------------------------------------------------------------------------
Processed URL blackassign0002:
POSITIVE SCORE=58,	NEGATIVE SCORE=31,	POLARITY SCORE=0.3034,	SUBJECTIVITY SCORE=0.1122,
AVG SENTENCE LENGTH=806.0,	PERCENTAGE OF COMPLEX WORDS=0.3934,	FOG INDEX=322.5574,
AVG NUMBER OF WORDS PER SENTENCE=793.0,	COMPLEX WORD COUNT=312,	WORD COUNT=793,	SYLLABLE PER WORD=2.396,
PERSONAL PRONOUNS=0,	AVERAGE WORD COUNT=7.4439
---------------------------------------------------------------------------------------------------------------
Processed URL blackassign0003:
POSITIVE SCORE=38,	NEGATIVE SCORE=24,	