In [None]:
!pip install nltk textstat
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from textstat import textstat
import nltk




In [None]:
import nltk
# Download the NLTK resources this notebook relies on:
#   - 'punkt_tab': presumably the tokenizer data behind sent_tokenize /
#     word_tokenize (TODO confirm against the installed NLTK version);
#   - 'stopwords': the corpus read later through stopwords.words('english').
# Only the second call passes quiet=True, so the first one logs its progress.
nltk.download('punkt_tab')
nltk.download('stopwords', quiet=True)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Filesystem locations used by the pipeline (absolute paths under /content).

# Workbook read with pd.read_excel; each row supplies a 'URL_ID' and a 'URL'.
INPUT_FILE = '/content/Input.xlsx'
# Folder that receives one "<URL_ID>.txt" file per scraped article.
OUTPUT_DIR = '/content/Articles'
# Excel file the computed metrics table is written to.
OUTPUT_FILE = '/content/Output_Data_MAIN.xlsx'
# Folder expected to contain StopWords_Auditor.txt.
STOPWORDS_DIR = '/content/Stopwords'
# Folder expected to contain positive-words.txt and negative-words.txt.
MASTER_DICT_DIR = '/content/MasterDictionary'

# Create the article folder up front; exist_ok=True keeps re-runs from failing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
def load_dictionary(file_path):
    """Load a one-entry-per-line word list into a set.

    Each line is stripped of surrounding whitespace and lower-cased; blank
    lines are skipped so the empty string never ends up in the lexicon.
    Lower-casing keeps the entries comparable with tokens that have been
    lower-cased before lookup.

    Args:
        file_path: Path of the text file to read (decoded as latin-1).

    Returns:
        A set of non-empty, lower-cased entries. If the file does not exist,
        a message is printed and an empty set is returned.
    """
    try:
        with open(file_path, 'r', encoding='latin-1') as f:
            stripped_lines = (line.strip() for line in f)
            return {entry.lower() for entry in stripped_lines if entry}
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Returning an empty set.")
        return set()

In [None]:
# Start from NLTK's English stop words and extend them with the project list.
# NOTE(review): stop_words is not referenced by any analysis function visible
# in this notebook — confirm whether tokens were meant to be filtered with it.
stop_words = set(stopwords.words('english'))
try:
    with open(os.path.join(STOPWORDS_DIR, 'StopWords_Auditor.txt'), 'r', encoding='latin-1') as f:
        # Drop blank lines so the empty string is never treated as a stop word.
        stop_words.update(entry for entry in (line.strip() for line in f) if entry)
except FileNotFoundError:
    print("StopWords_Auditor.txt not found, using default stopwords.")


# Sentiment lexicons; load_dictionary returns an empty set if a file is missing.
positive_words = load_dictionary(os.path.join(MASTER_DICT_DIR, 'positive-words.txt'))
negative_words = load_dictionary(os.path.join(MASTER_DICT_DIR, 'negative-words.txt'))

print(f"Positive words loaded: {len(positive_words)}")
print(f"Negative words loaded: {len(negative_words)}")

# Sanity check only: set iteration order is arbitrary, so the sample varies.
if positive_words:
    print(f"Sample positive words: {list(positive_words)[:5]}")
if negative_words:
    print(f"Sample negative words: {list(negative_words)[:5]}")

Positive words loaded: 2006
Negative words loaded: 4783
Sample positive words: ['glorify', 'unity', 'sincerely', 'clearly', 'fearlessly']
Sample negative words: ['irksomely', 'satirical', 'hurted', 'impractical', 'checkered']


In [None]:
def count_syllables(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every maximal run of the letters ``aeiouy`` counts as one syllable. One is
    subtracted when the word ends in "es" or "ed", and the result is never
    lower than 1 for a non-empty word (so a token without any vowel, including
    a punctuation mark, still counts as 1).

    Args:
        word: The token to examine; case and surrounding whitespace are ignored.

    Returns:
        The estimated syllable count, or 0 when ``word`` is empty or contains
        only whitespace (previously this case raised IndexError).
    """
    vowels = 'aeiouy'
    word = word.lower().strip()
    if not word:
        return 0

    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Count only the first vowel of each consecutive vowel run.
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel

    if word.endswith(('es', 'ed')):
        count -= 1
    return max(count, 1)

In [None]:
def calculate_readability_metrics(text):
    """Compute sentence-length, complex-word and Fog-index metrics for ``text``.

    Sentences come from ``sent_tokenize`` and words from ``word_tokenize``.
    ``word_tokenize`` returns punctuation marks as separate tokens, so only
    tokens containing at least one letter or digit are treated as words;
    otherwise commas and full stops would inflate the word count and drag
    down the average word length.

    A word is "complex" when ``count_syllables`` reports more than two
    syllables. The Fog index is
    ``0.4 * (average sentence length + percentage of complex words)``.

    Args:
        text: The raw article text.

    Returns:
        A tuple ``(avg_sentence_length, percentage_complex_words, fog_index,
        complex_word_count, word_count, avg_word_length)``. Ratios are 0 when
        there are no sentences or no words.
    """
    sentences = sent_tokenize(text)
    words = [token for token in word_tokenize(text) if any(ch.isalnum() for ch in token)]
    word_count = len(words)
    sentence_count = len(sentences)
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0

    complex_word_count = sum(1 for word in words if count_syllables(word) > 2)
    percentage_complex_words = (complex_word_count / word_count * 100) if word_count > 0 else 0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    avg_word_length = sum(len(word) for word in words) / word_count if word_count > 0 else 0

    return avg_sentence_length, percentage_complex_words, fog_index, complex_word_count, word_count, avg_word_length

In [None]:
def calculate_sentiment_scores(text):
    """Score ``text`` against the positive and negative word lexicons.

    The text is lower-cased and tokenized with ``word_tokenize``. Punctuation
    tokens are dropped before counting so they do not dilute the subjectivity
    score, whose denominator is the number of words.

    Scores:
        positive_score: number of words found in ``positive_words``.
        negative_score: number of words found in ``negative_words``.
        polarity_score: ``(pos - neg) / (pos + neg + 0.000001)``.
        subjectivity_score: ``(pos + neg) / (word_count + 0.000001)``.
    The small constant keeps both divisions defined when a total is zero.

    Args:
        text: The raw article text.

    Returns:
        A tuple ``(positive_score, negative_score, polarity_score,
        subjectivity_score)``.
    """
    # NOTE(review): the module-level stop_words set is not applied here —
    # confirm whether stop words should also be removed before scoring.
    tokens = word_tokenize(text.lower())
    words = [token for token in tokens if any(ch.isalnum() for ch in token)]
    print(f"Words tokenized: {words[:10]}...")

    word_count = len(words)
    print(f"Word Count: {word_count}")

    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = sum(1 for word in words if word in negative_words)

    print(f"Positive Score: {positive_score}")
    print(f"Negative Score: {negative_score}")

    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

In [None]:
def count_personal_pronouns(text):
    """Count occurrences of the pronouns I, we, my, ours and us in ``text``.

    Matching is whole-word and case-insensitive, with one exception: the
    all-capitals form "US" is skipped because it normally denotes the country
    rather than the pronoun. The trade-off is that a genuine pronoun written
    in capitals (e.g. in an upper-case heading) is not counted.

    Args:
        text: The raw article text.

    Returns:
        The number of pronoun occurrences found.
    """
    matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    return sum(1 for match in matches if match != 'US')

In [None]:
def extract_article_content(url, timeout=30):
    """Download ``url`` and extract the article title and body text.

    The title is taken from the first ``<h1>`` whose class matches
    ``entry-title`` or ``td-page-title``. The body is assembled from the
    direct children of the first ``<div>`` whose class matches
    ``td-post-content`` or ``entry-content``: paragraphs, headings, ``<pre>``
    blocks and lists each contribute one line of text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        A ``(title, article_text)`` tuple of strings. Placeholder strings are
        returned instead of raising: "Title Not Found" / "Content Not Found"
        when the elements are missing, and error placeholders when the
        request or the parsing fails (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        title_element = soup.find('h1', class_=re.compile(r'entry-title|td-page-title'))
        title = title_element.text.strip() if title_element else "Title Not Found"

        content_div = soup.find('div', class_=re.compile(r'td-post-content|entry-content'))
        article_text = ""
        if content_div:
            for element in content_div.children:
                if element.name in ['p', 'h1', 'h2', 'h3', 'pre', 'ul', 'ol']:
                    # A space separator keeps words apart across inline tags
                    # and list items; with strip=True alone the fragments are
                    # joined directly ("item oneitem two").
                    article_text += element.get_text(separator=" ", strip=True) + "\n"
                elif element.name in ['img', 'figure', 'table', 'iframe']:
                    # NOTE(review): extraction stops at the first media/table
                    # element, so any text after it is dropped — confirm this
                    # is the intended cut-off.
                    break

        if not article_text:
            article_text = "Content Not Found"

        return title, article_text

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return "Error Fetching URL", "Error Fetching Content"
    except AttributeError as e:
        print(f"Error extracting data from {url}: {e}")
        return "Error Extracting Data", "Error Extracting Content"
    except Exception as e:
        print(f"An unexpected error occurred for {url}: {e}")
        return "Unexpected Error", "Error Extracting Content"

In [None]:
def save_article(file_path, title, body):
    """Write an article to ``file_path`` as UTF-8 text.

    The file holds the title, one blank line, then the body. An existing file
    at the same path is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        handle.write("\n\n".join([title, body]))



def extract_articles(input_file, output_dir):
    """Scrape every URL listed in ``input_file`` and store it as a text file.

    The workbook is read with pandas; each row must provide ``URL_ID`` and
    ``URL``. For every row the article is fetched with
    ``extract_article_content`` and written to ``<output_dir>/<URL_ID>.txt``
    through ``save_article``.
    """
    url_table = pd.read_excel(input_file)

    for _, record in url_table.iterrows():
        url_id, url = record['URL_ID'], record['URL']
        print(f"Processing URL_ID: {url_id}, URL: {url}")

        title, body = extract_article_content(url)
        # NOTE(review): extract_article_content returns non-empty placeholder
        # strings on failure, so this guard does not skip failed downloads and
        # the placeholder text is saved as the article — confirm intent.
        if not (title and body):
            continue

        destination = os.path.join(output_dir, f"{url_id}.txt")
        save_article(destination, title, body)

In [None]:
def analyze_articles(input_dir, output_file, input_file=None):
    """Compute text metrics for every saved article and write them to Excel.

    For each row of the input workbook the file ``<input_dir>/<URL_ID>.txt``
    is read and scored for sentiment, readability, personal pronouns and
    syllables per word. One result row is produced per article that could be
    analyzed; missing files and per-file errors are reported with ``print``
    and skipped.

    Args:
        input_dir: Folder containing the ``<URL_ID>.txt`` article files.
        output_file: Path of the Excel file to write.
        input_file: Workbook with ``URL_ID`` and ``URL`` columns. Defaults to
            the module-level ``INPUT_FILE`` when omitted.
    """
    df_input = pd.read_excel(INPUT_FILE if input_file is None else input_file)
    results = []

    for _, row in df_input.iterrows():
        url_id = row['URL_ID']
        url = row['URL']
        file_path = os.path.join(input_dir, f"{url_id}.txt")

        if not os.path.exists(file_path):
            print(f"File not found for URL_ID: {url_id}")
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            pos_score, neg_score, polarity, subjectivity = calculate_sentiment_scores(text)
            avg_sent_len, perc_complex, fog_index, complex_words, word_count, avg_word_len = calculate_readability_metrics(text)
            personal_pronouns = count_personal_pronouns(text)

            # Average syllables over real words only: word_tokenize also
            # returns punctuation tokens, which are not words. The divisor is
            # this list's own length so the ratio is self-consistent.
            word_tokens = [token for token in word_tokenize(text) if any(ch.isalnum() for ch in token)]
            syllable_per_word = (
                sum(count_syllables(token) for token in word_tokens) / len(word_tokens)
                if word_tokens else 0
            )

            results.append({
                'URL_ID': url_id,
                'URL': url,
                'POSITIVE SCORE': pos_score,
                'NEGATIVE SCORE': neg_score,
                'POLARITY SCORE': polarity,
                'SUBJECTIVITY SCORE': subjectivity,
                'AVG SENTENCE LENGTH': avg_sent_len,
                'PERCENTAGE OF COMPLEX WORDS': perc_complex,
                'FOG INDEX': fog_index,
                'COMPLEX WORD COUNT': complex_words,
                'WORD COUNT': word_count,
                # Previously this column was filled with avg_word_len by mistake.
                'SYLLABLE PER WORD': syllable_per_word,
                'PERSONAL PRONOUNS': personal_pronouns,
                'AVG WORD LENGTH': avg_word_len,
            })

            print(f"All scores added for URL_ID: {url_id}")

        except Exception as e:
            # Best effort: report the failure and continue with the next article.
            print(f"Error analyzing file {file_path}: {e}")

    # Save to Excel
    df = pd.DataFrame(results)
    df.to_excel(output_file, index=False)

In [None]:
# Run the pipeline: scrape the articles, then score them.
extract_articles(INPUT_FILE, OUTPUT_DIR)
analyze_articles(OUTPUT_DIR, OUTPUT_FILE)
# Report the path actually written instead of a hard-coded file name.
print(f"Analysis complete and saved to {OUTPUT_FILE}")

Processing URL_ID: Netclan20241017, URL: https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/
Processing URL_ID: Netclan20241018, URL: https://insights.blackcoffer.com/enhancing-front-end-features-and-functionality-for-improved-user-experience-and-dashboard-accuracy-in-partner-hospital-application/
Processing URL_ID: Netclan20241019, URL: https://insights.blackcoffer.com/roas-dashboard-for-campaign-wise-google-ads-budget-tracking-using-google-ads-ap/
Processing URL_ID: Netclan20241020, URL: https://insights.blackcoffer.com/efficient-processing-and-analysis-of-financial-data-from-pdf-files-addressing-formatting-inconsistencies-and-ensuring-data-integrity-for-a-toyota-dealership-management-firm/
Processing URL_ID: Netclan20241021, URL: https://insights.blackcoffer.com/development-of-ea-robot-for-automated-trading/
Processing URL_ID: Netclan20241022, URL: https://insights.blackcoffer.com/ai-