In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'URL_ID': 'bctech2011', 'URL': 'https://insights.blackcoffer.com/ml-and-ai-based-insurance-premium-model-to-predict-premium-to-be-charged-by-the-insurance-company/', 'POSITIVE_SCORE': 34, 'NEGATIVE_SCORE': 8, 'POLARITY_SCORE': 0.6190476043083905, 'SUBJECTIVITY_SCORE': 0.07155025541473552, 'AVG_SENTENCE_LENGTH': 587.0, 'PERCENTAGE_COMPLEX_WORDS': 51.618398637137986, 'FOG_INDEX': 255.4473594548552, 'AVG_NUMBER_OF_WORDS_PER_SENTENCE': 587.0, 'COMPLEX_WORD_COUNT': 303, 'WORD_COUNT': 587, 'SYLLABLE_PER_WORD': 2.5928449744463373, 'PERSONAL_PRONOUNS': 0, 'AVG_WORD_LENGTH': 7.630323679727428}
{'URL_ID': 'bctech2012', 'URL': 'https://insights.blackcoffer.com/streamlined-integration-interactive-brokers-api-with-python-for-desktop-trading-application/', 'POSITIVE_SCORE': 4, 'NEGATIVE_SCORE': 0, 'POLARITY_SCORE': 0.9999997500000625, 'SUBJECTIVITY_SCORE': 0.021390374217163773, 'AVG_SENTENCE_LENGTH': 187.0, 'PERCENTAGE_COMPLEX_WORDS': 42.780748663101605, 'FOG_INDEX': 91.91229946524065, 'AVG_NUMBER_

In [None]:
# Fetch the Punkt tokenizer data needed by word_tokenize / sent_tokenize.
# Newer NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so request both to keep the notebook runnable on either.
# NOTE(review): on releases that do not list 'punkt_tab' the downloader is
# expected to report the unknown id and return False rather than raise — confirm.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

def load_words(file_path, encoding='utf-8'):
    """Read a one-entry-per-line word list into a set.

    The file is decoded with ``encoding`` first; if that raises
    ``UnicodeDecodeError`` the file is re-read as ISO-8859-1.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so trailing spaces or empty lines in a list file cannot create
    entries (such as ``''``) that would never match a token.

    Args:
        file_path: Path of the word-list file.
        encoding: Encoding tried first (default ``'utf-8'``).

    Returns:
        set[str]: The distinct non-empty, whitespace-trimmed lines of the file.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            content = file.read()
    except UnicodeDecodeError:
        # Some word lists are not valid UTF-8; fall back to a single-byte decoding.
        with open(file_path, 'r', encoding='ISO-8859-1') as file:
            content = file.read()

    trimmed_lines = (line.strip() for line in content.splitlines())
    return {entry for entry in trimmed_lines if entry}



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Stop words: the union of every stop-word list file, loaded in the order given.
# NOTE(review): clean_text() lower-cases each token before testing membership in
# this set — confirm the entries in these files are lower-case, otherwise they
# can never match.
stop_words = set().union(
    *(
        load_words(list_file)
        for list_file in (
            'StopWords_Names.txt',
            'StopWords_Geographic.txt',
            'StopWords_GenericLong.txt',
        )
    )
)

# Sentiment lexicons consulted by compute_sentiment_scores().
positive_words = load_words('positive-words.txt')
negative_words = load_words('negative-words.txt')



In [None]:
def read_article_text(url_id):
    """Return the saved article text for ``url_id``.

    The text is read from ``<url_id>.txt`` (relative to the working
    directory) as UTF-8.

    Args:
        url_id: Identifier used to build the file name.

    Returns:
        str: The file's full contents, or ``""`` when the file does not
        exist (a notice is printed in that case).
    """
    article_path = f'{url_id}.txt'
    try:
        handle = open(article_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        print(f"File for URL_ID {url_id} not found.")
        return ""
    with handle:
        return handle.read()



In [None]:
def clean_text(text):
    """Tokenize ``text`` and drop stop words and non-alphanumeric tokens.

    A token is kept only if it is purely alphanumeric (so punctuation tokens
    are removed) and its lower-cased form is not in the module-level
    ``stop_words`` set. Kept tokens retain their original casing.

    Args:
        text: Raw article text.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if not token.isalnum():
            continue
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)



In [None]:
def compute_sentiment_scores(cleaned_text):
    """Score whitespace-separated tokens against the sentiment lexicons.

    Each token is looked up, as written, in the module-level
    ``positive_words`` and ``negative_words`` sets.

    Args:
        cleaned_text: Space-separated tokens.

    Returns:
        tuple: ``(positive_score, negative_score, polarity_score,
        subjectivity_score)`` where

        * polarity = (positive - negative) / (positive + negative + 1e-6)
        * subjectivity = (positive + negative) / (token count + 1e-6)

        The ``1e-6`` term keeps both ratios defined when a denominator
        would otherwise be zero.
    """
    tokens = cleaned_text.split()

    positive_score = 0
    negative_score = 0
    for token in tokens:
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1

    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 1e-6)
    subjectivity_score = (positive_score + negative_score) / (len(tokens) + 1e-6)
    return positive_score, negative_score, polarity_score, subjectivity_score



In [None]:
def compute_readability_metrics(cleaned_text):
    """Compute readability statistics for a block of text.

    Words are the whitespace-separated tokens of ``cleaned_text``; sentences
    come from ``sent_tokenize``. Syllables are approximated as the number of
    vowel groups (``[aeiouy]+``) in a word, matched case-insensitively so
    capitalised words and acronyms are counted like any other word. A word
    is "complex" when it has more than two such groups.

    NOTE(review): when ``cleaned_text`` comes from ``clean_text()`` all
    punctuation tokens have been removed, so ``sent_tokenize`` will typically
    report a single sentence and the sentence-based metrics equal the word
    count; count sentences on the raw text if true sentence statistics are
    needed.
    NOTE(review): the pronoun pattern is case-insensitive, so the token "US"
    is counted as the pronoun "us" — confirm this is intended.

    Args:
        cleaned_text: Text to analyse.

    Returns:
        dict: Keys ``AVG_SENTENCE_LENGTH``, ``PERCENTAGE_COMPLEX_WORDS``,
        ``FOG_INDEX``, ``AVG_NUMBER_OF_WORDS_PER_SENTENCE``,
        ``COMPLEX_WORD_COUNT``, ``WORD_COUNT``, ``SYLLABLE_PER_WORD``,
        ``PERSONAL_PRONOUNS`` and ``AVG_WORD_LENGTH``. Every value is zero
        when the text contains no words.
    """
    words = cleaned_text.split()
    word_count = len(words)

    if word_count == 0:
        # No words means every ratio below would divide by zero.
        return {
            'AVG_SENTENCE_LENGTH': 0.0,
            'PERCENTAGE_COMPLEX_WORDS': 0.0,
            'FOG_INDEX': 0.0,
            'AVG_NUMBER_OF_WORDS_PER_SENTENCE': 0.0,
            'COMPLEX_WORD_COUNT': 0,
            'WORD_COUNT': 0,
            'SYLLABLE_PER_WORD': 0.0,
            'PERSONAL_PRONOUNS': 0,
            'AVG_WORD_LENGTH': 0.0
        }

    # Guard against a tokenizer that returns no sentences for non-empty text.
    sentence_count = len(sent_tokenize(cleaned_text)) or 1

    # One regex pass per word; reused for both the complex-word and syllable totals.
    syllables_by_word = [len(re.findall(r'[aeiouy]+', word.lower())) for word in words]
    complex_word_count = sum(1 for syllables in syllables_by_word if syllables > 2)

    avg_sentence_length = word_count / sentence_count
    percentage_complex_words = complex_word_count / word_count * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    syllable_per_word = sum(syllables_by_word) / word_count
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', cleaned_text, re.I))
    avg_word_length = sum(len(word) for word in words) / word_count

    return {
        'AVG_SENTENCE_LENGTH': avg_sentence_length,
        'PERCENTAGE_COMPLEX_WORDS': percentage_complex_words,
        'FOG_INDEX': fog_index,
        # Same quantity as AVG_SENTENCE_LENGTH; both keys are part of the output schema.
        'AVG_NUMBER_OF_WORDS_PER_SENTENCE': avg_sentence_length,
        'COMPLEX_WORD_COUNT': complex_word_count,
        'WORD_COUNT': word_count,
        'SYLLABLE_PER_WORD': syllable_per_word,
        'PERSONAL_PRONOUNS': personal_pronouns,
        'AVG_WORD_LENGTH': avg_word_length
    }

# Empty results table. It is filled by the processing loop, and its column
# order is the layout of the exported sheet.
output_df = pd.DataFrame(
    columns=[
        # identification
        'URL_ID',
        'URL',
        # sentiment
        'POSITIVE_SCORE',
        'NEGATIVE_SCORE',
        'POLARITY_SCORE',
        'SUBJECTIVITY_SCORE',
        # readability
        'AVG_SENTENCE_LENGTH',
        'PERCENTAGE_COMPLEX_WORDS',
        'FOG_INDEX',
        'AVG_NUMBER_OF_WORDS_PER_SENTENCE',
        'COMPLEX_WORD_COUNT',
        'WORD_COUNT',
        'SYLLABLE_PER_WORD',
        'PERSONAL_PRONOUNS',
        'AVG_WORD_LENGTH',
    ]
)



In [None]:
# Read the input file
input_df = pd.read_excel('Input.xlsx')
urls = input_df['URL']
url_ids = input_df['URL_ID']

# Collect one record per article and build the frame once after the loop.
# Concatenating inside the loop re-copies the whole table on every iteration
# and appends duplicate rows whenever this cell is run again.
records = []
for url_id, url in zip(url_ids, urls):
    text = read_article_text(url_id)
    cleaned_text = clean_text(text) if text.strip() else ""
    if not cleaned_text.strip():
        # Covers both a missing/blank article and one that consists solely of
        # stop words and punctuation (which would otherwise divide by zero
        # in the readability metrics).
        print(f"No text extracted for URL_ID {url_id}.")
        continue

    pos_score, neg_score, pol_score, subj_score = compute_sentiment_scores(cleaned_text)
    readability_metrics = compute_readability_metrics(cleaned_text)

    # clean_text() drops every non-alphanumeric token, so the cleaned text has
    # no sentence punctuation left and sentence-based metrics computed from it
    # collapse to "one sentence" (AVG_SENTENCE_LENGTH == WORD_COUNT). Count
    # sentences on the raw article instead and recompute the dependent values
    # with the same formulas used by compute_readability_metrics().
    sentence_count = len(sent_tokenize(text)) or 1
    words_per_sentence = readability_metrics['WORD_COUNT'] / sentence_count
    readability_metrics.update({
        'AVG_SENTENCE_LENGTH': words_per_sentence,
        'AVG_NUMBER_OF_WORDS_PER_SENTENCE': words_per_sentence,
        'FOG_INDEX': 0.4 * (words_per_sentence + readability_metrics['PERCENTAGE_COMPLEX_WORDS']),
    })

    row = {
        'URL_ID': url_id, 'URL': url, 'POSITIVE_SCORE': pos_score, 'NEGATIVE_SCORE': neg_score,
        'POLARITY_SCORE': pol_score, 'SUBJECTIVITY_SCORE': subj_score, **readability_metrics
    }
    print(row)
    records.append(row)

# Keep the column order declared when output_df was first created.
output_df = pd.DataFrame(records, columns=output_df.columns)



{'URL_ID': 'bctech2011', 'URL': 'https://insights.blackcoffer.com/ml-and-ai-based-insurance-premium-model-to-predict-premium-to-be-charged-by-the-insurance-company/', 'POSITIVE_SCORE': 34, 'NEGATIVE_SCORE': 8, 'POLARITY_SCORE': 0.6190476043083905, 'SUBJECTIVITY_SCORE': 0.07155025541473552, 'AVG_SENTENCE_LENGTH': 587.0, 'PERCENTAGE_COMPLEX_WORDS': 51.618398637137986, 'FOG_INDEX': 255.4473594548552, 'AVG_NUMBER_OF_WORDS_PER_SENTENCE': 587.0, 'COMPLEX_WORD_COUNT': 303, 'WORD_COUNT': 587, 'SYLLABLE_PER_WORD': 2.5928449744463373, 'PERSONAL_PRONOUNS': 0, 'AVG_WORD_LENGTH': 7.630323679727428}
{'URL_ID': 'bctech2012', 'URL': 'https://insights.blackcoffer.com/streamlined-integration-interactive-brokers-api-with-python-for-desktop-trading-application/', 'POSITIVE_SCORE': 4, 'NEGATIVE_SCORE': 0, 'POLARITY_SCORE': 0.9999997500000625, 'SUBJECTIVITY_SCORE': 0.021390374217163773, 'AVG_SENTENCE_LENGTH': 187.0, 'PERCENTAGE_COMPLEX_WORDS': 42.780748663101605, 'FOG_INDEX': 91.91229946524065, 'AVG_NUMBER_

In [None]:
# Show the assembled results table; as the cell's last expression it is
# rendered with the notebook's rich DataFrame display.
output_df

Unnamed: 0,URL_ID,URL,POSITIVE_SCORE,NEGATIVE_SCORE,POLARITY_SCORE,SUBJECTIVITY_SCORE,AVG_SENTENCE_LENGTH,PERCENTAGE_COMPLEX_WORDS,FOG_INDEX,AVG_NUMBER_OF_WORDS_PER_SENTENCE,COMPLEX_WORD_COUNT,WORD_COUNT,SYLLABLE_PER_WORD,PERSONAL_PRONOUNS,AVG_WORD_LENGTH
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,34,8,0.619048,0.071550,587.0,51.618399,255.447359,587.0,303,587,2.592845,0,7.630324
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,4,0,1.000000,0.021390,187.0,42.780749,91.912299,187.0,80,187,2.347594,0,7.491979
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,4,0,1.000000,0.021622,185.0,44.324324,91.729730,185.0,82,185,2.367568,0,7.524324
3,bctech2014,https://insights.blackcoffer.com/effective-man...,4,0,1.000000,0.021390,187.0,43.850267,92.340107,187.0,82,187,2.390374,0,7.518717
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,4,0,1.000000,0.021277,188.0,44.148936,92.859574,188.0,83,188,2.372340,0,7.537234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...,10,8,0.111111,0.027273,660.0,38.181818,279.272727,660.0,252,660,2.201515,0,6.631818
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...,14,9,0.217391,0.024758,929.0,34.337998,385.335199,929.0,319,929,2.174381,0,6.584499
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...,9,6,0.200000,0.063025,238.0,36.974790,109.989916,238.0,88,238,2.239496,0,6.903361
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...,3,0,1.000000,0.020270,148.0,43.918919,76.767568,148.0,65,148,2.445946,0,7.682432


In [None]:
# Export the results table to an Excel workbook, omitting the row index.
output_df.to_excel(excel_writer='Output Data Structure.xlsx', index=False)
print("DataFrame saved successfully.")


DataFrame saved successfully.
