In [1]:
# !pip install requests
# !pip install beautifulsoup4
# !pip install openpyxl
# !pip install textblob

# Import the Required Libraries

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from textblob import TextBlob
import re
import os
import numpy as np

# Load the Input Data (URLs)

In [10]:
# Load Input.xlsx
# The workbook is read from the notebook's working directory. The processing
# loop further down reads the 'URL_ID' and 'URL' columns of this frame.
input_data = pd.read_excel("Input.xlsx")

# Display input data
# print() gives the plain-text repr of the frame (pandas truncates long frames
# and long cell values in this view).
print(input_data)

         URL_ID                                                URL
0    bctech2011  https://insights.blackcoffer.com/ml-and-ai-bas...
1    bctech2012  https://insights.blackcoffer.com/streamlined-i...
2    bctech2013  https://insights.blackcoffer.com/efficient-dat...
3    bctech2014  https://insights.blackcoffer.com/effective-man...
4    bctech2015  https://insights.blackcoffer.com/streamlined-t...
..          ...                                                ...
142  bctech2153  https://insights.blackcoffer.com/population-an...
143  bctech2154  https://insights.blackcoffer.com/google-lsa-ap...
144  bctech2155  https://insights.blackcoffer.com/healthcare-da...
145  bctech2156  https://insights.blackcoffer.com/budget-sales-...
146  bctech2157  https://insights.blackcoffer.com/amazon-buy-bo...

[147 rows x 2 columns]


# Define Helper Functions for Text Extraction and Analysis

In [14]:
def extract_text_from_url(url, timeout=30):
    """Download a web page and return its text content as a single string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely on a
        server that never answers.

    Returns
    -------
    str or None
        Space-joined text of the page's headings, paragraphs, list items,
        spans and divs. ``None`` is returned when the page cannot be fetched
        or parsed; the error is printed rather than raised so that a batch
        run can continue with the next URL.
    """
    text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP errors (404, 500, ...) as failures instead of analysing
        # the error page as if it were the article.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Keep only the outermost matching elements. get_text() already
        # includes the text of every descendant, so also collecting a <p>
        # that sits inside a <div> would repeat the same words once per
        # nesting level and inflate every count computed from the text.
        tag_set = set(text_tags)
        elements = [
            element
            for element in soup.find_all(text_tags)
            if not any(parent.name in tag_set for parent in element.parents)
        ]

        # Join all text content from the extracted elements
        text = ' '.join(element.get_text(separator=" ", strip=True) for element in elements)

        return text
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip
        # this URL.
        print(f"Error extracting text from {url}: {e}")
        return None

# def extract_text_from_url(url):
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         # Extract all text from the webpage
#         text = ' '.join([p.text for p in soup.find_all('p')])
#         return text
#     except Exception as e:
#         print(f"Error extracting text from {url}: {e}")
#         return None

In [15]:
# url = 'https://insights.blackcoffer.com/ml-and-ai-based-insurance-premium-model-to-predict-premium-to-be-charged-by-the-insurance-company/'
# d1 = extract_text_from_url(url)
# d1

'  Sign in Our Success Stories Banking, Financials, Securities, and Insurance Energy Entertainment Fast Moving Consumer Goods Government & Think Tanks Healthcare Infrastructure & Real Estate IT Lifestyle, eCommerce & Online Market Place News & Media Production & Manufacturing Research & Academia Retail & Supply Chain Telecom What We Do Banking, Financials, Securities, and Insurance Energy Entertainment Fast Moving Consumer Goods Government & Think Tanks Healthcare Hospitality Infrastructure & Real Estate IT Services Lifestyle, eCommerce & Online Market Place News & Media Production & Manufacturing Research & Academia Retail & Supply Chain What We Think Automobiles & Components BFSI Asset and Portfolio Banks Capital Markets Derivatives and Securities Diversified Financials Finance & Accounting Insurance Securities and Capital Markets Capital Goods Commercial & Professional Services Consumer Discretionary Consumer Durables & Apparel Consumer Services Consumer Staples Food & Staples Retai

## Text Preprocessing:

In [13]:
def clean_text(text, stopwords):
    """Normalise text for word-level analysis.

    The text is lowercased, every character that is not a lowercase ASCII
    letter or whitespace is deleted (digits and punctuation are removed
    without inserting a space), and tokens found in ``stopwords`` are dropped.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stopwords : container of str
        Tokens to remove; membership is tested against lowercased tokens.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = filter(lambda token: token not in stopwords, letters_only.split())
    return ' '.join(kept_tokens)

def load_stopwords(stopwords_dir='StopWords'):
    """Read every stop-word file in a directory into one set.

    Parameters
    ----------
    stopwords_dir : str, optional
        Directory whose files each hold one stop word per line
        (default ``'StopWords'``, relative to the working directory).

    Returns
    -------
    set of str
        All stop words, stripped of surrounding whitespace and lowercased.
        ``clean_text`` compares stop words against lowercased tokens, so an
        entry stored in upper case or with trailing spaces would otherwise
        never match anything.
    """
    stopwords = set()
    for file_name in sorted(os.listdir(stopwords_dir)):
        path = os.path.join(stopwords_dir, file_name)
        # os.listdir also returns sub-directories; open() would fail on them.
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as file:
            for line in file.read().splitlines():
                word = line.strip().lower()
                if word:  # ignore blank lines
                    stopwords.add(word)
    return stopwords

## Sentiment and Readability Scores:

In [6]:
def calculate_sentiment(text, positive_words, negative_words):
    """Score a text against positive and negative word lists.

    Parameters
    ----------
    text : str
        Text to score; it is split on whitespace into tokens.
    positive_words, negative_words : container of str
        Lexicons used for membership tests on each token.

    Returns
    -------
    tuple
        ``(positive_score, negative_score, polarity_score, subjectivity_score)``
        where the first two are token counts, polarity is
        ``(pos - neg) / (pos + neg + 1e-6)`` and subjectivity is the value
        reported by ``TextBlob(text).sentiment.subjectivity``.
    """
    positive_score = 0
    negative_score = 0
    for token in text.split():
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1

    # The small constant keeps the ratio defined when no lexicon word occurs.
    difference = positive_score - negative_score
    total = positive_score + negative_score
    polarity_score = difference / (total + 1e-6)

    subjectivity_score = TextBlob(text).sentiment.subjectivity
    return positive_score, negative_score, polarity_score, subjectivity_score

def calculate_readability_metrics(text):
    """Compute sentence-length and complex-word readability figures.

    Parameters
    ----------
    text : str
        Text to measure. Words are whitespace-separated tokens; sentences are
        the non-empty pieces obtained by splitting on ``'.'``.

    Returns
    -------
    tuple
        ``(avg_sentence_length, percentage_complex_words, fog_index,
        complex_word_count, word_count)`` where a complex word is one for
        which ``syllable_count`` returns more than 2,
        ``percentage_complex_words`` is the fraction
        ``complex_word_count / word_count`` and
        ``fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)``.
        All values are zero for text that contains no words.
    """
    words = text.split()
    word_count = len(words)

    # split('.') produces empty pieces for a trailing period or for "...";
    # only pieces with actual content are sentences. Text without any such
    # piece is treated as a single sentence so the division stays defined.
    sentence_count = sum(1 for sentence in text.split('.') if sentence.strip())
    sentence_count = max(sentence_count, 1)

    complex_word_count = sum(1 for word in words if syllable_count(word) > 2)

    avg_sentence_length = word_count / sentence_count
    # Guard against text with no words (e.g. everything removed by cleaning).
    percentage_complex_words = complex_word_count / word_count if word_count else 0.0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    return avg_sentence_length, percentage_complex_words, fog_index, complex_word_count, word_count

def syllable_count(word):
    """Estimate the syllables in a word as its number of vowel groups.

    A vowel group is a maximal run of the letters a, e, i, o, u or y; the
    word is lowercased first so the count is case-insensitive.
    """
    vowel_groups = re.finditer(r'[aeiouy]+', word.lower())
    return sum(1 for _group in vowel_groups)

## Main Script to Process All URLs

In [7]:
# Load positive and negative word dictionaries (one word per line)
with open('MasterDictionary/positive-words.txt', 'r') as file:
    positive_words = set(file.read().splitlines())

with open('MasterDictionary/negative-words.txt', 'r') as file:
    negative_words = set(file.read().splitlines())

# Load stop words
stopwords = load_stopwords()

# List to store the results (one dict per successfully processed URL)
results = []

for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    print(f"Processing URL ID {url_id}: {url}")

    # Extract text from the URL
    text = extract_text_from_url(url)

    # A failed download (None) or an empty page produces no output row.
    if text:
        # Clean the text
        cleaned_text = clean_text(text, stopwords)

        # Calculate sentiment scores
        positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment(cleaned_text, positive_words, negative_words)

        # Calculate readability metrics.
        # The sentence-based ratios must come from the RAW text: clean_text
        # deletes every '.', so on cleaned text the whole article counts as a
        # single sentence and AVG SENTENCE LENGTH would just equal WORD COUNT.
        raw_metrics = calculate_readability_metrics(text)
        avg_sentence_length, percentage_complex_words, fog_index = raw_metrics[:3]

        # The reported counts stay on the cleaned (stop-word-free) text.
        complex_word_count, word_count = calculate_readability_metrics(cleaned_text)[3:]

        # Save the results for this URL
        result = {
            'URL_ID': url_id,
            'URL': url,
            'POSITIVE SCORE': positive_score,
            'NEGATIVE SCORE': negative_score,
            'POLARITY SCORE': polarity_score,
            'SUBJECTIVITY SCORE': subjectivity_score,
            'AVG SENTENCE LENGTH': avg_sentence_length,
            'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
            'FOG INDEX': fog_index,
            'COMPLEX WORD COUNT': complex_word_count,
            'WORD COUNT': word_count
        }

        results.append(result)

Processing URL ID bctech2011: https://insights.blackcoffer.com/ml-and-ai-based-insurance-premium-model-to-predict-premium-to-be-charged-by-the-insurance-company/
Processing URL ID bctech2012: https://insights.blackcoffer.com/streamlined-integration-interactive-brokers-api-with-python-for-desktop-trading-application/
Processing URL ID bctech2013: https://insights.blackcoffer.com/efficient-data-integration-and-user-friendly-interface-development-navigating-challenges-in-web-application-deployment/
Processing URL ID bctech2014: https://insights.blackcoffer.com/effective-management-of-social-media-data-extraction-strategies-for-authentication-security-and-reliability/
Processing URL ID bctech2015: https://insights.blackcoffer.com/streamlined-trading-operations-interface-for-metatrader-4-empowering-efficient-management-and-monitoring/
Processing URL ID bctech2016: https://insights.blackcoffer.com/efficient-aws-infrastructure-setup-and-management-addressing-security-scalability-and-complianc

In [8]:
# Convert results to DataFrame and save to Excel
# `results` is the list of per-URL dicts built by the processing loop; each
# dict key becomes a column. URLs that produced no text have no row here.
output_df = pd.DataFrame(results)
# index=False leaves the DataFrame's integer index out of the spreadsheet.
output_df.to_excel("output.xlsx", index=False)

In [None]:
# Save the output