# Import libraries and load the Excel file

In [1]:
import pandas as pd

In [2]:
# Load the Excel file
input_df = pd.read_excel('C://Users//Kavya//Desktop//internshala assignment//Input (2).xlsx')

In [3]:
# Extract the URLs and their corresponding URL_IDs
urls = input_df['URL']
url_ids = input_df['URL_ID']

In [4]:
print(urls.head())  # To check if URLs are loaded correctly
print(url_ids.head())  # To check if URL_IDs are loaded correctly

0    https://insights.blackcoffer.com/ml-and-ai-bas...
1    https://insights.blackcoffer.com/streamlined-i...
2    https://insights.blackcoffer.com/efficient-dat...
3    https://insights.blackcoffer.com/effective-man...
4    https://insights.blackcoffer.com/streamlined-t...
Name: URL, dtype: object
0    bctech2011
1    bctech2012
2    bctech2013
3    bctech2014
4    bctech2015
Name: URL_ID, dtype: object


# Extracting the Article Content

In [5]:
import requests
from bs4 import BeautifulSoup

In [6]:
def extract_article(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract the title
    title = soup.find('title').get_text()

    # Extract the article text (this will vary based on website structure)
    article_body = soup.find_all('p')  # Example: Find all <p> tags
    
    # Combine the text into a single string
    article_text = ' '.join([p.get_text() for p in article_body])
    
    return title, article_text

In [7]:
# Example usage
url = urls[0]  # Use the first URL for testing
title, article_text = extract_article(url)
print(title)
print(article_text[:500])  # Print first 500 characters of the article

ML and AI-based insurance premium model to predict premium to be charged by the insurance company - Blackcoffer Insights
Healthcare AI ChatBot using LLAMA, LLM, Langchain Efficient Supply Chain Assessment: Overcoming Technical Hurdles for Web Application Development Streamlined Integration: Interactive Brokers API with Python for Desktop Trading Application Efficient Data Integration and User-Friendly Interface Development: Navigating Challenges in Web Application Deployment AI Chatbot using LLM, Langchain, LLama AI Bot Audio to audio Methodology for ETL Discovery Tool using LLMA, OpenAI, Langchain Methodology for


In [8]:
# Save Articles as Text Files
for i, url in enumerate(urls):
    url_id = url_ids[i]
    title, article_text = extract_article(url)
    
    # Combine title and article text
    full_text = title + "\n\n" + article_text
    
    # Save to a text file
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
        file.write(full_text)

# Text Analysis

In [9]:
# Load the Extracted Articles
import nltk
from nltk.corpus import stopwords
import re

def load_article_text(url_id):
    with open(f'{url_id}.txt', 'r', encoding='utf-8') as file:
        text = file.read()
    return text

In [44]:
# Load Stopwords
import os
from nltk.corpus import stopwords

# Ensure stopwords are downloaded
nltk.download('stopwords')
nltk.download('punkt')

def load_stopwords(stopwords_dir):
    stopwords_set = set()
    
    for filename in os.listdir(stopwords_dir):
        filepath = os.path.join(stopwords_dir, filename)
        with open(filepath, 'r', encoding='ISO-8859-1') as file:
            for line in file:
                stopwords_set.add(line.strip().lower())  # Add each word to the set of stopwords
                
    return stopwords_set

# Load all stopwords 
stopwords_dir = ("C://Users//Kavya//Desktop//internshala assignment//StopWords")
stopwords = load_stopwords(stopwords_dir)

print("Loaded stopwords:", list(stopwords)[:20])  # Print the first 20 stopwords to verify


Loaded stopwords: ['renea', 'shelia', 'kept', 'crouse', 'meagher', 'kyles', 'rosann', 'renda', 'ardella', 'hoppe', 'fallon', 'cade', 'swaim', 'mayorga', 'bobby', 'lauretta', 'nicola', 'margeret', 'up', 'schrader']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kavya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
# Define Positive/Negative Word Dictionaries
# Load positive and negative word lists
def load_word_list(file_path):
    with open(file_path, 'r') as file:
        words = file.read().splitlines()
    return set(words)

positive_words = load_word_list('C://Users//Kavya//Desktop//internshala assignment//MasterDictionary//positive-words.txt')
negative_words = load_word_list('C://Users//Kavya//Desktop//internshala assignment//MasterDictionary//negative-words.txt')

In [12]:
positive_words

{'attentive',
 'catchy',
 'coherence',
 'lavish',
 'daringly',
 'righteous',
 'exalt',
 'romantic',
 'envy',
 'distinguished',
 'genuine',
 'agility',
 'masterpieces',
 'unequivocally',
 'fairness',
 'merriment',
 'ambitious',
 'inspirational',
 'masterpiece',
 'desirous',
 'lower-priced',
 'delicious',
 'contentment',
 'counter-attack',
 'beauty',
 'innovative',
 'prompt',
 'luck',
 'fulfillment',
 'high-spirited',
 'enviously',
 'fortitude',
 'harmoniously',
 'enthusiasm',
 'bliss',
 'reverence',
 'successfully',
 'enchanting',
 'ecstatic',
 'gold',
 'successful',
 'outshone',
 'faultless',
 'peacefully',
 'cashback',
 'groundbreaking',
 'stylish',
 'amicability',
 'ennoble',
 'ingenuously',
 'trusting',
 'jovial',
 'modest',
 'ambitiously',
 'steady',
 'titillate',
 'memorable',
 'fervor',
 'courtly',
 'eloquently',
 'decisive',
 'heaven',
 'competitive',
 'fantastic',
 'work',
 'profuse',
 'courageousness',
 'defeats',
 'brighten',
 'nice',
 'judicious',
 'spirited',
 'perseverance

In [13]:
negative_words

{'ungrateful',
 'indecently',
 'overawe',
 'oblique',
 'immoral',
 'prejudices',
 'desecrate',
 'provocation',
 'bleak',
 'feckless',
 'excessively',
 'scream',
 'throbbing',
 'mortified',
 'stingy',
 'oddly',
 'craziness',
 'confession',
 'inconceivable',
 'misinform',
 'crappy',
 'violent',
 'full-blown',
 'inextricable',
 'nastily',
 'notoriety',
 'vehement',
 'steep',
 'cruelty',
 'implausibly',
 'felon',
 'hardball',
 'dents',
 'ineffectiveness',
 'irrationals',
 'feign',
 'pitilessly',
 'boisterous',
 'straggler',
 'incomprehension',
 'hates',
 'delude',
 'betrayal',
 'bewitch',
 'forfeit',
 'motionless',
 'picketing',
 'tetchy',
 'tepid',
 'cynicism',
 'disaffect',
 'disservice',
 'abysmally',
 'berserk',
 'draconian',
 'sully',
 'premeditated',
 'ridiculously',
 'sorrowful',
 'hoax',
 'bug',
 'scandal',
 'disgustfully',
 'strictly',
 'swamped',
 'insubstantially',
 'tarnishes',
 'indignant',
 'panicky',
 'unkindly',
 'desperate',
 'stern',
 'bombastic',
 'debts',
 'strict',
 's

In [45]:
# Sentiment Analysis Function
def analyze_sentiment(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word.lower() not in stopwords]
    
    # Calculate scores
    positive_score = sum(1 for word in tokens if word.lower() in positive_words)
    negative_score = sum(1 for word in tokens if word.lower() in negative_words)
    
    # Calculate polarity and subjectivity using TextBlob
    blob = TextBlob(text)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity
    
    return positive_score, negative_score, polarity_score, subjectivity_score

In [47]:
# Additional Analysis (Word Count, Personal Pronouns, etc.)
# Additional Analysis Functions
def additional_analysis(text):
    words = nltk.word_tokenize(text)
    
    # Word Count
    word_count = len([word for word in words if word.isalpha()])
    
    # Syllable Count Per Word
    syllable_count_per_word = sum(len([char for char in word if char in 'aeiouAEIOU']) for word in words)
    
    # Personal Pronouns
    personal_pronouns = len(re.findall(r'\b(I|we|my|ours|us)\b', text, re.I))
    
    # Average Word Length
    avg_word_length = sum(len(word) for word in words if word.isalpha()) / word_count 
    
    return word_count, syllable_count_per_word, personal_pronouns, avg_word_length

In [59]:
# Readability and Other Metrics
def calculate_readability(text):
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    
    # Calculate Average Sentence Length
    avg_sentence_length = len(words) / len(sentences)
    
    # Complex Words
    complex_words = [word for word in words if len([char for char in word if char in 'aeiouAEIOU']) > 2]
    complex_word_count = len(complex_words)
    percentage_complex_words = complex_word_count / len(words)
    
    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    
    # Average Number of Words per Sentence
    avg_words_per_sentence = len(words) / len(sentences) if len(sentences) > 0 else 0
    
    return avg_sentence_length, percentage_complex_words, fog_index, complex_word_count, avg_words_per_sentence

# Compile Results and Save to Excel

In [66]:
# Process All Articles and Store Results
results = []

for i, url_id in enumerate(url_ids):
    text = load_article_text(url_id)
    
    # Sentiment Analysis
    positive_score, negative_score, polarity_score, subjectivity_score = analyze_sentiment(text)
    
    # Readability Analysis
    avg_sentence_length, percentage_complex_words, fog_index, complex_word_count, avg_words_per_sentence = calculate_readability(text)
    
    # Additional Analysis
    word_count, syllable_count_per_word, personal_pronouns, avg_word_length = additional_analysis(text)
    
    # Append all results together
    results.append([
        url_ids[i], urls[i],
        positive_score, negative_score, polarity_score, subjectivity_score,
        avg_sentence_length, percentage_complex_words, fog_index,
        complex_word_count, word_count, syllable_count_per_word,
        personal_pronouns, avg_word_length, avg_words_per_sentence
    ])

In [67]:
# Save the Results to Output Data Structure.xlsx
output_df = pd.DataFrame(results, columns=['URL_ID', 'URL','POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                                           'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
                                           'AVG NUMBER OF WORDS PER SENTENCE','COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD',
                                           'PERSONAL PRONOUNS', 'AVG WORD LENGTH'])

output_df.to_excel('C://Users//Kavya//Desktop//internshala assignment//Output Data Structure.xlsx', index=False)