# Installing required libraries

In [1]:
pip install pandas requests beautifulsoup4 nltk



In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, sent_tokenize

# Loading the Excel file

In [4]:
# Load the Excel file
# NOTE(review): absolute path, presumably the Google Colab upload location —
# adjust when running elsewhere.
input_file = "/content/Input.xlsx"
# The loops further down read the 'URL_ID' and 'URL' columns of this sheet.
df = pd.read_excel(input_file)

# Function to extract text from the URL
def extract_article_text(url, timeout=30):
    """Download a web page and return its headline plus paragraph text.

    The first <h1> element is used as the title and the text of every <p>
    element, joined with newlines, as the body.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The title, a newline, and the body text; or None when the page could
        not be fetched, returned an HTTP error status, or held no text at all
        (a message is printed in each of those cases).
    """
    try:
        # Without a timeout a single unresponsive host would hang the whole run
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx replies as failures so error pages are not stored as articles
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Assuming that the article title is in <h1> and the text in <p> tags
        heading = soup.find('h1')
        # A page without <h1> may still carry body text; fall back to an empty title
        title = heading.get_text(strip=True) if heading is not None else ''
        paragraphs = soup.find_all('p')
        article_text = '\n'.join(para.get_text(strip=True) for para in paragraphs)

        if not title and not article_text:
            # Nothing to analyse; report it instead of returning a lone newline
            print(f"Error fetching {url}: no title or paragraph text found")
            return None

        return title + '\n' + article_text
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort the batch
        print(f"Error fetching {url}: {e}")
        return None

# Folder that receives one <URL_ID>.txt file per successfully scraped article
output_dir = 'extracted_articles'
os.makedirs(output_dir, exist_ok=True)

# Scrape every URL listed in the input sheet and persist the text to disk
for index, row in df.iterrows():
    url, url_id = row['URL'], row['URL_ID']
    article_text = extract_article_text(url)

    if not article_text:
        # Nothing usable came back for this URL, so no file is written
        continue

    target_path = f'{output_dir}/{url_id}.txt'
    with open(target_path, 'w', encoding='utf-8') as f:
        f.write(article_text)


In [6]:
# Download required nltk data
nltk.download('vader_lexicon')  # lexicon behind SentimentIntensityAnalyzer
nltk.download('punkt')          # tokenizer models used by older NLTK releases
nltk.download('punkt_tab')      # tokenizer tables that recent NLTK releases load instead of 'punkt'

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Function to compute text analysis metrics
def analyze_text(article_text):
    """Compute sentiment and readability metrics for one article.

    Args:
        article_text: Full text of the article as a single string.

    Returns:
        A dict with the keys positive_score, negative_score, polarity_score,
        subjectivity_score, avg_sentence_length, percentage_complex_words,
        fog_index, avg_words_per_sentence, complex_word_count, word_count,
        syllable_per_word, personal_pronouns and avg_word_length.
        Ratio metrics are 0.0 when the text yields no sentences or tokens.
    """
    # Calculate Positive, Negative, Polarity, and Subjectivity Scores
    sentiment = sia.polarity_scores(article_text)
    positive_score = sentiment['pos']
    negative_score = sentiment['neg']
    polarity_score = sentiment['compound']
    # The small epsilon keeps the division defined for text without any words
    subjectivity_score = (positive_score + negative_score) / (len(article_text.split()) + 1e-6)

    # Sentence and word counts
    sentences = sent_tokenize(article_text)
    words = word_tokenize(article_text)
    sentence_count = len(sentences)
    word_count = len(words)

    # Empty or whitespace-only text produces no tokens; report zeros instead
    # of raising ZeroDivisionError.
    avg_sentence_length = word_count / sentence_count if sentence_count else 0.0
    complex_words = [word for word in words if len(word) > 6]  # Example of complex words
    complex_word_count = len(complex_words)
    # Stored as a fraction between 0 and 1, not multiplied by 100
    percentage_complex_words = complex_word_count / word_count if word_count else 0.0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = avg_sentence_length

    # Other metrics
    # Syllables are approximated as half the token length (simplified example)
    syllable_per_word = sum(len(word) // 2 for word in words) / word_count if word_count else 0.0
    personal_pronouns = sum(1 for word in words if word.lower() in ('i', 'we', 'my', 'ours', 'us'))
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0.0

    return {
        "positive_score": positive_score,
        "negative_score": negative_score,
        "polarity_score": polarity_score,
        "subjectivity_score": subjectivity_score,
        "avg_sentence_length": avg_sentence_length,
        "percentage_complex_words": percentage_complex_words,
        "fog_index": fog_index,
        "avg_words_per_sentence": avg_words_per_sentence,
        "complex_word_count": complex_word_count,
        "word_count": word_count,
        "syllable_per_word": syllable_per_word,
        "personal_pronouns": personal_pronouns,
        "avg_word_length": avg_word_length
    }

# Apply the analysis to all extracted articles
output_data = []

for index, row in df.iterrows():
    url_id = row['URL_ID']
    try:
        with open(f'{output_dir}/{url_id}.txt', 'r', encoding='utf-8') as f:
            article_text = f.read()
    except FileNotFoundError:
        # The extraction step writes no file for a URL it could not scrape.
        # Keep the input row (its metric columns stay empty) instead of
        # aborting the whole run on the first missing article.
        print(f"No extracted article for {url_id}; metrics left blank")
        output_data.append(row.to_dict())
        continue

    analysis_results = analyze_text(article_text)
    output_data.append({**row, **analysis_results})

# Convert results to DataFrame
output_df = pd.DataFrame(output_data)

# Save the results to Excel
output_file = "Output_Data_Structure.xlsx"
output_df.to_excel(output_file, index=False)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
# Download required nltk data
nltk.download('vader_lexicon')  # lexicon behind SentimentIntensityAnalyzer
nltk.download('punkt')          # tokenizer models used by older NLTK releases
nltk.download('punkt_tab')      # tokenizer tables that recent NLTK releases load instead of 'punkt'

# Initialize Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Function to find positive_score, negative_score, polarity_score, subjectivity_score

In [8]:
def sentiment_scores(article_text):
    """Return (positive, negative, polarity, subjectivity) for a text.

    The first three values are the 'pos', 'neg' and 'compound' entries of
    sia.polarity_scores; subjectivity is (pos + neg) divided by the number of
    whitespace-separated tokens, with a tiny epsilon added to the divisor.
    """
    vader = sia.polarity_scores(article_text)
    pos, neg, compound = vader['pos'], vader['neg'], vader['compound']

    token_total = len(article_text.split())
    subjectivity = (pos + neg) / (token_total + 1e-6)

    return pos, neg, compound, subjectivity

# Function to find the average sentence length, complex words, fog index, etc.

In [9]:
def sentence_word_analysis(article_text):
    """Compute sentence-length and complex-word statistics for a text.

    Args:
        article_text: Full text of the article as a single string.

    Returns:
        A 6-tuple: (avg_sentence_length, percentage_complex_words, fog_index,
        avg_words_per_sentence, complex_word_count, word_count). The ratios
        are 0.0 when the text yields no sentences or no tokens.
    """
    sentences = sent_tokenize(article_text)
    words = word_tokenize(article_text)
    word_count = len(words)

    # Empty or whitespace-only text produces no tokens; report zeros instead
    # of raising ZeroDivisionError.
    avg_sentence_length = word_count / len(sentences) if sentences else 0.0
    complex_words = [word for word in words if len(word) > 6]  # Example of complex words
    # Stored as a fraction between 0 and 1, not multiplied by 100
    percentage_complex_words = len(complex_words) / word_count if word_count else 0.0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    # Same quantity as avg_sentence_length, reported under a second name
    avg_words_per_sentence = avg_sentence_length

    return avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, len(complex_words), word_count


# Function to find the additional metrics


In [10]:
def additional_metrics(words):
    """Compute per-word metrics from a list of tokens.

    Args:
        words: List of string tokens.

    Returns:
        A 3-tuple (syllable_per_word, personal_pronouns, avg_word_length).
        For an empty token list the result is (0.0, 0, 0.0).
    """
    word_count = len(words)
    if word_count == 0:
        # No tokens: avoid dividing by zero and report neutral values
        return 0.0, 0, 0.0

    pronouns = {'i', 'we', 'my', 'ours', 'us'}

    # Syllables are approximated as half the token length (simplified example)
    syllable_per_word = sum(len(word) // 2 for word in words) / word_count
    # Case-insensitive match, so upper-case tokens such as "US" are counted too
    personal_pronouns = sum(1 for word in words if word.lower() in pronouns)
    avg_word_length = sum(len(word) for word in words) / word_count

    return syllable_per_word, personal_pronouns, avg_word_length


# Analyzing the text

In [11]:
def analyze_text(article_text):
    """Collect every metric for one article into a single dict.

    Delegates to sentiment_scores, sentence_word_analysis and
    additional_metrics and labels their results with the output column names.
    """
    pos, neg, polarity, subjectivity = sentiment_scores(article_text)

    (sentence_len, complex_share, fog,
     words_per_sentence, n_complex, n_words) = sentence_word_analysis(article_text)

    tokens = word_tokenize(article_text)
    syllables, pronouns, word_len = additional_metrics(tokens)

    return dict(
        positive_score=pos,
        negative_score=neg,
        polarity_score=polarity,
        subjectivity_score=subjectivity,
        avg_sentence_length=sentence_len,
        percentage_complex_words=complex_share,
        fog_index=fog,
        avg_words_per_sentence=words_per_sentence,
        complex_word_count=n_complex,
        word_count=n_words,
        syllable_per_word=syllables,
        personal_pronouns=pronouns,
        avg_word_length=word_len,
    )


# Storing the data in the output file

In [12]:
output_data = []

for index, row in df.iterrows():
    url_id = row['URL_ID']
    try:
        with open(f'{output_dir}/{url_id}.txt', 'r', encoding='utf-8') as f:
            article_text = f.read()
    except FileNotFoundError:
        # The extraction step writes no file for a URL it could not scrape.
        # Keep the input row (its metric columns stay empty) instead of
        # aborting the whole run on the first missing article.
        print(f"No extracted article for {url_id}; metrics left blank")
        output_data.append(row.to_dict())
        continue

    analysis_results = analyze_text(article_text)
    output_data.append({**row, **analysis_results})

# Convert results to DataFrame
output_df = pd.DataFrame(output_data)

# Save the results to Excel
output_file = "Output_Data_Structure.xlsx"
output_df.to_excel(output_file, index=False)


In [14]:
# Preview the first ten rows of the computed metrics
output_df.head(10)

Unnamed: 0,URL_ID,URL,positive_score,negative_score,polarity_score,subjectivity_score,avg_sentence_length,percentage_complex_words,fog_index,avg_words_per_sentence,complex_word_count,word_count,syllable_per_word,personal_pronouns,avg_word_length
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,0.107,0.038,0.9962,0.000154,45.291667,0.416743,18.283364,45.291667,453,1087,2.5069,4,5.585097
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,0.102,0.015,0.9769,0.000393,52.0,0.373626,20.949451,52.0,136,364,2.46978,3,5.543956
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,0.114,0.015,0.9814,0.000431,51.857143,0.369146,20.890516,51.857143,134,363,2.493113,3,5.570248
3,bctech2014,https://insights.blackcoffer.com/effective-man...,0.117,0.015,0.9837,0.000439,52.571429,0.36413,21.174224,52.571429,134,368,2.464674,3,5.51087
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,0.11,0.015,0.9808,0.000418,52.142857,0.378082,21.008376,52.142857,138,365,2.487671,3,5.569863
5,bctech2016,https://insights.blackcoffer.com/efficient-aws...,0.118,0.015,0.9833,0.000448,51.857143,0.366391,20.889414,51.857143,133,363,2.479339,3,5.539945
6,bctech2017,https://insights.blackcoffer.com/streamlined-e...,0.102,0.015,0.9769,0.000393,51.857143,0.358127,20.886108,51.857143,130,363,2.479339,3,5.53719
7,bctech2018,https://insights.blackcoffer.com/automated-ort...,0.107,0.015,0.9786,0.000409,51.857143,0.371901,20.891617,51.857143,135,363,2.493113,3,5.570248
8,bctech2019,https://insights.blackcoffer.com/streamlining-...,0.104,0.014,0.9786,0.000384,53.857143,0.363395,21.688215,53.857143,137,377,2.496021,3,5.572944
9,bctech2020,https://insights.blackcoffer.com/efficient-dat...,0.11,0.014,0.9815,0.000409,53.142857,0.365591,21.403379,53.142857,136,372,2.521505,3,5.61828


# Running commands

In [None]:
# This is a Google Colab notebook. To run it, install the necessary modules
# (first cell) and use the Run option in Colab.

# Write the results to their own CSV file. Appending CSV text to the existing
# .xlsx workbook would corrupt it, so the workbook is left untouched.
csv_file = 'output_Data_Structure.csv'
output_df.to_csv(csv_file, index=False)

try:
    # google.colab only exists inside a Colab runtime
    from google.colab import files
except ImportError:
    print(f"Not running in Colab; results saved to {csv_file}")
else:
    # Triggers a browser download of the CSV written above
    files.download(csv_file)