Installing all dependencies

In [None]:
!pip install textstat
!pip install pyphen




Main code for: Data Extraction and NLP

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat import flesch_kincaid_grade, syllable_count
from nltk.corpus import stopwords

# Load NLTK's VADER sentiment analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Load NLTK's stopwords and the sentence/word tokenizer models.
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' package
# instead of the pickled 'punkt' one; fetching both keeps
# nltk.sent_tokenize / nltk.word_tokenize working across NLTK versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

# Function to extract text from a URL
def extract_text_from_url(url, timeout=30):
    """Download *url* and return the text of its first ``<article>`` element.

    This is a best-effort helper: any failure to fetch or parse the page
    yields an empty string instead of an exception, so a single bad URL
    does not abort a batch run.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a dead host.

    Returns:
        The article text as produced by ``get_text()``, or ``""`` when the
        page cannot be fetched or parsed, or contains no ``<article>`` tag.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
    except Exception:
        # Deliberately broad to preserve the best-effort contract, but unlike
        # a bare ``except:`` this lets KeyboardInterrupt and SystemExit
        # propagate so the user can still stop a long run.
        return ""

    # Assuming article tags contain the main content
    article = soup.find('article')
    if article is None:
        return ""
    return article.get_text()

# Read the input data from "Input.xlsx"
url_input_file = pd.read_excel("Input.xlsx")

# Pronouns counted for the PERSONAL PRONOUNS column (matched case-insensitively).
_PERSONAL_PRONOUNS = frozenset(['i', 'me', 'my', 'mine', 'myself'])

# Initialize lists to store computed variables
positive_scores = []
negative_scores = []
polarity_scores = []
subjectivity_scores = []
avg_sentence_lengths = []
percentage_complex_words = []
fog_indices = []
avg_words_per_sentence = []
complex_word_counts = []
word_counts = []
syllables_per_word = []
personal_pronouns = []
avg_word_lengths = []

# Result lists in the exact order _compute_text_metrics() returns its values.
_RESULT_LISTS = (
    positive_scores,
    negative_scores,
    polarity_scores,
    subjectivity_scores,
    avg_sentence_lengths,
    percentage_complex_words,
    fog_indices,
    avg_words_per_sentence,
    complex_word_counts,
    word_counts,
    syllables_per_word,
    personal_pronouns,
    avg_word_lengths,
)


def _compute_text_metrics(text):
    """Compute the sentiment and readability metrics for one document.

    Args:
        text: Plain text of the document; may be empty.

    Returns:
        A 13-tuple ordered like ``_RESULT_LISTS``. Every entry is 0 when the
        text contains no sentences or no words, which avoids dividing by zero.
    """
    sentences = nltk.sent_tokenize(text)
    # word_tokenize also emits punctuation tokens (",", ".", ...); keep only
    # tokens containing a letter or digit so they are not counted as words.
    words = [
        token for token in nltk.word_tokenize(text)
        if any(ch.isalnum() for ch in token)
    ]

    num_sentences = len(sentences)
    num_words = len(words)
    if num_sentences == 0 or num_words == 0:
        return (0,) * len(_RESULT_LISTS)

    # Score the document once instead of once per column.
    sentiment = sid.polarity_scores(text)

    # Gunning Fog definition: a complex word has three or more syllables.
    complex_count = sum(1 for word in words if syllable_count(word) > 2)
    words_per_sentence = num_words / num_sentences
    pct_complex = (complex_count / num_words) * 100

    return (
        sentiment['pos'],
        sentiment['neg'],
        sentiment['compound'],
        # NOTE(review): VADER provides no subjectivity measure; the compound
        # score is reused here, so this column duplicates POLARITY SCORE.
        # Replace with a real subjectivity metric when one is available.
        sentiment['compound'],
        words_per_sentence,
        pct_complex,
        0.4 * (words_per_sentence + pct_complex),
        words_per_sentence,
        complex_count,
        num_words,
        syllable_count(text) / num_words,
        sum(1 for word in words if word.lower() in _PERSONAL_PRONOUNS),
        sum(len(word) for word in words) / num_words,
    )


# Iterate through the URLs, extract text and record its metrics
for url in url_input_file['URL']:
    text = extract_text_from_url(url)
    for result_list, value in zip(_RESULT_LISTS, _compute_text_metrics(text)):
        result_list.append(value)

# Create a DataFrame with the computed variables
output_data = pd.DataFrame({
    'URL_ID': url_input_file['URL_ID'],
    'POSITIVE SCORE': positive_scores,
    'NEGATIVE SCORE': negative_scores,
    'POLARITY SCORE': polarity_scores,
    'SUBJECTIVITY SCORE': subjectivity_scores,
    'AVG SENTENCE LENGTH': avg_sentence_lengths,
    'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
    'FOG INDEX': fog_indices,
    'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
    'COMPLEX WORD COUNT': complex_word_counts,
    'WORD COUNT': word_counts,
    'SYLLABLE PER WORD': syllables_per_word,
    'PERSONAL PRONOUNS': personal_pronouns,
    'AVG WORD LENGTH': avg_word_lengths
})

# Save the output to "Output.xlsx"
output_data.to_excel("Output.xlsx", index=False)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
