# In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import textstat
import re
import os

# Fetch the NLTK resources used below: the VADER lexicon for the sentiment
# analyzer and the Punkt models for sentence/word tokenization.
nltk.download('vader_lexicon')
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from 'punkt_tab' rather than
# the pickled 'punkt' package; without it sent_tokenize raises LookupError.
nltk.download('punkt_tab')

# The input sheet is read for its 'URL' and 'URL_ID' columns.
input_df = pd.read_excel('Input.xlsx')
urls = input_df['URL']

# Shared VADER analyzer, created once and reused for every article.
sid = SentimentIntensityAnalyzer()

def extract_text_from_url(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the text of the
        page's ``<title>`` tag, or an empty string when the page has none.
        ``article_text`` is the text of every ``<p>`` tag joined by single
        spaces.

    Raises:
        requests.RequestException: If the request fails or times out.

    Note:
        The HTTP status code is not checked, so the body of an error page
        is parsed like any other page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.find returns None when the tag is absent; fall back to an empty
    # title instead of failing with AttributeError.
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag is not None else ''

    paragraphs = soup.find_all('p')
    article_text = ' '.join(para.get_text() for para in paragraphs)

    return title, article_text

def analyze_text(text):
    """Compute sentiment and readability metrics for a piece of text.

    Args:
        text: The text to analyze.

    Returns:
        A dict with the keys ``positive_score``, ``negative_score``,
        ``polarity_score``, ``subjectivity_score``, ``avg_sentence_length``,
        ``percentage_complex_words``, ``fog_index``, ``complex_word_count``,
        ``word_count``, ``syllables_per_word``, ``personal_pronouns`` and
        ``avg_word_length``. Ratio metrics are 0 when the text contains no
        words (or no sentences).
    """
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)

    # word_tokenize emits punctuation marks as separate tokens; keep only
    # tokens with at least one letter or digit so punctuation is not counted
    # as words in the word-based metrics below.
    words = [
        token for token in nltk.word_tokenize(text)
        if any(char.isalnum() for char in token)
    ]
    num_words = len(words)

    sentiment_scores = sid.polarity_scores(text)
    positive_score = sentiment_scores['pos']
    negative_score = sentiment_scores['neg']
    polarity_score = sentiment_scores['compound']

    # NOTE(review): text_standard is a textstat readability estimate, not a
    # measure of subjectivity. The value is kept as-is because no
    # subjectivity library is imported here -- confirm what this column is
    # meant to hold.
    subjectivity_score = textstat.text_standard(text, float_output=True)

    # Words with more than two syllables are treated as complex.
    complex_words = [word for word in words if textstat.syllable_count(word) > 2]
    complex_word_count = len(complex_words)
    fog_index = textstat.gunning_fog(text)

    if num_words:
        percentage_complex_words = complex_word_count / num_words * 100
        syllables_per_word = textstat.syllable_count(text) / num_words
        avg_word_length = sum(len(word) for word in words) / num_words
    else:
        percentage_complex_words = 0
        syllables_per_word = 0
        avg_word_length = 0

    avg_sentence_length = num_words / num_sentences if num_sentences else 0

    # Case-insensitive match, but the all-caps token "US" is skipped: it is
    # almost always the country abbreviation rather than the pronoun "us".
    pronoun_matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')

    return {
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score,
        'avg_sentence_length': avg_sentence_length,
        'percentage_complex_words': percentage_complex_words,
        'fog_index': fog_index,
        'complex_word_count': complex_word_count,
        'word_count': num_words,
        'syllables_per_word': syllables_per_word,
        'personal_pronouns': personal_pronouns,
        'avg_word_length': avg_word_length,
    }


# Download every article listed in the input sheet and cache it as
# '<URL_ID>.txt': the page title on the first line, the body text after it.
for _, row in input_df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']

    try:
        title, article_text = extract_text_from_url(url)
    except requests.RequestException as exc:
        # One unreachable page must not abort the whole batch; the analysis
        # pass below skips any URL_ID that has no cached text file.
        print(f"Skipping {url_id} ({url}): {exc}")
        continue

    with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
        file.write(title + '\n' + article_text)


# Analyze each cached article and collect one output row per URL_ID: the
# original input columns followed by the computed metrics.
output_rows = []

for _, row in input_df.iterrows():
    url_id = row['URL_ID']
    file_path = f'{url_id}.txt'

    # Articles without a cached text file are left out of the output.
    if not os.path.exists(file_path):
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    analysis_results = analyze_text(text)

    row_data = row.to_dict()
    row_data.update(analysis_results)
    output_rows.append(row_data)

# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call before that), so the frame is built once from the row list.
output_df = pd.DataFrame(output_rows)

output_df.to_excel('Output Data Structure.xlsx', index=False)

print("Analysis complete. Results saved to 'Output Data Structure.xlsx'.")
