In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load input.xlsx
input_data = pd.read_excel("Input.xlsx")

# Seconds to wait for a server before giving up on a URL; without a timeout a
# single unresponsive host would stall the whole run.
REQUEST_TIMEOUT_SECONDS = 30

# URL_IDs whose page could not be downloaded, reported after the loop.
failed_url_ids = []

# Iterate through each URL
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Fetch the HTML content of the page. A network failure or an HTTP error
    # status (404, 500, ...) skips this URL instead of aborting the loop or
    # saving the server's error page as if it were the article.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {url_id}: could not fetch {url} ({error})")
        failed_url_ids.append(url_id)
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract article title and text. soup.find() returns None when the page
    # has no <title> element, so fall back to an empty title in that case.
    title_tag = soup.find('title')
    article_title = title_tag.text.strip() if title_tag is not None else ''
    article_text = ' '.join([p.text for p in soup.find_all('p')])

    # Save the extracted article in a text file named after its URL_ID
    # (written to the current working directory).
    with open(f"{url_id}.txt", 'w', encoding='utf-8') as file:
        file.write(f"{article_title}\n{article_text}")

if failed_url_ids:
    print(f"Could not download {len(failed_url_ids)} article(s): {failed_url_ids}")

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
import syllables
import nltk
nltk.download('punkt')
import os

# Load input.xlsx
input_data = pd.read_excel("Input.xlsx")

# Create lists to store output data. Each list receives exactly one entry per
# Input.xlsx row, in row order, so it can be assigned as a column below.
output_url_ids = []
positive_scores = []
negative_scores = []
polarity_scores = []
subjectivity_scores = []
avg_sentence_lengths = []
percentage_complex_words_list = []
fog_indices = []
avg_words_per_sentence_list = []
complex_word_counts = []
word_counts = []
syllables_per_words = []
personal_pronouns_list = []
avg_word_lengths = []

# NOTE(review): the scraping cell writes "<URL_ID>.txt" into the current
# working directory; this assumes that directory is /content - TODO confirm.
folder_path = "/content"

# Recorded for every metric of an article that could not be analysed, so the
# output row still exists and stays aligned with its URL_ID.
MISSING_VALUE = float("nan")

# Words counted as personal pronouns (compared case-insensitively).
PERSONAL_PRONOUNS = frozenset(['i', 'me', 'my', 'mine', 'myself'])

# A word with at least this many estimated syllables counts as "complex".
COMPLEX_WORD_MIN_SYLLABLES = 3

# Result lists keyed by the metric name that _analyze_article() returns.
metric_lists = {
    'positive_score': positive_scores,
    'negative_score': negative_scores,
    'polarity_score': polarity_scores,
    'subjectivity_score': subjectivity_scores,
    'avg_sentence_length': avg_sentence_lengths,
    'percentage_complex_words': percentage_complex_words_list,
    'fog_index': fog_indices,
    'avg_words_per_sentence': avg_words_per_sentence_list,
    'complex_word_count': complex_word_counts,
    'word_count': word_counts,
    'syllables_per_word': syllables_per_words,
    'personal_pronouns': personal_pronouns_list,
    'avg_word_length': avg_word_lengths,
}


def _analyze_article(article_text):
    """Compute the sentiment and readability metrics for one article.

    Args:
        article_text: Full text of the article as a single string.

    Returns:
        dict mapping each key of ``metric_lists`` to its value for this text.
        The positive, negative and polarity scores are all derived from the
        single TextBlob polarity value (negative is its negation).

    Raises:
        ValueError: If the text contains no words or no sentences, because
            every average below would otherwise divide by zero.
    """
    blob = TextBlob(article_text)
    words = blob.words
    word_count = len(words)
    sentence_count = len(blob.sentences)
    if word_count == 0 or sentence_count == 0:
        raise ValueError("article contains no analysable text")

    sentiment = blob.sentiment
    polarity = sentiment.polarity

    # Estimate syllables once per word and reuse the result for every
    # syllable-based metric.
    syllable_counts = [syllables.estimate(word) for word in words]
    complex_word_count = sum(
        1 for count in syllable_counts if count >= COMPLEX_WORD_MIN_SYLLABLES
    )

    # Average sentence length is words per sentence (not sentences per word).
    avg_sentence_length = word_count / sentence_count
    percentage_complex_words = complex_word_count / word_count * 100

    return {
        'positive_score': polarity,
        'negative_score': -polarity,
        'polarity_score': polarity,
        'subjectivity_score': sentiment.subjectivity,
        'avg_sentence_length': avg_sentence_length,
        'percentage_complex_words': percentage_complex_words,
        'fog_index': 0.4 * (avg_sentence_length + percentage_complex_words),
        'avg_words_per_sentence': word_count / sentence_count,
        'complex_word_count': complex_word_count,
        'word_count': word_count,
        'syllables_per_word': sum(syllable_counts) / word_count,
        'personal_pronouns': sum(
            1 for word in words if word.lower() in PERSONAL_PRONOUNS
        ),
        'avg_word_length': sum(len(word) for word in words) / word_count,
    }


# Walk the Input.xlsx rows (not the directory listing) so that every result is
# tied to its own URL_ID and the lists line up with input_data row for row.
# The file name is rebuilt with the same f-string the scraping cell used.
for row_index, row in input_data.iterrows():
    url_id = row['URL_ID']
    file_path = os.path.join(folder_path, f"{url_id}.txt")

    # Stays empty when the article cannot be analysed; every metric then
    # falls back to MISSING_VALUE below.
    metrics = {}

    try:
        print(f"Processing file: {file_path}")  # Print file path for clarity

        with open(file_path, 'r', encoding='utf-8') as file:
            article_text = file.read()

        metrics = _analyze_article(article_text)

    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
    except PermissionError:
        print(f"Error: Permission denied: {file_path}")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Append exactly one entry per row, whether or not the analysis succeeded.
    output_url_ids.append(url_id)
    for metric_name, values in metric_lists.items():
        values.append(metrics.get(metric_name, MISSING_VALUE))

# Add new columns to the input_data DataFrame
input_data['POSITIVE SCORE'] = positive_scores
input_data['NEGATIVE SCORE'] = negative_scores
input_data['POLARITY SCORE'] = polarity_scores
input_data['SUBJECTIVITY SCORE'] = subjectivity_scores
input_data['AVG SENTENCE LENGTH'] = avg_sentence_lengths
input_data['PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_words_list
input_data['FOG INDEX'] = fog_indices
input_data['AVG NUMBER OF WORDS PER SENTENCE'] = avg_words_per_sentence_list
input_data['COMPLEX WORD COUNT'] = complex_word_counts
input_data['WORD COUNT'] = word_counts
input_data['SYLLABLE PER WORD'] = syllables_per_words
input_data['PERSONAL PRONOUNS'] = personal_pronouns_list
input_data['AVG WORD LENGTH'] = avg_word_lengths

# Save the updated input_data to a new Excel file
input_data.to_excel("Output.xlsx", index=False)
