In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Reading the input file; each row is expected to carry 'URL_ID' and 'URL'
df = pd.read_excel("input.xlsx")

# Seconds to wait on a server before giving up. Without an explicit timeout
# requests will wait indefinitely, so one stalled host would hang the run.
REQUEST_TIMEOUT = 30

# Iterating over the URLs
# NOTE(review): articles are written to the current working directory, while
# the analysis cell reads them from a "New content" sub-folder — confirm the
# files are moved (or the paths aligned) between the two steps.
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Sending a GET request to the URL. Connection errors, timeouts and
    # malformed URLs raise instead of returning a response; report them and
    # move on so a single bad URL does not abort the whole batch.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print(f"Failed to retrieve article {url_id}. Error: {error}")
        continue

    # Checking if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve article {url_id}. Status code: {response.status_code}")
        continue

    # Creating a BeautifulSoup object
    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding the article title (empty string when the page has none)
    title_element = soup.find('h1', class_='entry-title')
    article_title = title_element.text.strip() if title_element else ''

    # Finding the article text. A multi-word class_ string only matches an
    # element whose class attribute is exactly this string, in this order;
    # pages using a different layout yield an empty article body.
    content_element = soup.find('div', class_='td-post-content tagdiv-type')
    article_text = content_element.text.strip() if content_element else ''

    # Saving the extracted article to "<URL_ID>.txt": title, blank line, body
    filename = f"{url_id}.txt"
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(article_title + '\n\n')
        file.write(article_text)

    print(f"Article {url_id} saved successfully.")

In [None]:
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import textstat
import os
import csv

# Function for stemming and removing stopwords
def stemming(content):
    """Normalise text to a space-joined string of stemmed, non-stop-word tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, and each remaining token is reduced with the Porter stemmer.

    Args:
        content (str): Raw text, e.g. a single sentence.

    Returns:
        str: The surviving stems joined by single spaces; an empty string
        when nothing survives the filtering.
    """
    port_stem = PorterStemmer()

    # Fetch the stop-word list once per call and keep it in a set. The filter
    # previously re-evaluated stopwords.words('english') for every token and
    # scanned the resulting list linearly.
    stop_words = set(stopwords.words('english'))

    # Tokens are already lower-case here, so no further case folding is needed
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()

    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

def _read_word_set(path):
    """Return the lines of `path` as a set, each stripped and lower-cased.

    The file is opened with the platform default encoding. Blank lines end up
    as a single empty-string entry.
    """
    with open(path, 'r') as handle:
        return {entry.strip().lower() for entry in handle}


# Sentiment lexicons, one entry per line
positive_words = _read_word_set('positive-words.txt')
negative_words = _read_word_set('negative-words.txt')

# Articles are read from the "New content" folder under the working directory
current_directory = os.getcwd()
input_directory = os.path.join(current_directory, "New content")

# Destination of the CSV with the computed metrics
output_file = 'C:/Users/91821/Desktop/ML related/output file/Output.csv'

# Accumulates one result row (dict) per processed article
results = []

# Inclusive range of article ids; files are expected as "<id>.txt"
FIRST_URL_ID = 38
LAST_URL_ID = 150

# Words counted for the 'Personal Pronouns' column (matched case-insensitively)
PERSONAL_PRONOUNS = frozenset(['i', 'we', 'my', 'mine', 'us', 'our', 'ours'])


def _analyze_article(url_id, text):
    """Compute the sentiment and readability metrics for one article.

    Args:
        url_id: Identifier stored in the 'URL_ID' column.
        text (str): Raw article text as read from its file.

    Returns:
        dict: One output row keyed by the CSV column names.
    """
    # Pronouns have to be counted on the raw text: the stop-word filter inside
    # `stemming` removes common pronouns such as 'i', 'we', 'my' and 'our', so
    # counting afterwards misses them, and stemming can turn unrelated words
    # into a listed form. The tokenisation mirrors the one `stemming` uses.
    # NOTE(review): matching is case-insensitive, so the abbreviation "US"
    # is counted as the pronoun "us" — confirm whether it should be excluded.
    raw_words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    personal_pronouns = sum(1 for word in raw_words if word in PERSONAL_PRONOUNS)

    # Tokenizing into sentences, then stemming and removing stopwords
    sentences = [stemming(sentence) for sentence in sent_tokenize(text)]
    sentence_count = len(sentences)

    cleaned_words = [word for sentence in sentences for word in sentence.split()]
    word_count = len(cleaned_words)

    positive_score = 0
    negative_score = 0
    complex_word_count = 0
    total_syllables = 0

    for word in cleaned_words:
        # Sentiment analysis; a word present in both lexicons counts as positive.
        # NOTE(review): tokens are Porter-stemmed before this lookup, so they
        # only match if the lexicon files contain stems as well — confirm.
        if word in positive_words:
            positive_score += 1
        elif word in negative_words:
            negative_score += 1

        # One syllable lookup per word serves both the total and the
        # complex-word test (more than two syllables)
        syllables = textstat.syllable_count(word)
        total_syllables += syllables
        if syllables > 2:
            complex_word_count += 1

    # The small constant keeps the divisions defined when a count is zero
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count > 0 else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Same ratio as the average sentence length; kept as its own output column
    avg_words_per_sentence = avg_sentence_length

    # Reported as an average per cleaned word; this column used to hold the
    # article-wide syllable total, which did not match its name
    syllables_per_word = total_syllables / word_count if word_count > 0 else 0

    avg_word_length = sum(len(word) for word in cleaned_words) / word_count if word_count > 0 else 0

    return {
        'URL_ID': url_id,
        'Positive Score': positive_score,
        'Negative Score': negative_score,
        'Polarity Score': polarity_score,
        'Subjectivity Score': subjectivity_score,
        'Average Sentence Length': avg_sentence_length,
        'Percentage of Complex Words': percentage_complex_words,
        'FOG Index': fog_index,
        'Average Number of Words per Sentence': avg_words_per_sentence,
        'Complex Word Count': complex_word_count,
        'Word Count': word_count,
        'Syllables per Word': syllables_per_word,
        'Personal Pronouns': personal_pronouns,
        'Average Word Length': avg_word_length
    }


# Iterating over the text files
for i in range(FIRST_URL_ID, LAST_URL_ID + 1):
    file_path = os.path.join(input_directory, f"{i}.txt")
    url_id = i

    try:
        # Reading the content of the file
        with open(file_path, 'r', encoding='utf-8') as file:
            extracted_data = file.read()

        # Appending the result to the list
        results.append(_analyze_article(url_id, extracted_data))

    except FileNotFoundError:
        # Missing ids are reported and skipped
        print(f"File not found: {file_path}")

    except Exception as e:
        # Best effort: one unreadable article must not stop the batch
        print(f"Error processing file: {file_path}")
        print(f"Error message: {str(e)}")

# Write the output data to the output file. The header is taken from the
# first row, so there is nothing to write when no article was processed.
if results:
    with open(output_file, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=results[0].keys())
        writer.writeheader()
        writer.writerows(results)

    print("Output file saved successfully.")
else:
    print("No articles were processed; output file not written.")
