In [None]:
!pip install textstat



In [None]:
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize, word_tokenize
import textstat
from textstat import *
import re
import os
import logging
import requests
from bs4 import BeautifulSoup

# Set up logging: ERROR-and-above records go to extraction_log.txt.
# force=True replaces any handlers already attached to the root logger.
# Without it basicConfig() silently does nothing when a handler exists
# (as in notebook environments), so the file and format below would be ignored.
logging.basicConfig(
    filename='extraction_log.txt',
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Download NLTK data if not already downloaded
for nltk_resource in ('vader_lexicon', 'punkt'):
    nltk.download(nltk_resource)

# Function to perform textual analysis and compute variables
def perform_text_analysis(text, stop_words, positive_words, negative_words):
    """Compute sentiment and readability metrics for one article.

    Args:
        text: Raw article text.
        stop_words: Collection of tokens to drop before analysis. Matching is
            exact (case-sensitive) on whitespace-separated tokens.
        positive_words: Accepted for interface compatibility; not used by any
            returned metric.
        negative_words: Accepted for interface compatibility; not used by any
            returned metric.

    Returns:
        A list of 12 values, in this order:
        [positive_score, negative_score, polarity_score, avg_sentence_length,
         percentage_complex_words, fog_index, avg_words_per_sentence,
         complex_word_count, word_count, syllables_per_word,
         personal_pronouns, avg_word_length].
        Ratios whose denominator would be zero (empty text, or text made up
        only of stop words) are reported as 0.0 instead of raising.
    """
    # Clean text by removing stop words
    tokens = [word for word in text.split() if word not in stop_words]
    cleaned_text = ' '.join(tokens)

    # Sentiment analysis. VADER reports 'pos' and 'neg' as non-negative
    # proportions, so 'neg' is used as-is; negating it would flip the sign of
    # the negative score and corrupt the polarity formula below.
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(cleaned_text)
    positive_score = sentiment_scores['pos']
    negative_score = sentiment_scores['neg']
    # The small constant keeps the ratio defined when both scores are zero.
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)

    # Sentence / word statistics
    sentences = sent_tokenize(cleaned_text)
    words = word_tokenize(cleaned_text)
    sentence_count = len(sentences)
    word_count = len(words)

    avg_sentence_length = round(word_count / sentence_count, 2) if sentence_count else 0.0
    # Same quantity as avg_sentence_length; kept because the output has a
    # separate column for it.
    avg_words_per_sentence = avg_sentence_length

    complex_word_count = textstat.difficult_words(cleaned_text)
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count else 0.0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Average syllables per word
    total_syllables = 0
    counted_words = 0
    for token in tokens:
        word = re.sub(r'[?!,.]', '', token)  # Remove punctuation
        if not word:
            continue  # Token was punctuation only; not a word
        # Drop the common "es"/"ed" endings, but never reduce a word to nothing.
        if len(word) > 2 and word.endswith(('es', 'ed')):
            word = word[:-2]
        total_syllables += textstat.syllable_count(word)
        counted_words += 1
    syllables_per_word = round(total_syllables / counted_words, 2) if counted_words else 0.0

    # Count personal pronouns on the original text, so that pronouns present
    # in the stop-word list are not removed before they can be counted.
    # The all-caps token "US" is skipped because it is far more likely to be
    # the country name than the pronoun.
    pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')

    # Average word length (over the same tokens that make up word_count)
    avg_word_length = round(sum(len(word) for word in words) / word_count, 2) if word_count else 0.0

    return [positive_score, negative_score, polarity_score, avg_sentence_length,
            percentage_complex_words, fog_index, avg_words_per_sentence,
            complex_word_count, word_count, syllables_per_word, personal_pronouns,
            avg_word_length]

# Rows for the output table, one per input URL (successful or not)
data = []

# Folder holding the stop-word lists
stopwords_folder = '/content/drive/MyDrive/StopWords'  # Replace with the path to your folder

# Every .txt file in that folder is treated as a stop-word list
txt_files = [name for name in os.listdir(stopwords_folder) if name.endswith('.txt')]

# Merge all whitespace-separated tokens from every list into a single set
stop_words = set()
for txt_file in txt_files:
    file_path = os.path.join(stopwords_folder, txt_file)
    # Undecodable bytes are dropped rather than aborting the load
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        stop_words.update(file.read().split())

import chardet


def _read_word_list(path):
    """Return the lines of the word list at *path* as a list of strings.

    The file is read once as bytes and decoded with the encoding chardet
    detects, so both word lists are handled the same way regardless of how
    they were saved. Falls back to UTF-8 when no encoding can be detected
    (e.g. for an empty file).
    """
    with open(path, 'rb') as word_file:
        raw_bytes = word_file.read()
    detected_encoding = chardet.detect(raw_bytes)['encoding']
    return raw_bytes.decode(detected_encoding or 'utf-8').splitlines()


# Read positive and negative words
positive_words = _read_word_list('/content/drive/MyDrive/Copy of positive-words.txt')
negative_words = _read_word_list('/content/drive/MyDrive/Copy of negative-words.txt')

# Load input data
input_data = pd.read_excel('/content/drive/MyDrive/Copy of Input.xlsx')

# Function to extract article text from a URL
def extract_article_text(url):
    """Download *url* and return its title and article text.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(title, article_text)`` tuple taken from the page's ``<title>``
        and first ``<article>`` elements, or ``(None, None)`` if the request
        fails, the server returns an HTTP error, or either element is missing.
        Failures are logged and never raised to the caller.
    """
    try:
        # A timeout is required: without one a stalled server blocks forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title and article text, exclude unwanted content
        title_tag = soup.find('title')
        article_tag = soup.find('article')  # Adjust as per HTML structure
        if title_tag is None or article_tag is None:
            missing = '<title>' if title_tag is None else '<article>'
            logging.error(f"Error extracting text from URL: {url}\nError message: page has no {missing} element")
            return None, None

        return title_tag.get_text(), article_tag.get_text()
    except Exception as e:
        # Log the error and return None values
        logging.error(f"Error extracting text from URL: {url}\nError message: {str(e)}")
        return None, None
# Output columns for the analysis values, in the order perform_text_analysis
# returns them. The column count is derived from this list so the two cannot
# drift apart.
analysis_columns = ['POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE',
                    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS',
                    'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
                    'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD',
                    'PERSONAL PRONOUNS', 'AVG WORD LENGTH']
num_analysis_columns = len(analysis_columns)

# Iterate through URLs and save extracted data
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    title, article_text = extract_article_text(url)

    variables = None
    if title and article_text:
        with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
            file.write(f'Title: {title}\n\n')
            file.write(article_text)
        # A failure while analysing one article must not abort the whole run
        # (and lose every row collected so far); log it and mark the row as an error.
        try:
            variables = list(perform_text_analysis(article_text, stop_words, positive_words, negative_words))
        except Exception:
            logging.exception(f"Error analysing text from URL: {url}")

    if variables is not None:
        data.append([url_id, url, 'Success'] + variables)
    else:
        # Record the URL as an error with "None" values for analysis columns
        data.append([url_id, url, 'Error'] + [None] * num_analysis_columns)

# Create a DataFrame
df = pd.DataFrame(data, columns=['URL_ID', 'URL', 'Status'] + analysis_columns)

# Save the DataFrame to an Excel file
df.to_excel('/content/drive/MyDrive/Output.xlsx', index=False)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
ERROR:root:Error extracting text from URL: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error message: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
ERROR:root:Error extracting text from URL: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
Error message: 404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
