In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
import re
from textstat import syllable_count

In [None]:
# Download the NLTK resources this notebook relies on
nltk.download('punkt')            # tokenizer models behind word_tokenize / sent_tokenize
nltk.download('punkt_tab')        # tokenizer tables that newer NLTK releases look up instead of 'punkt'
nltk.download('stopwords')        # stop-word lists read through stopwords.words('english')
nltk.download('vader_lexicon')    # lexicon for the imported SentimentIntensityAnalyzer
nltk.download('opinion_lexicon')  # positive/negative word lists read via nltk.corpus.opinion_lexicon

In [None]:
# Function to extract article title and text from URL
def extract_article(url, timeout=30):
    """Download a web page and return its headline and paragraph text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot stall the whole run.

    Returns:
        A ``(title, text)`` tuple: the stripped text of the first ``<h1>``
        element (an empty string when the page has none) and the text of
        all ``<p>`` elements joined by newlines and stripped.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of scraping the error page as if it were the article
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    # Assuming the article title is contained within an <h1> tag;
    # a page without one yields an empty title rather than an AttributeError
    heading = soup.find('h1')
    title = heading.get_text().strip() if heading is not None else ''
    # Assuming the article text is contained within <p> tags
    text = '\n'.join(p.get_text() for p in soup.find_all('p'))
    return title, text.strip()

In [None]:
# Load the spreadsheet that lists the articles (URL_ID and URL columns are read below)
df = pd.read_excel(io='Input.xlsx')

In [None]:
# Folder that receives one text file per article; created when it does not exist yet
output_dir = 'article_texts'
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# Fetch every article listed in the DataFrame and store it on disk
for index, row in df.iterrows():
    url_id, url = row['URL_ID'], row['URL']
    try:
        title, article_text = extract_article(url)
        # One file per article: the title, a blank line, then the body text
        article_path = os.path.join(output_dir, f"{url_id}.txt")
        with open(article_path, mode='w', encoding='utf-8') as f:
            f.write(f"{title}\n\n{article_text}")
        print(f"Article {url_id} extracted and saved successfully.")
    except Exception as e:
        # Best effort: report the failure and carry on with the next row
        print(f"Error extracting article {url_id}: {e}")

In [None]:
# Build the positive / negative lookup sets from NLTK's opinion lexicon
positive_words, negative_words = (
    set(nltk.corpus.opinion_lexicon.positive()),
    set(nltk.corpus.opinion_lexicon.negative()),
)

In [None]:
# Function to clean text and tokenize
def clean_and_tokenize(text):
    """Return the lower-cased, alphanumeric, non-stop-word tokens of ``text``.

    The text is lower-cased and split with ``word_tokenize``. Tokens that are
    English stop words, or that contain any non-alphanumeric character, are
    discarded; the surviving tokens are returned in their original order.
    """
    raw_tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for token in raw_tokens:
        if token in english_stop_words:
            continue
        if not token.isalnum():
            continue
        # A fully alphanumeric token carries no leading or trailing
        # punctuation, so it can be kept exactly as it is
        kept.append(token)
    return kept

In [None]:
# Function to calculate sentiment scores
def calculate_sentiment_scores(tokens):
    """Score a token list against the positive and negative word sets.

    Returns a tuple ``(positive_score, negative_score, polarity_score,
    subjectivity_score)``. The first two are the numbers of tokens found in
    ``positive_words`` and ``negative_words``; polarity is their difference
    divided by their sum, and subjectivity is their sum divided by the
    number of tokens.
    """
    positive_score = len([token for token in tokens if token in positive_words])
    negative_score = len([token for token in tokens if token in negative_words])
    matched = positive_score + negative_score
    # The 0.000001 term keeps both ratios defined when the denominator would otherwise be zero
    polarity_score = (positive_score - negative_score) / (matched + 0.000001)
    subjectivity_score = matched / (len(tokens) + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

In [None]:
# Function to calculate other text statistics
def calculate_text_statistics(text):
    """Compute readability and word-level statistics for an article.

    Args:
        text: Raw article text.

    Returns:
        A 9-tuple, in this order: ``(complex_word_count, word_count,
        total_syllables, personal_pronouns_count, avg_sentence_length,
        percentage_complex_words, fog_index, avg_words_per_sentence,
        avg_word_length)``.

        Words are the tokens returned by ``clean_and_tokenize``; a complex
        word is one with more than two syllables. Every ratio is 0 when its
        denominator is 0, so empty text yields zeros instead of raising
        ``ZeroDivisionError``.
    """
    sentences = sent_tokenize(text)
    total_words = clean_and_tokenize(text)
    total_sentences = len(sentences)
    word_count = len(total_words)

    # Count syllables once per word and reuse the result for both totals
    syllables_per_word = [syllable_count(word) for word in total_words]
    complex_word_count = sum(1 for syllables in syllables_per_word if syllables > 2)
    total_syllables = sum(syllables_per_word)

    avg_sentence_length = word_count / total_sentences if total_sentences > 0 else 0
    percentage_complex_words = complex_word_count / word_count if word_count > 0 else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    # Same quantity as avg_sentence_length; returned separately to keep the tuple shape
    avg_words_per_sentence = avg_sentence_length

    # Match pronouns regardless of capitalisation ("We", "My", ...), but skip
    # the all-caps "US", which usually denotes the country, not the pronoun
    pronoun_matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    personal_pronouns_count = sum(1 for match in pronoun_matches if match != 'US')

    avg_word_length = sum(len(word) for word in total_words) / word_count if word_count > 0 else 0
    return (
        complex_word_count,
        word_count,
        total_syllables,
        personal_pronouns_count,
        avg_sentence_length,
        percentage_complex_words,
        fog_index,
        avg_words_per_sentence,
        avg_word_length
    )

In [None]:
# Read the input file (spelled exactly like the earlier read so it also
# resolves on case-sensitive filesystems)
df1 = pd.read_excel('Input.xlsx')

In [None]:
# Collector for the one-row DataFrames produced per article
dfs = list()

In [None]:
# Empty results table that fixes the names and order of the output columns
output_df = pd.DataFrame(columns=[
    # identification
    'URL_ID', 'URL',
    # sentiment
    'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    # readability
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX',
    'AVG NUMBER OF WORDS PER SENTENCE',
    # word-level counts and averages
    'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD',
    'PERSONAL PRONOUNS', 'AVG WORD LENGTH',
])

In [None]:
# Perform textual analysis for each article
for index, row in df1.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    # Look in the same directory the extraction step wrote to
    file_path = os.path.join(output_dir, f"{url_id}.txt")
    if not os.path.exists(file_path):
        # No saved text for this row (presumably extraction failed); report
        # it instead of silently leaving the article out of the results
        print(f"Skipping article {url_id}: {file_path} not found.")
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        article_text = file.read()

    # Calculate sentiment scores
    tokens = clean_and_tokenize(article_text)
    positive_score, negative_score, polarity_score, subjectivity_score = calculate_sentiment_scores(tokens)

    # Calculate other text statistics
    (
        complex_word_count,
        word_count,
        total_syllables,
        personal_pronouns_count,
        avg_sentence_length,
        percentage_complex_words,
        fog_index,
        avg_words_per_sentence,
        avg_word_length
    ) = calculate_text_statistics(article_text)

    # Metrics for the current article; insertion order is the output column order
    article_metrics = {
        'URL_ID': url_id,
        'URL': url,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        # Guard against articles that produced no words at all
        'SYLLABLE PER WORD': total_syllables / word_count if word_count > 0 else 0,
        'PERSONAL PRONOUNS': personal_pronouns_count,
        'AVG WORD LENGTH': avg_word_length
    }

    # Create a one-row DataFrame for the current article
    article_df = pd.DataFrame({column: [value] for column, value in article_metrics.items()})

    dfs.append(article_df)

In [None]:
# Concatenate all individual DataFrames. pd.concat raises ValueError on an
# empty list, so when no article was analysed the empty, column-only
# output_df created earlier is kept as the result.
if dfs:
    output_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Write the results table to an Excel workbook, leaving out the row index
output_df.to_excel(excel_writer='output.xlsx', index=False)