In [3]:
import os
import csv
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import csv
import pandas as pd
from bs4 import BeautifulSoup
# Download the NLTK 'punkt' and 'stopwords' resources when this module is run/imported.
# NOTE(review): nltk.corpus.stopwords is imported above but not used in the visible code;
# the stop words actually applied come from read_stop_words() -- confirm the download is needed.
nltk.download('punkt')
nltk.download('stopwords')

# Function to read stop words from every file in a directory

def read_stop_words(stop_words_dir):
    """Collect stop words from every regular file in ``stop_words_dir``.

    Each file is decoded as latin-1. Every line is lower-cased and split on
    whitespace, and each resulting token is added to the result.

    Args:
        stop_words_dir: Path of the directory that holds the stop-word files.

    Returns:
        A set of lower-cased tokens; empty when the directory has no files.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    stop_words = set()
    # sorted() only makes the read order deterministic; the result is a set.
    for filename in sorted(os.listdir(stop_words_dir)):
        path = os.path.join(stop_words_dir, filename)
        # os.listdir() also returns sub-directories, which open() cannot read.
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='latin-1') as file:
            for line in file:
                stop_words.update(line.strip().lower().split())
    return stop_words

# Function to load the positive and negative word lists
def read_sentiment_words(sentiment_words_dir):
    """Load the positive and negative word lists from ``sentiment_words_dir``.

    The directory must contain ``positive_words.txt`` and
    ``negative_words.txt``, one entry per line, decoded as latin-1. Each line
    is stripped and lower-cased; a blank line therefore contributes an empty
    string to its set.

    Args:
        sentiment_words_dir: Directory containing the two word-list files.

    Returns:
        A ``(positive_words, negative_words)`` tuple of sets.
    """
    def load_word_set(file_name):
        # One entry per line, normalised to stripped lower-case.
        list_path = os.path.join(sentiment_words_dir, file_name)
        with open(list_path, 'r', encoding='latin-1') as handle:
            return {entry.strip().lower() for entry in handle}

    return load_word_set('positive_words.txt'), load_word_set('negative_words.txt')



# Function to clean text by removing stop words
def clean_text(text, stop_words):
    """Lower-case ``text`` and drop whitespace-separated words found in ``stop_words``.

    Args:
        text: The raw text to clean.
        stop_words: Container of words to remove (compared after lower-casing
            the text; punctuation attached to a word is not stripped).

    Returns:
        The remaining words joined by single spaces.
    """
    kept = []
    for candidate in text.lower().split():
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return ' '.join(kept)

# Function to perform text analysis
def _count_syllables(word):
    """Estimate the syllable count of ``word`` from its groups of consecutive vowels."""
    lowered = word.lower()
    groups = len(re.findall(r'[aeiouy]+', lowered))
    # A trailing "es"/"ed" is usually silent ("makes", "jumped"), so do not
    # count it as a syllable of its own.
    if groups > 1 and lowered.endswith(('es', 'ed')):
        groups -= 1
    return groups


def perform_text_analysis(text, positive_words, negative_words):
    """Compute sentiment and readability metrics for ``text``.

    Args:
        text: The text to analyse. Tokens are compared to the word lists
            exactly as tokenized, so callers should normalise case first.
        positive_words: Container of positive sentiment words.
        negative_words: Container of negative sentiment words.

    Returns:
        A dict with the keys ``positive_score``, ``negative_score``,
        ``polarity_score``, ``subjectivity_score``,
        ``average_sentence_length``, ``percentage_complex_words``,
        ``fog_index``, ``average_word_length`` and ``total_words``.
        ``total_words`` counts every token, punctuation tokens included.
        The readability metrics are 0 when the text contains no words.
    """
    # Tokenize text
    tokens = word_tokenize(text)
    total_words = len(tokens)

    # Polarity and subjectivity; the small epsilon avoids dividing by zero.
    positive_score = sum(1 for word in tokens if word in positive_words)
    negative_score = sum(1 for word in tokens if word in negative_words)
    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    # Strip punctuation and drop tokens that were punctuation only.
    stripped = (re.sub(r'[^\w\s]', '', token) for token in tokens)
    words = [word for word in stripped if word]
    word_count = len(words)

    # A sentence ends at '.', '!' or '?'. Empty segments (such as the one
    # after a final full stop) are not sentences.
    sentences = [part for part in re.split(r'[.!?]+', text) if part.strip()]
    sentence_count = max(len(sentences), 1)

    if word_count:
        average_sentence_length = word_count / sentence_count
        # Complex words are those with more than two syllables.
        complex_count = sum(1 for word in words if _count_syllables(word) > 2)
        percentage_complex_words = complex_count / word_count
        average_word_length = sum(len(word) for word in words) / word_count
    else:
        # No words at all: report zeros instead of raising ZeroDivisionError.
        average_sentence_length = 0
        percentage_complex_words = 0
        average_word_length = 0

    # percentage_complex_words is a 0-1 fraction here, as it was before.
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    return {
        'positive_score': positive_score,
        'negative_score': negative_score,
        'polarity_score': polarity_score,
        'subjectivity_score': subjectivity_score,
        'average_sentence_length': average_sentence_length,
        'percentage_complex_words': percentage_complex_words,
        'fog_index': fog_index,
        'average_word_length': average_word_length,
        'total_words': total_words
    }

# Function to read URLs from CSV


def read_urls_from_csv(csv_file):
    """Read a CSV file and map each row's ``URL_ID`` to its ``URL``.

    The file is decoded as latin-1 and NUL characters are removed from every
    line before it is parsed. When a ``URL_ID`` repeats, the last row wins.

    Args:
        csv_file: Path of a CSV file with ``URL_ID`` and ``URL`` columns.

    Returns:
        A dict of ``URL_ID`` -> ``URL`` in file order.

    Raises:
        KeyError: If a row lacks the ``URL_ID`` or ``URL`` column.
    """
    with open(csv_file, 'r', newline='', encoding='latin-1') as handle:
        # Strip NUL characters from each line before the csv module sees it.
        sanitized_lines = (raw_line.replace('\0', '') for raw_line in handle)
        return {record['URL_ID']: record['URL'] for record in csv.DictReader(sanitized_lines)}

# Function to extract article text
def extract_article_text(url, timeout=30):
    """Download ``url`` and extract the page title and article text.

    The article body is taken from the first match of, in order: an element
    with class ``td-post-content``, a ``div`` with class ``type-post``, or an
    ``article`` element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(title, article_text)`` on success; ``title`` is an empty string
        when the page has no ``<title>`` tag. ``(None, None)`` when the
        request fails, the status is not 200, or no article container is
        found. Failures are reported with ``print`` rather than raised.
    """
    try:
        # Without a timeout a stalled server would block the whole run.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred while fetching URL: {url}\n{str(e)}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to fetch URL: {url}")
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.title is None when the page has no <title>; keep the article
    # instead of failing on the attribute access.
    if soup.title is not None:
        title = soup.title.text.strip()
    else:
        print(f"No <title> found for URL: {url}")
        title = ''

    # Try the known article containers from most to least specific.
    article_content = soup.find(class_='td-post-content')
    if not article_content:
        article_content = soup.find('div', class_='type-post')
    if not article_content:
        article_content = soup.find('article')

    if not article_content:
        print(f"No article content found for URL: {url}")
        return None, None

    return title, article_content.get_text()

# Function to save article text
def save_article_text(url_id, title, text, output_dir='extracted_texts'):
    """Write an article to ``<output_dir>/<url_id>.txt`` as UTF-8.

    The file holds a ``Title: ...`` line, a blank line, then the article text.
    An existing file with the same name is overwritten.

    Args:
        url_id: Identifier used as the file name (without extension).
        title: Article title written on the first line.
        text: Article body.
        output_dir: Directory to write into; created if it does not exist.

    Raises:
        OSError: If the directory cannot be created or the file written.
    """
    # exist_ok=True avoids the race between checking for the directory and
    # creating it.
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, f"{url_id}.txt")
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(f"Title: {title}\n\n")
        file.write(text)
    print(f"Article text saved for URL_ID: {url_id}")

# Function to save output data structure to Excel
def save_output_to_excel(output_data, output_excel_file):
    """Write ``output_data`` to an Excel file through a pandas DataFrame.

    Args:
        output_data: Data accepted by ``pd.DataFrame`` (the caller in this
            file passes a list of per-article result dicts).
        output_excel_file: Destination path of the workbook.
    """
    # The DataFrame index is not written as a column.
    pd.DataFrame(output_data).to_excel(output_excel_file, index=False)
    print(f"Output data structure saved to {output_excel_file}")

def main(csv_file,
         stop_words_dir=r'C:\Users\samal\Desktop\assignment_b\stopwords',
         sentiment_words_dir=r'C:\Users\samal\Desktop\assignment_b\sentiment_words',
         output_excel_file="Output_Data.xlsx"):
    """Fetch every article listed in ``csv_file``, analyse it and save the results.

    For each URL the article is downloaded, stripped of stop words, analysed,
    and its raw text saved to disk. All analysis rows are then written to one
    Excel file. URLs whose article text could not be extracted are skipped.

    Args:
        csv_file: CSV file with ``URL_ID`` and ``URL`` columns.
        stop_words_dir: Directory containing the stop-word files.
        sentiment_words_dir: Directory containing ``positive_words.txt`` and
            ``negative_words.txt``.
        output_excel_file: Path of the Excel file to write.
    """
    urls = read_urls_from_csv(csv_file)
    stop_words = read_stop_words(stop_words_dir)
    positive_words, negative_words = read_sentiment_words(sentiment_words_dir)

    output_data = []
    for url_id, url in urls.items():
        title, text = extract_article_text(url)
        # Only the text is required; an article with an empty title is still
        # worth analysing.
        if not text:
            continue
        title = title or ''
        cleaned_text = clean_text(text, stop_words)
        text_analysis = perform_text_analysis(cleaned_text, positive_words, negative_words)
        text_analysis['url_id'] = url_id
        output_data.append(text_analysis)
        save_article_text(url_id, title, text)

    # Save output data structure to Excel
    save_output_to_excel(output_data, output_excel_file)

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    csv_file_path = r'C:\Users\samal\Desktop\assignment_b\Input.csv'  # Specify the path to your CSV file here
    main(csv_file_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Article text saved for URL_ID: blackassign0001
Article text saved for URL_ID: blackassign0002
Article text saved for URL_ID: blackassign0003
Article text saved for URL_ID: blackassign0004
Article text saved for URL_ID: blackassign0005
Article text saved for URL_ID: blackassign0006
Article text saved for URL_ID: blackassign0007
Article text saved for URL_ID: blackassign0008
Article text saved for URL_ID: blackassign0009
Article text saved for URL_ID: blackassign0010
Article text saved for URL_ID: blackassign0011
Article text saved for URL_ID: blackassign0012
Article text saved for URL_ID: blackassign0013
Article text saved for URL_ID: blackassign0014
Article text saved for URL_ID: blackassign0015
Article text saved for URL_ID: blackassign0016
Article text saved for URL_ID: blackassign0017
Article text saved for URL_ID: blackassign0018
Article text saved for URL_ID: blackassign0019
Article text saved for URL_ID: blackassign0020
Article text saved for URL_ID: blackassign0021
Article text 