# In [None]:
import pandas as pd
import nltk
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Fetch the 'punkt' NLTK resource before any tokenizing is attempted
nltk.download("punkt")


# Function to read files
def read_files(file_paths):
    """Read UTF-8 text files and return their contents.

    Args:
        file_paths: An iterable of paths, or a single path given as ``str``,
            ``bytes`` or ``os.PathLike``. A lone path is wrapped in a list;
            without that, a string would be iterated character by character
            and each character would be opened as if it were a file name.

    Returns:
        list[str]: The content of every file that could be read, in input
        order. Files that cannot be read are reported on stdout and skipped,
        so the result may be shorter than the input.
    """
    import os

    # Treat a bare path as a one-element collection of paths
    if isinstance(file_paths, (str, bytes, os.PathLike)):
        file_paths = [file_paths]

    contents = []
    for file_path in file_paths:
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                contents.append(file.read())
        except FileNotFoundError:
            print(f"File {file_path} not found.")
        except Exception as e:
            # Deliberately best-effort: report the problem and keep going
            print(f"Error reading file {file_path}: {e}")
    return contents
# Custom stopword lists that get merged into a single file
file_paths = [
    "Stopwords/StopWords_Auditor.txt",
    "Stopwords/StopWords_DatesandNumbers.txt",
    "Stopwords/StopWords_Generic.txt",
    "Stopwords/StopWords_GenericLong.txt",
    "Stopwords/StopWords_Names.txt"
]

# Call the function and pass the file paths
file_contents = read_files(file_paths)

# Write the contents of each file to "stopwords_1.txt".
# Mode 'w' (not 'a') so that re-running the script replaces the merged file
# instead of appending another full copy of every list to it.
with open("stopwords_1.txt", 'w', encoding='utf-8') as f:
    for content in file_contents:
        f.write(content)
        # Keep the last word of one list from fusing with the first word of
        # the next when a source file lacks a trailing newline
        if content and not content.endswith('\n'):
            f.write('\n')

# Function to extract text from URL
def extract_text(url):
    """Fetch one page and return its title and body text.

    The page identified by ``url`` is downloaded, the first element with
    class ``entry-title`` is used as the title and the first element with
    class ``td-post-content`` as the body. On success the merged
    ``"<title>. <content>"`` text is also appended, one page per line, to
    ``scraped_data.txt``.

    Args:
        url: Address of the page to fetch. ``http://`` is prepended when no
            scheme is present. A missing value (NaN/None) is not fetched.

    Returns:
        tuple[str, str]: ``(title, content)``. Each part falls back to
        ``"Title not found"`` / ``"Content not found"`` when the URL is
        missing, the request fails, or the element is absent from the page.
    """
    title = "Title not found"
    content = "Content not found"

    # Nothing to fetch for an empty cell
    if pd.isna(url):
        return title, content

    url = str(url).strip()

    # Check if the URL has a schema (e.g., 'http://' or 'https://')
    if not url.startswith(('http://', 'https://')):
        # Add 'http://' as the default schema if it's missing
        url = 'http://' + url

    try:
        # A timeout keeps one unresponsive host from stalling the whole run;
        # timeouts are RequestException subclasses, so they are handled below
        page = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return title, content

    soup = BeautifulSoup(page.content, 'html.parser')

    # Extracting title
    title_element = soup.find(attrs={'class': 'entry-title'})
    if title_element:
        title = title_element.text.strip()

    # Extracting content
    content_element = soup.find(attrs={'class': 'td-post-content'})
    if content_element:
        content = content_element.text.replace('\xa0', ' ').replace('\n', ' ')

    # Append mode on purpose: this function is called once per URL and every
    # page adds one line to the scrape log
    with open("scraped_data.txt", 'a', encoding='utf-8') as f:
        f.write(f"{title}. {content}\n")

    return title, content


def positive(file_path_pos):
    """Load the positive-word list from a latin-1 encoded text file.

    Every line of the file becomes one entry, with surrounding whitespace
    (including the line terminator) removed. Blank lines are kept as empty
    strings.
    """
    with open(file_path_pos, mode='r', encoding='latin-1') as lexicon:
        return [entry.strip() for entry in lexicon]

def negative(file_path_neg):
    """Load the negative-word list from a latin-1 encoded text file.

    Every line of the file becomes one entry, with surrounding whitespace
    (including the line terminator) removed. Blank lines are kept as empty
    strings.
    """
    with open(file_path_neg, mode='r', encoding='latin-1') as lexicon:
        return [entry.strip() for entry in lexicon]

# Read positive and negative words
file_path_pos = "positive-words.txt"
file_path_neg = "negative-words.txt"

# Use the dedicated loaders, which return one stripped word per line.
# read_files() is meant for a collection of paths and returns whole-file
# strings, so it cannot produce the word lists the scoring code needs.
# NOTE: unlike read_files(), these loaders raise if a lexicon file is missing.
positive_words = positive(file_path_pos)
negative_words = negative(file_path_neg)
   
    
# Function to compute textual analysis variables
def compute_text_variables(text):
    """Compute sentiment and readability metrics for one document.

    Relies on the module-level ``file_paths`` (stopword files),
    ``positive_words`` and ``negative_words``.

    Args:
        text: The document text to analyse.

    Returns:
        list: Thirteen values, in this order: positive score, negative score,
        polarity score, subjectivity score (placeholder, always 0), average
        sentence length, percentage of complex words, fog index, average
        number of words per sentence, complex word count, word count,
        syllables per word (placeholder, always 0), personal pronoun count,
        average word length. Ratios are 0.0 when the text has no tokens or
        no sentences instead of raising ``ZeroDivisionError``.
    """
    # read_files() yields one big string per file, so the individual words
    # have to be pulled out before they can be compared with tokens.
    stopword_set = set()
    for file_content in read_files(file_paths):
        for line in file_content.splitlines():
            fields = line.split()
            if fields:
                # assumes one stopword per line, taken as the first
                # whitespace-delimited field - TODO confirm against the files
                stopword_set.add(fields[0].lower())

    # Lower-cased sets: O(1) membership and case-insensitive matching
    positive_set = {word.lower() for word in positive_words}
    negative_set = {word.lower() for word in negative_words}
    pronoun_set = {"i", "me", "my", "mine", "myself",
                   "we", "us", "our", "ours", "ourselves"}

    # Tokenize text and remove stopwords (case-insensitively).
    # NOTE(review): tokens are not filtered for punctuation here, so any
    # punctuation tokens produced by word_tokenize count toward the totals.
    tokens = [token for token in word_tokenize(text)
              if token.lower() not in stopword_set]
    lowered_tokens = [token.lower() for token in tokens]

    word_count = len(tokens)
    sentence_count = len(sent_tokenize(text))  # tokenize sentences only once

    # Sentiment scores
    positive_score = sum(1 for word in lowered_tokens if word in positive_set)
    negative_score = sum(1 for word in lowered_tokens if word in negative_set)
    polarity_score = (positive_score - negative_score) / max(1, positive_score + negative_score)
    subjectivity_score = 0  # You need a more advanced method to calculate subjectivity score

    # Readability metrics; every division is guarded against empty input
    avg_sentence_length = word_count / sentence_count if sentence_count else 0.0
    complex_words = [token for token in tokens if len(token) > 6]  # Assume words with more than 6 characters are complex
    complex_word_count = len(complex_words)
    percentage_complex_words = (complex_word_count / word_count) * 100 if word_count else 0.0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_num_words_per_sentence = avg_sentence_length  # same ratio as above
    syllable_per_word = 0  # You need a syllable count function to calculate this
    personal_pronouns = sum(1 for word in lowered_tokens if word in pronoun_set)
    avg_word_length = sum(len(word) for word in tokens) / word_count if word_count else 0.0

    return [positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
            percentage_complex_words, fog_index, avg_num_words_per_sentence, complex_word_count,
            word_count, syllable_per_word, personal_pronouns, avg_word_length]


# Load input data
data = pd.read_csv('input.csv')
# View of the input without URL_ID; kept for any code that reads the
# module-level `df`
df = data.drop('URL_ID', axis=1)

# Extract text from URLs, compute variables, and save output.
# Iterate over `data`, not `df`: `df` has had URL_ID dropped, so reading
# row["URL_ID"] from its rows raises KeyError.
output_data = []
for _, row in data.iterrows():
    url_id = row["URL_ID"]
    url = row["URL"]
    title, content = extract_text(url)
    text = f"{title}. {content}"
    text_variables = compute_text_variables(text)
    output_data.append([url_id, title] + text_variables)

# Create DataFrame for output data
output_columns = ["URL_ID", "Article_Title", "POSITIVE_SCORE", "NEGATIVE_SCORE", "POLARITY_SCORE", "SUBJECTIVITY_SCORE",
                  "AVG_SENTENCE_LENGTH", "PERCENTAGE_OF_COMPLEX_WORDS", "FOG_INDEX", "AVG_NUMBER_OF_WORDS_PER_SENTENCE",
                  "COMPLEX_WORD_COUNT", "WORD_COUNT", "SYLLABLE_PER_WORD", "PERSONAL_PRONOUNS", "AVG_WORD_LENGTH"]
output_df = pd.DataFrame(output_data, columns=output_columns)

# Save output to Excel file
output_df.to_excel("TextAnalysisOutput.xlsx", index=False)
