In [16]:
import pandas as pd
import nltk
# nltk.word_tokenize (used by preprocess_text) needs the Punkt sentence models.
# Recent NLTK releases (3.9+) load them from the 'punkt_tab' package instead of
# the pickled 'punkt' one, so fetch both to keep a fresh kernel working on
# either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import requests
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [17]:
import chardet


In [18]:
# Load the input spreadsheet into a DataFrame; the scoring loop further down
# reads each row's 'URL' column and writes the metric columns back onto it.
data = pd.read_excel('/content/Input.xlsx')

In [19]:
def _load_word_list(file_path, fallback_encoding='latin-1'):
    """Read a one-entry-per-line lexicon file into a set of strings.

    Args:
        file_path: Path of the word-list file.
        fallback_encoding: Encoding used when chardet cannot guess one.

    Returns:
        set[str]: The distinct lines of the file (line endings removed).
    """
    with open(file_path, 'rb') as file:
        # Read once and reuse the bytes for both detection and decoding.
        raw_bytes = file.read()
    # chardet reports encoding=None when it cannot make a guess (e.g. an empty
    # file); passing None to bytes.decode() would raise TypeError.
    encoding = chardet.detect(raw_bytes)['encoding'] or fallback_encoding
    return set(raw_bytes.decode(encoding).splitlines())


# Kept as sets: the score functions only test membership (`word in ...`),
# which is O(1) on a set instead of a linear scan per token on a list.
positive_words = _load_word_list('/content/positive-words.txt')
negative_words = _load_word_list('/content/negative-words.txt')

In [20]:
import nltk

# Fetch NLTK's 'stopwords' corpus.
# NOTE(review): `stopwords` is imported from nltk.corpus in the first cell, but
# no visible cell reads it -- filtering uses the custom StopWords_*.txt lists
# loaded below. Confirm whether this download is still needed.
nltk.download('stopwords')

def load_stopwords(file_path):
    """Load a stopword file into a set of lower-cased words.

    The file is read as latin-1, one entry per line. Entries are lower-cased
    because preprocess_text compares them against lower-cased tokens; without
    that, an upper-case entry could never match. Blank lines are skipped.

    Args:
        file_path: Path of the stopword file.

    Returns:
        set[str]: The normalised stopwords.
    """
    stop_words = set()
    with open(file_path, 'r', encoding='latin-1') as file:
        for line in file:
            # Keep only the text before an optional "|" annotation.
            # NOTE(review): assumes "WORD | comment" lines may occur in these
            # lists -- confirm against the actual files. Tokens containing "|"
            # can never pass the isalnum() filter, so this cannot drop a match.
            word = line.split('|', 1)[0].strip().lower()
            if word:
                stop_words.add(word)
    return stop_words

# Load the seven category stopword files, one set per file, in the order listed.
# NOTE(review): in the visible cells preprocess_text only consults
# stop_words_auditor; the other six sets are loaded but not referenced.
(
    stop_words_auditor,
    stop_words_currencies,
    stop_words_dates_and_numbers,
    stop_words_generic,
    stop_words_genericlong,
    stop_words_geographic,
    stop_words_names,
) = map(load_stopwords, (
    '/content/StopWords_Auditor.txt',
    '/content/StopWords_Currencies.txt',
    '/content/StopWords_DatesandNumbers.txt',
    '/content/StopWords_Generic.txt',
    '/content/StopWords_GenericLong.txt',
    '/content/StopWords_Geographic.txt',
    '/content/StopWords_Names.txt',
))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
def preprocess_text(text, stop_words=None):
    """Turn raw (possibly HTML) text into a list of cleaned, lower-cased tokens.

    Steps: strip markup with BeautifulSoup, tokenise with nltk.word_tokenize,
    then keep only alphanumeric tokens that are not stopwords.

    Args:
        text: Raw text or HTML.
        stop_words: Container of lower-case words to drop. Defaults to
            stop_words_auditor, the only list the original code applied; pass
            a union of the other loaded lists to filter more aggressively.

    Returns:
        list[str]: Lower-cased tokens in document order. Punctuation tokens
        (including '.') never survive, because they fail isalnum().
    """
    if stop_words is None:
        stop_words = stop_words_auditor
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    clean_tokens = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()  # lower-case once, reuse for test and output
        if token.isalnum() and lowered not in stop_words:
            clean_tokens.append(lowered)
    return clean_tokens


In [22]:
def extract_text(url, timeout=30):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before requests raises a
            Timeout. Without it, requests.get can block indefinitely.

    Returns:
        str: The page text with <script>/<style> contents removed and a space
        inserted between the text of adjacent elements.

    Raises:
        requests.RequestException: On connection failures or timeouts.

    Note:
        The HTTP status is not checked, so the body of an error page
        (e.g. a 404) is parsed like any other response.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    # get_text() would otherwise return JavaScript/CSS source as if it were prose.
    for tag in soup.find_all(['script', 'style']):
        tag.decompose()
    # A separator stops "<p>one</p><p>two</p>" from collapsing into "onetwo".
    return soup.get_text(separator=' ')

In [33]:
def calculate_positive_score(text):
    """Return how many items of ``text`` are found in ``positive_words``.

    Every occurrence counts, so a repeated positive word is counted each time.
    """
    # Each membership test is a bool; summing them counts the True results.
    return sum(token in positive_words for token in text)

def calculate_negative_score(text):
    """Return how many items of ``text`` are found in ``negative_words``.

    Every occurrence counts, so a repeated negative word is counted each time.
    """
    # Each membership test is a bool; summing them counts the True results.
    return sum(token in negative_words for token in text)

def count_complex_words(text):
    """Return the number of items in ``text`` with more than two syllables.

    Syllables are estimated by syllable_count.
    """
    complex_total = 0
    for token in text:
        if syllable_count(token) > 2:
            complex_total += 1
    return complex_total

def syllable_count(word):
    """Estimate the syllables in ``word`` as its number of vowel runs.

    A run is one or more consecutive letters from a/e/i/o/u (either case).
    This is a simplified heuristic: 'y' is never treated as a vowel and
    silent vowels are still counted.
    """
    vowel_runs = re.finditer(r'[aeiouAEIOU]+', word)
    return sum(1 for _ in vowel_runs)

def calculate_metrics(text):
    """Compute the sentiment and readability metrics for one document.

    The previous version mixed string and token-list semantics: on a token
    list nltk.sent_tokenize failed, on a string the scores were computed per
    character, and it divided by zero when there was no '.' or no input.

    Args:
        text: Either the raw document string or an iterable of word tokens.
            A string is split into sentences with nltk.sent_tokenize and into
            words with preprocess_text. A token iterable is used as given,
            with '.', '!' and '?' tokens counted as sentence ends rather than
            as words.

    Returns:
        tuple: (polarity_score, subjectivity_score, avg_sentence_length,
        percentage_complex_words, fog_index, avg_words_per_sentence,
        complex_word_count, word_count, personal_pronouns, avg_word_length).
        percentage_complex_words is a fraction in [0, 1]; ratios over an
        empty document are 0.
    """
    sentence_enders = ('.', '!', '?')
    if isinstance(text, str):
        sentence_count = len(nltk.sent_tokenize(text))
        words = preprocess_text(text)
    else:
        tokens = list(text)
        sentence_count = sum(1 for token in tokens if token in sentence_enders)
        words = [token for token in tokens if token not in sentence_enders]
    # Text without any terminator still forms (at most) one sentence; this
    # also keeps the divisions below away from zero.
    sentence_count = max(sentence_count, 1)
    word_count = len(words)

    positive_score = calculate_positive_score(words)
    negative_score = calculate_negative_score(words)
    # The small epsilon keeps both ratios defined when nothing was scored.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

    complex_word_count = count_complex_words(words)
    avg_sentence_length = word_count / sentence_count
    percentage_complex_words = complex_word_count / word_count if word_count else 0
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    # Same quantity as avg_sentence_length: words divided by sentences.
    avg_words_per_sentence = avg_sentence_length
    personal_pronouns = sum(1 for word in words if word.lower() in ['i', 'we', 'my', 'ours', 'us'])
    avg_word_length = sum(len(word) for word in words) / word_count if word_count else 0
    return polarity_score, subjectivity_score, avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, complex_word_count, word_count, personal_pronouns, avg_word_length



In [41]:
# Matches the pronouns as whole words in any case; the all-caps country
# abbreviation "US" is filtered out when counting.
_PERSONAL_PRONOUN_RE = re.compile(r'\b(?:I|we|my|ours|us)\b', re.IGNORECASE)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


def _compute_row_metrics(raw_text):
    """Compute all output metrics for one page and return them as a dict.

    Keys are the output column names, in the order they are written.
    """
    tokens = preprocess_text(raw_text)
    word_count = len(tokens)

    positive_score = calculate_positive_score(tokens)
    negative_score = calculate_negative_score(tokens)
    scored_words = positive_score + negative_score

    # preprocess_text drops every non-alphanumeric token, so '.' never reaches
    # `tokens` and counting it there always gave 0. Count sentences on the raw
    # text instead; empty text falls back to 1 to keep the division defined.
    sentence_count = len(nltk.sent_tokenize(raw_text)) or 1
    avg_sentence_length = _safe_ratio(word_count, sentence_count)

    complex_word_count = count_complex_words(tokens)
    percentage_complex_words = _safe_ratio(complex_word_count, word_count)

    # Counted on the raw text: the cleaned tokens are lower-cased, which made
    # "US" indistinguishable from the pronoun "us".
    personal_pronouns = sum(
        1 for match in _PERSONAL_PRONOUN_RE.finditer(raw_text)
        if match.group() != 'US'
    )

    return {
        'Positive_Score': positive_score,
        'Negative_Score': negative_score,
        'Polarity_Score': _safe_ratio(positive_score - negative_score, scored_words),
        'Subjectivity_Score': _safe_ratio(scored_words, word_count),
        'Avg_Sentence_Length': avg_sentence_length,
        'Percentage_of_Complex_Words': percentage_complex_words,
        'FOG_Index': 0.4 * (avg_sentence_length + percentage_complex_words),
        # Same definition as Avg_Sentence_Length: words per sentence.
        'Avg_Words_Per_Sentence': avg_sentence_length,
        'Complex_Word_Count': complex_word_count,
        'Word_Count': word_count,
        'Personal_Pronouns': personal_pronouns,
        'Avg_Word_Length': _safe_ratio(sum(len(word) for word in tokens), word_count),
    }


metric_rows = []
for index, row in data.iterrows():
    url = row['URL']
    try:
        # Extract text from URL
        text = extract_text(url)
    except requests.RequestException as error:
        # One unreachable URL must not abort the whole batch: report it and
        # score the row as an empty document (all metrics 0).
        print(f'Could not fetch {url}: {error}')
        text = ''
    metric_rows.append(_compute_row_metrics(text))

# Build the metric columns once instead of writing cell by cell with .at.
# Values are assigned positionally, and re-running the cell overwrites the
# same columns.
metrics = pd.DataFrame(metric_rows)
for column in metrics.columns:
    data[column] = metrics[column].to_numpy()


In [42]:
# Save results to Excel: writes `data` (the input columns plus the metric
# columns added by the scoring loop) to 'output_file.xlsx'. The path is
# relative, so the file lands in the current working directory; index=False
# leaves the DataFrame index out of the sheet.
data.to_excel('output_file.xlsx', index=False)