In [11]:
def read_file_to_list(file_path, encoding='utf-8'):
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            content_list = file.readlines()
            content_list = [line.rstrip('\n').lower() for line in content_list]  # Convert to lowercase
            return content_list
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except UnicodeDecodeError:
        print(f"Error decoding file. Try using a different encoding.")
        return None
if __name__ == "__main__":
    positive_file_path = "Text/positive-words.txt"
    negative_file_path = "Text/negative-words.txt"
    positive_words = read_file_to_list(positive_file_path, encoding='latin-1')
    negative_words = read_file_to_list(negative_file_path, encoding='latin-1')

In [12]:
def append_stop_words(file_path, split_character=" | ", encoding='latin-1'):
    words = read_file_to_list(file_path, encoding=encoding)
    if words:
        if split_character:
            words = [word.split(split_character)[0].lower() for word in words]
        stop_words.extend(words)
stop_words = []
file_paths = [
    "Text/StopWords_Geographic.txt",
    "Text/StopWords_GenericLong.txt",
    "Text/StopWords_Generic.txt",
    "Text/StopWords_Names.txt",
    "Text/StopWords_DatesandNumbers.txt",
    "Text/StopWords_Currencies.txt",
    "Text/StopWords_Auditor.txt"
]
for file_path in file_paths:
    append_stop_words(file_path)
print("Length of the merged stop_words list:", len(stop_words))
print("Length of the merged positive_words list:", len(positive_words))
print("Length of the merged negative_words list:", len(negative_words))

Length of the merged stop_words list: 14107
Length of the merged positive_words list: 2006
Length of the merged negative_words list: 4783


In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Function to tokenize text and remove stopwords
def tokenize_and_remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words and word.isalpha()]
    return filtered_words

# Function to calculate Positive Score
def calculate_positive_score(text, positive_words):
    positive_words_in_text = [word for word in tokenize_and_remove_stopwords(text) if word in positive_words]
    return len(positive_words_in_text)

# Function to calculate Negative Score
def calculate_negative_score(text, negative_words):
    negative_words_in_text = [word for word in tokenize_and_remove_stopwords(text) if word in negative_words]
    return -len(negative_words_in_text)

# Function to calculate Polarity Score
def calculate_polarity_score(positive_score, negative_score):
    denominator = (positive_score + negative_score + 0.000001)
    return (positive_score - negative_score) / denominator

# Function to calculate Subjectivity Score
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    denominator = (total_words + 0.000001)
    return (positive_score + negative_score) / denominator

# Function to calculate Average Sentence Length
def calculate_average_sentence_length(text):
    sentences = nltk.sent_tokenize(text) if text else []
    words = tokenize_and_remove_stopwords(text)
    
    # Check if there are sentences to avoid division by zero
    if len(sentences) > 0:
        return len(words) / len(sentences)
    else:
        return 0  # Return 0 if there are no sentences

# Function to calculate Percentage of Complex Words
def calculate_percentage_complex_words(text):
    words = tokenize_and_remove_stopwords(text)
    
    # Check if there are words to avoid division by zero
    if len(words) > 0:
        complex_words = [word for word in words if len(word) > 2]
        return len(complex_words) / len(words)
    else:
        return 0  # Return 0 if there are no words

# Function to calculate Fog Index
def calculate_fog_index(average_sentence_length, percentage_complex_words):
    return 0.4 * (average_sentence_length + percentage_complex_words)

# Function to calculate Average Number of Words Per Sentence
def calculate_average_words_per_sentence(text):
    words = tokenize_and_remove_stopwords(text)
    sentences = nltk.sent_tokenize(text) if text else []
    
    # Check if there are sentences to avoid division by zero
    if len(sentences) > 0:
        return len(words) / len(sentences)
    else:
        return 0  # Return 0 if there are no sentences

# Function to count complex words
def count_complex_words(text):
    words = tokenize_and_remove_stopwords(text)
    complex_words = [word for word in words if len(word) > 2]
    return len(complex_words)

# Function to count syllables per word
def count_syllables_per_word(word):
    vowels = 'aeiouy'
    word = word.lower()
    count = 0

    if word[0] in vowels:
        count += 1

    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1

    if word.endswith('e'):
        count -= 1

    if count == 0:
        count += 1

    return count

# Function to calculate Personal Pronouns count
def calculate_personal_pronouns_count(text):
    pronouns = ['i', 'we', 'my', 'ours', 'us']
    words = tokenize_and_remove_stopwords(text)
    pronoun_count = sum(1 for word in words if word.lower() in pronouns)
    return pronoun_count

def calculate_average_word_length(text):
    words = tokenize_and_remove_stopwords(text)
    
    # Check if there are words to avoid division by zero
    if len(words) > 0:
        total_characters = sum(len(word) for word in words)
        return total_characters / len(words)
    else:
        return 0  # Return 0 if there are no words

# Function to calculate all metrics together
def calculate_all_metrics(text, positive_words, negative_words):
    positive_score = calculate_positive_score(text, positive_words)
    negative_score = calculate_negative_score(text, negative_words)
    total_words = len(tokenize_and_remove_stopwords(text))
    average_sentence_length = calculate_average_sentence_length(text)
    percentage_complex_words = calculate_percentage_complex_words(text)
    fog_index = calculate_fog_index(average_sentence_length, percentage_complex_words)
    average_words_per_sentence = calculate_average_words_per_sentence(text)
    complex_word_count = count_complex_words(text)
    syllables_per_word = sum(count_syllables_per_word(word) for word in tokenize_and_remove_stopwords(text))
    personal_pronouns_count = calculate_personal_pronouns_count(text)
    average_word_length = calculate_average_word_length(text)

    polarity_score = calculate_polarity_score(positive_score, negative_score)
    subjectivity_score = calculate_subjectivity_score(positive_score, negative_score, total_words)

    return {
        "Positive Score": positive_score,
        "Negative Score": negative_score,
        "Polarity Score": polarity_score,
        "Subjectivity Score": subjectivity_score,
        "Avg Sentence Length": average_sentence_length,
        "Percentage of Complex Words": percentage_complex_words,
        "Fog Index": fog_index,
        "Avg Number of Words Per Sentence": average_words_per_sentence,
        "Complex Word Count": complex_word_count,
        "Syllable Per Word": syllables_per_word,
        "Personal Pronouns Count": personal_pronouns_count,
        "Avg Word Length": average_word_length
    }

# Example usage:
text_for_analysis = "Certainly! I'll define a function that utilizes the previously created functions to calculate all the specified metrics. Additionally, I'll use the provided positive_words and negative_words lists for the positive and negative sets of words."

metrics_result = calculate_all_metrics(text_for_analysis, positive_words, negative_words)
print(metrics_result)


{'Positive Score': 1, 'Negative Score': -1, 'Polarity Score': 2000000.0, 'Subjectivity Score': 0.0, 'Avg Sentence Length': 5.0, 'Percentage of Complex Words': 1.0, 'Fog Index': 2.4000000000000004, 'Avg Number of Words Per Sentence': 5.0, 'Complex Word Count': 15, 'Syllable Per Word': 37, 'Personal Pronouns Count': 0, 'Avg Word Length': 7.6}


In [14]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Load the input data
input_data = pd.read_excel('Text/Input.xlsx')

# Initialize an empty list to store the results
results = []

# Function to save text to a file
def save_text_to_file(url_id, url, title, article_text, file_name):
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(f"URL_ID: {url_id}\n")
        file.write(f"URL: {url}\n")
        file.write(f"Title: {title}\n\n")
        file.write(article_text)

# Iterate through each article
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    # Fetch content from the URL
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad requests

        # Check if the status code is 404
        if response.status_code == 404:
            print(f"Error: URL not found - {url}")
            continue  # Skip the rest of the operations for this URL

        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract title
        title = soup.title.text.strip()
        
        # Extract text from the webpage (you might need to adjust this based on the HTML structure)
        article_text = soup.find('div', class_="td-post-content tagdiv-type")
        
        if article_text is None:
            article_text = soup.find('div', class_="tdb_single_content")
        
        # Check if the element is found before getting the text
        if article_text:
            article_text = article_text.get_text(separator='\n')
        else:
            article_text = ""
    except Exception as e:
        print(f"Error fetching content from URL: {url}\n{e}")
        title = ""
        article_text = ""  # Set empty text in case of an error

    # Save the article text to a file
    file_name = f"Text/{url_id}.txt"  # Adjust the path as needed
    save_text_to_file(url_id, url, title, article_text, file_name)

    # Perform textual analysis
    metrics_result = calculate_all_metrics(article_text, positive_words, negative_words)
    
    # Append the results to the list
    results.append({
        'URL_ID': url_id,
        'Title': title,
        **metrics_result
    })

# Create a DataFrame from the results
output_data = pd.DataFrame(results)

# Save the output data to 'Output Data Structure.xlsx'
output_data.to_excel('Text/Output Data Structure.xlsx', index=False)


Error fetching content from URL: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
404 Client Error: Not Found for url: https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/
Error fetching content from URL: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/
404 Client Error: Not Found for url: https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/


In [16]:
output_data

Unnamed: 0,URL_ID,Title,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Avg Sentence Length,Percentage of Complex Words,Fog Index,Avg Number of Words Per Sentence,Complex Word Count,Syllable Per Word,Personal Pronouns Count,Avg Word Length
0,blackassign0001,Rising IT cities and its impact on the economy...,26,-6,1.600000,0.042105,6.089744,1.000000,2.835897,6.089744,475,1060,0,6.894737
1,blackassign0002,Rising IT Cities and Their Impact on the Econo...,51,-31,4.100000,0.030534,8.187500,1.000000,3.675000,8.187500,655,1721,0,7.795420
2,blackassign0003,"Internet Demand's Evolution, Communication Imp...",36,-23,4.538461,0.022375,10.192982,1.000000,4.477193,10.192982,581,1654,0,8.390706
3,blackassign0004,Rise of Cybercrime and its Effect in upcoming ...,35,-74,-2.794872,-0.067826,11.057692,1.000000,4.823077,11.057692,575,1574,0,8.231304
4,blackassign0005,OTT platform and its impact on the entertainme...,20,-8,2.333333,0.037736,8.153846,1.000000,3.661538,8.153846,318,780,0,7.827044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,blackassign0096,Due to the COVID-19 the repercussion of the en...,25,-54,-2.724138,-0.057087,10.160000,1.000000,4.464000,10.160000,508,1247,0,7.421260
96,blackassign0097,Impact of COVID-19 pandemic on office space an...,21,-35,-4.000000,-0.036269,10.157895,0.997409,4.462122,10.157895,385,817,0,6.860104
97,blackassign0098,Contribution of handicrafts (Visual Arts & Lit...,5,-3,3.999998,0.010363,8.772727,0.989637,3.904946,8.772727,191,450,0,7.305699
98,blackassign0099,How COVID-19 is impacting payment preferences?...,12,-3,1.666666,0.037500,7.272727,0.966667,3.295758,7.272727,232,504,0,6.591667
