In [1]:
import pandas as pd
import urllib
import requests
from bs4 import BeautifulSoup as bs
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance; Analysis.text_corpus calls Lem.lemmatize on every token.
Lem = WordNetLemmatizer()

# The notebook uses word_tokenize / sent_tokenize (Punkt models) and
# WordNetLemmatizer (WordNet corpus) in addition to omw-1.4, so fetch those as
# well; otherwise a fresh environment fails with LookupError on first use.
# NOTE(review): 'punkt_tab' is what newer NLTK releases look up instead of
# 'punkt'; on releases that do not know the id, the download just returns False.
for _resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_resource, quiet=True)

# Kept as the last expression so the cell still displays this call's result.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
class Data_ingestion:
    """Load the input spreadsheet, scrape every listed URL and save the article text.

    Typical use: construct with the spreadsheet path, then call ``secondary``,
    which reads the sheet, downloads each URL, writes one text file per article
    and returns the scraped rows together with the URLs that could not be fetched.
    """

    def __init__(self, file_path):
        """Remember the spreadsheet path.

        Args:
            file_path: Path passed to ``pd.read_excel`` by ``primary``.
        """
        self.file_path = file_path

    def primary(self):
        """Read the spreadsheet at ``self.file_path``.

        Returns:
            The loaded DataFrame, or ``None`` when reading fails. The error is
            printed rather than raised, so callers must check for ``None``.
        """
        try:
            return pd.read_excel(self.file_path)
        except Exception as e:
            print(f'Error loading file: {e}')
            return None

    def fetch_data_from_url(self, url, timeout=30):
        """Download ``url`` and extract the page title and article lines.

        Args:
            url: Address of the article page.
            timeout: Seconds to wait for the server before giving up. Without a
                timeout a single unresponsive host would stall the whole run.

        Returns:
            ``(article_title, lines)`` where ``lines`` is the text of the
            ``div.td-post-content.tagdiv-type`` element split into lines (an
            empty list when that element is absent), or ``(None, None)`` when
            the request fails or the server answers with an HTTP error status.
        """
        try:
            response = requests.get(url, timeout=timeout)
            # Treat 4xx/5xx answers as failures instead of analysing the error
            # page as if it were an article. HTTPError is a RequestException,
            # so it is reported by the handler below.
            response.raise_for_status()
            soup = bs(response.text, 'html.parser')

            title_tag = soup.find('title')
            article_title = title_tag.text if title_tag else "No Title"

            content = soup.find('div', class_='td-post-content tagdiv-type')
            if content:
                first_data = content.get_text(strip=True, separator="\n").splitlines()
            else:
                first_data = []

            return article_title, first_data

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {url}, Error: {e}")
            return None, None

    def secondary(self):
        """Scrape every URL in the spreadsheet and save each article to disk.

        Returns:
            ``(articles, failures)``. ``articles`` is a DataFrame with columns
            ``URL_ID``, ``URL`` and ``article_words``; ``failures`` is a list of
            ``{'URL_ID', 'URL'}`` dicts for URLs that could not be fetched.
            Returns ``(None, None)`` when the spreadsheet itself cannot be read.
        """
        data = self.primary()
        if data is None:
            return None, None

        updated_list = []
        no_matching_data = []

        # Read the ids positionally: `i` from enumerate is a position, not an
        # index label, so label-based lookup breaks on a non-default index.
        url_ids = data['URL_ID'].tolist() if 'URL_ID' in data else None

        for i, url in enumerate(data.get('URL', [])):
            article_title, first_data = self.fetch_data_from_url(url)

            if first_data is None:
                print(f'No matching data found for URL: {url}')
                # Report the spreadsheet's own id when it has one, so failures
                # can be matched back to the input rows.
                failed_id = url_ids[i] if url_ids is not None else f"blackassign00{i+1}"
                no_matching_data.append({'URL_ID': failed_id, 'URL': url})
                continue

            updated_list.append({
                'URL_ID': url_ids[i] if url_ids is not None else f"assign00{i+1}",
                'URL': url,
                # NOTE(review): first_data is a list, so its repr (brackets,
                # quotes, commas) is embedded here; downstream cleaning strips
                # non-letters, but sentence splitting sees the raw repr.
                'article_words': f'{article_title} - {first_data}'
            })

            # Save the article content to a text file
            self.save_article_to_file(url, article_title, first_data)

        return pd.DataFrame(updated_list), no_matching_data

    def save_article_to_file(self, url, article_title, article_content):
        """Write the title and article lines to ``Text_files/<url-encoded url>.txt``.

        The ``Text_files`` directory is created under the current working
        directory when missing. An empty ``article_content`` is recorded as the
        text ``No data found``.

        Args:
            url: Source URL; its URL-encoded form becomes the file name.
            article_title: Written as the first line of the file.
            article_content: Iterable of text lines making up the article body.
        """
        # A bare `import urllib` does not guarantee the `parse` submodule is
        # loaded, so import the function explicitly.
        from urllib.parse import quote_plus

        file_name = quote_plus(url)  # URL-encoded file name
        dir_path = os.path.join(os.getcwd(), 'Text_files')
        os.makedirs(dir_path, exist_ok=True)

        with open(os.path.join(dir_path, f'{file_name}.txt'), 'w', encoding='utf-8') as file1:
            file1.write(article_title + "\n")
            if not article_content:
                file1.write("No data found")
            else:
                file1.write('\n'.join(article_content))

In [3]:
class Analysis:
    """Text-cleaning and readability helpers used to score scraped articles."""

    def StopWords_data(self, file_path=r'C:\Users\user\Downloads\StopWords-20241019T100716Z-001\StopWords'):
        """Open the seven stop-word list files found in ``file_path``.

        Args:
            file_path: Directory containing the ``StopWords_*.txt`` files.

        Returns:
            A 7-tuple of open text file objects (ISO-8859-1), in the order
            Auditor, Currencies, DatesandNumbers, Generic, GenericLong,
            Geographic, Names. The caller is responsible for closing them.

        Raises:
            OSError: If any file cannot be opened; handles opened so far are
                closed before the error propagates.
        """
        file_names = (
            'StopWords_Auditor.txt',
            'StopWords_Currencies.txt',
            'StopWords_DatesandNumbers.txt',
            'StopWords_Generic.txt',
            'StopWords_GenericLong.txt',
            'StopWords_Geographic.txt',
            'StopWords_Names.txt',
        )
        handles = []
        try:
            for name in file_names:
                # os.path.join instead of a literal '\\' keeps this portable.
                handles.append(open(os.path.join(file_path, name), 'r', encoding='ISO-8859-1'))
        except OSError:
            for handle in handles:
                handle.close()
            raise
        return tuple(handles)

    def MasterDictionary_data(self, file_path=r'C:\Users\user\Downloads\MasterDictionary-20241019T100713Z-001\MasterDictionary'):
        """Read the positive and negative word lists from ``file_path``.

        Args:
            file_path: Directory containing ``positive-words.txt`` and
                ``negative-words.txt``.

        Returns:
            ``(positive_words, negative_words)`` as two lists of the
            whitespace-separated entries of each file.
        """
        # Negative Dictionary
        with open(os.path.join(file_path, 'negative-words.txt'), 'r', encoding='ISO-8859-1') as file_neg:
            neg_split = file_neg.read().split()

        # Positive Dictionary
        with open(os.path.join(file_path, 'positive-words.txt'), 'r', encoding='ISO-8859-1') as file_pos:
            pos_split = file_pos.read().split()

        return pos_split, neg_split

    def _stop_word_set(self):
        """Return the lower-cased stop words from all lists, read once per instance."""
        cached = getattr(self, '_stop_words_cache', None)
        if cached is None:
            cached = set()
            for handle in self.StopWords_data():
                with handle:  # close every list as soon as it has been read
                    for line in handle:
                        # NOTE(review): assumes any annotation on a line follows
                        # a '|' - confirm against the actual stop-word files.
                        cached.update(line.split('|', 1)[0].lower().split())
            self._stop_words_cache = cached
        return cached

    def text_corpus(self, x):
        """Lower-case, strip non-letters, tokenize, drop stop words and lemmatize ``x``.

        Args:
            x: Any object; it is converted with ``str`` before cleaning.

        Returns:
            List of lemmatized, lower-case tokens with stop words removed.
        """
        # Compare tokens against the words in the files. Testing membership
        # against the file objects themselves never matches a token.
        stop_words = self._stop_word_set()

        string_format = str(x).lower()
        lowerwords = re.sub('[^a-zA-Z]+', ' ', string_format).strip()
        token = word_tokenize(lowerwords)
        token_word = [t for t in token if t not in stop_words]
        return [Lem.lemmatize(w) for w in token_word]

    def count_syllables(self, word):
        """Estimate the syllables in ``word`` by counting runs of vowels.

        A trailing "es"/"ed" is treated as silent: when the word has more than
        one vowel run, one is discounted. Returning 0 for every such word would
        make words like "companies" count as having no syllables at all.

        Args:
            word: A single word; case is ignored.

        Returns:
            Number of vowel runs (``aeiou``), minus one for a silent
            "es"/"ed" ending.
        """
        vowels = 'aeiou'
        lowered = word.lower()
        count = 0
        pre_char_was_vowel = False

        for char in lowered:
            if char in vowels:
                if not pre_char_was_vowel:
                    count += 1
                pre_char_was_vowel = True
            else:
                pre_char_was_vowel = False

        # Only discount when another vowel run remains, so "bed" stays at 1.
        if count > 1 and lowered.endswith(('es', 'ed')):
            count -= 1
        return count

    def calculate_complexity_percentage(self, words, min_syllables=2):
        """Compute the share and number of complex words in ``words``.

        Args:
            words: Sequence of word tokens.
            min_syllables: Minimum ``count_syllables`` value for a word to be
                complex. NOTE(review): the Gunning fog index is usually defined
                with three-or-more-syllable words; the default keeps this
                notebook's existing threshold - confirm which is intended.

        Returns:
            ``(percentage_complex_words, no_of_complex_words)``; the percentage
            is 0 for an empty ``words``.
        """
        no_of_complex_words = sum(1 for word in words if self.count_syllables(word) >= min_syllables)
        total_words = len(words)
        percentage_complex_words = (no_of_complex_words / total_words) * 100 if total_words > 0 else 0
        return percentage_complex_words, no_of_complex_words

    def count_syllable_per_word(self, words):
        """Map each distinct word in ``words`` to its ``count_syllables`` value."""
        return {word: self.count_syllables(word) for word in words}

    def personal_pronoun_count(self, words_list):
        """Count occurrences of the pronouns I, we, my, ours, us in ``words_list``.

        Matching ignores case, so lower-cased tokens such as "i" are counted.
        The all-caps token "US" is skipped because it normally denotes the
        country rather than the pronoun.

        Args:
            words_list: Iterable of word tokens.

        Returns:
            Number of tokens that are one of the listed pronouns.
        """
        pronouns = {'i', 'we', 'my', 'ours', 'us'}
        return sum(1 for word in words_list if word != 'US' and word.lower() in pronouns)

    def avg_word_length(self, words):
        """Return the total number of characters across ``words``.

        Despite the name this is the sum, not the average: the caller divides
        by the word count itself, so the return value is left unchanged.
        """
        return sum(len(word) for word in words)

In [4]:
class Col_Structure:
    """Build the per-article metrics table and write it to CSV."""

    def col_structure_primary(self, data, output_path=r"C:\Users\user\Desktop\Programming Languages\My Projects\Black_Coffer_Text_Analysis\output_data.csv"):
        """Score every article in ``data`` and save the results.

        For each row the text in ``article_words`` is cleaned with
        ``Analysis.text_corpus`` and the sentiment, readability and length
        metrics listed in the output columns are computed from it.

        Args:
            data: DataFrame with columns ``URL_ID``, ``URL`` and
                ``article_words``.
            output_path: Destination of the CSV file written at the end.

        Returns:
            DataFrame with one row per article and one column per metric.
        """
        analysis = Analysis()  # Instantiate once, use throughout the loop

        # The dictionaries do not depend on the article, so read them once
        # instead of once per row; sets give constant-time membership tests.
        positive_dict, negative_dict = analysis.MasterDictionary_data()
        positive_words = set(positive_dict)
        negative_words = set(negative_dict)

        updated_list = []

        for i, j, column in zip(data["URL_ID"], data["URL"], data["article_words"]):
            # Return tokenized words
            preprocessed_word = analysis.text_corpus(column)
            word_count = len(preprocessed_word)

            positive_score = sum(1 for ps_word in preprocessed_word if ps_word in positive_words)
            negative_score = sum(1 for ns_word in preprocessed_word if ns_word in negative_words)

            # The small constant keeps both ratios defined when a count is 0.
            polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
            subjective_score = (positive_score + negative_score) / (word_count + 0.000001)

            # Sentence-based averages; an article without sentences scores 0
            # instead of raising ZeroDivisionError.
            total_sentence = len(nltk.tokenize.sent_tokenize(column))
            if total_sentence > 0:
                avg_sentence_length = round(word_count / total_sentence, 0)
                avg_no_of_words_per_sentence = round(len(column.split()) / total_sentence, 0)
            else:
                avg_sentence_length = 0.0
                avg_no_of_words_per_sentence = 0.0

            # Percentage of Complex Words and Complex Words Count
            percentage_of_ComplexWords, total_no_of_ComplexWords_count = analysis.calculate_complexity_percentage(preprocessed_word)

            # FOG Index
            FOG_index = 0.4 * (avg_sentence_length + percentage_of_ComplexWords)

            # Syllable per Word
            syllable_per_word = analysis.count_syllable_per_word(preprocessed_word)

            # Personal pronouns are counted on the raw words: the cleaned
            # tokens are lower-cased, which loses "I" and makes the country
            # "US" indistinguishable from the pronoun "us".
            raw_words = re.findall('[a-zA-Z]+', str(column))
            personal_pronouns = analysis.personal_pronoun_count(raw_words)

            # Average Word Length (avg_word_length returns the total length)
            word_length = analysis.avg_word_length(preprocessed_word)
            avg_word_length = round(word_length / word_count, 0) if word_count else 0.0

            updated_list.append({
                "URL_ID": i,
                "URL": j,
                "article_word": column,
                "Positive Score": positive_score,
                "Negative Score": negative_score,
                "Polarity Score": polarity_score,
                "Subjectivity Score": subjective_score,
                "Avg_Sentence_Length": avg_sentence_length,
                "Percentage_of_ComplexWords": percentage_of_ComplexWords,
                "FOG_index": FOG_index,
                "Avg_No_of_Words_per_Sentence": avg_no_of_words_per_sentence,
                "Complex_word_Count": total_no_of_ComplexWords_count,
                "Word_Count": word_count,
                "Syllable_Per_Word": syllable_per_word,
                "Personal_Pronouns": personal_pronouns,
                "Avg_Word_Length": avg_word_length
            })

        # Convert list of dictionaries to a DataFrame after the loop completes
        df = pd.DataFrame(updated_list)

        # Save DataFrame to CSV
        df.to_csv(output_path, index=False)

        return df

In [5]:
if __name__ == "__main__":
    # Driver: load the input sheet, scrape every URL, then score the articles.
    file_path = r'C:\Users\user\Downloads\input (1).xlsx'
    obj     = Data_ingestion(file_path)
    str_obj = Col_Structure()

    # primary() prints the error and returns None when the sheet cannot be
    # read; stop here instead of failing later with an opaque TypeError.
    obj1 = obj.primary()
    if obj1 is None:
        print('Input file could not be loaded; skipping scraping and analysis.')
    else:
        total_data, blank_list = obj.secondary()
        # secondary() yields None (load failure) or an empty frame (every URL
        # failed); neither has the columns the analysis step indexes.
        if total_data is None or total_data.empty:
            print('No article could be fetched; skipping analysis.')
        else:
            df = str_obj.col_structure_primary(total_data)