In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import string
from bs4 import BeautifulSoup

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [3]:
# Replace 'your_excel_file.xlsx' with the path to your Excel file
excel_file = 'C:\\Users\\creat\\OneDrive - Högskolan Dalarna\\thesis\\Updated_annual_data_with_merge-2.xlsx'

# Read the Excel file
df = pd.read_excel(excel_file)

In [4]:
def preprocess_text(text):
    # Remove HTML tags
    # Remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.get_text()

    # Tokenization
    tokens = nltk.word_tokenize(clean_text)

    # Convert to lowercase
    tokens = [token.lower() for token in tokens]

    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove numbers
    text = re.sub(r"\b\d+\b", "", text)

    pattern = r'[0-9]'
    # Match all digits in the string and replace them with an empty string
    new_string = re.sub(pattern, '', text)

    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

In [5]:
def calculate_boilerplate(text):
    # Corpus for TF-IDF vectorizer (just the given text)
    corpus = [text]
    
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words="english")
    
    # Fit the vectorizer on the corpus
    tfidf_vectorizer.fit(corpus)
    
    # Transform the text into TF-IDF representation
    text_tfidf = tfidf_vectorizer.transform(corpus)
    
    # Get feature names (words) from the vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Extract TF-IDF scores for the words in the text
    word_tfidf_scores = {feature_names[col]: text_tfidf[0, col] for col in text_tfidf.nonzero()[1]}
    
    # Calculate boilerplate words (words with low TF-IDF scores)
    boilerplate_words = [word for word, score in word_tfidf_scores.items() if score < 0.2]
    
    
    
    return {
        "boilerplate_words": boilerplate_words,
        #"redundant_words": redundant_words,
        
    }

In [6]:
def calculate_redundancy(text):
    # Corpus for TF-IDF vectorizer (just the given text)
    corpus = [text]
    
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(stop_words="english")
    
    # Fit the vectorizer on the corpus
    tfidf_vectorizer.fit(corpus)
    
    # Transform the text into TF-IDF representation
    text_tfidf = tfidf_vectorizer.transform(corpus)
    
    # Get feature names (words) from the vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Extract TF-IDF scores for the words in the text
    word_tfidf_scores = {feature_names[col]: text_tfidf[0, col] for col in text_tfidf.nonzero()[1]}
    
    
    # Calculate redundancy (words with high term frequency but low TF-IDF scores)
    term_frequency = Counter(preprocess_text(text))
    redundant_words = {word: count for word, count in term_frequency.items() if count > 1 and word_tfidf_scores.get(word, 0) < 0.2}
    
    
    return {
      
        "redundant_words": redundant_words,
        
    }

In [7]:
ic_dc_text1_column = df['text_disclosure']
boilerwordcount=[]
redundantwordcount=[]
for text in ic_dc_text1_column:
    preprocessed_text = preprocess_text(text)
     # Calculate boilerplate words for each disclosure
    results_boilerplate = calculate_boilerplate(preprocessed_text)
    results_redundantword=calculate_redundancy(preprocessed_text)
    # Count the number of boilerplate words and append to the list
    boilerwordcount.append(len(results_boilerplate["boilerplate_words"]))
    redundantwordcount.append(len(results_redundantword["redundant_words"]))
# Add the boilerplate word count to the DataFrame
df['boilerplate_word_count'] = boilerwordcount
df['redundant_word_count'] = redundantwordcount

In [9]:
df.head()

Unnamed: 0,disclosure_id,company_fkey,is_effective,is_material_weakness,date_of_disclosure,industry,text_disclosure,form_type,year_of_disclosure,no_of_disclosures_per_comp,Total word Count,Difficult Words,Sentence Count,Syllable Count,Gunning Fog Index,boilerplate_word_count,redundant_word_count
0,592010,1750,0,1,2019-05-31,Manufacturing,ITEM 9A. CONTROLS AND PROCEDURES<p>Evaluati...,10-K,2019,3,1491,310,16,3769,42.3,399,41
1,617564,1750,1,0,2020-05-31,Manufacturing,ITEM 9A.CONTROLS AND PROCEDURES<p>Evaluation o...,10-K,2020,3,436,148,11,1079,24.1,192,39
2,644490,1750,1,0,2021-05-31,Manufacturing,ITEM 9A.CONTROLS AND PROCEDURES<p>Evaluation o...,10-K,2021,3,328,117,10,831,21.9,142,36
3,606409,1800,1,0,2019-12-31,Life Sciences,ITEM 9A. CONTROLS AND PROCEDURES<p>Disclosure...,10-K,2019,3,163,59,5,407,22.86,83,30
4,631446,1800,1,0,2020-12-31,Life Sciences,ITEM 9A. CONTROLS AND PROCEDURES<p>Disclosure...,10-K,2020,3,281,91,9,708,20.74,126,35


In [8]:
# Save the updated DataFrame into an Excel file
output_excel_file = 'C:\\Users\\creat\\OneDrive - Högskolan Dalarna\\thesis\\Updated_annual_data_with_merge-2.xlsx'
df.to_excel(output_excel_file, index=False)