In [6]:
import os
import glob

# Load the positive words from the master dictionary
master_dictionary_path = 'MasterDictionary'
positive_words = set()

with open(os.path.join(master_dictionary_path, 'positive-words.txt'), 'r') as f:
    for line in f:
        positive_words.add(line.strip())

# Function to compute positive score for a given text
def compute_positive_score(text):
    words = text.split()
    positive_score = sum(1 for word in words if word in positive_words)
    return positive_score

# Function to read the folder of extracted text files and compute positive scores for each file
def read_folder_and_compute_positive_scores(folder_path):
    for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            positive_score = compute_positive_score(text)
            file_name = os.path.basename(file_path)
            print(f"File: {file_name} - Positive Score: {positive_score}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_positive_scores(folder_path)


File: 10282.6.txt - Positive Score: 51
File: 10744.4.txt - Positive Score: 42
File: 11206.2.txt - Positive Score: 22
File: 11668.0.txt - Positive Score: 2
File: 12129.8.txt - Positive Score: 36
File: 123.0.txt - Positive Score: 74
File: 12591.6.txt - Positive Score: 70
File: 13053.4.txt - Positive Score: 52
File: 13515.2.txt - Positive Score: 41
File: 13977.0.txt - Positive Score: 30
File: 14438.8.txt - Positive Score: 63
File: 14900.6.txt - Positive Score: 54
File: 15362.4.txt - Positive Score: 22
File: 15824.2.txt - Positive Score: 103
File: 16286.0.txt - Positive Score: 24
File: 16747.8.txt - Positive Score: 14
File: 17209.6.txt - Positive Score: 4
File: 17671.4.txt - Positive Score: 2
File: 18133.2.txt - Positive Score: 35
File: 18595.0.txt - Positive Score: 21
File: 19056.8.txt - Positive Score: 65
File: 19518.6.txt - Positive Score: 45
File: 19980.4.txt - Positive Score: 8
File: 20442.2.txt - Positive Score: 23
File: 20904.0.txt - Positive Score: 14
File: 21365.8.txt - Positive S

In [7]:
import os
import glob

# Load the negative words from the master dictionary
master_dictionary_path = 'MasterDictionary'
negative_words = set()

with open(os.path.join(master_dictionary_path, 'negative-words.txt'), 'r') as f:
    for line in f:
        negative_words.add(line.strip())

# Function to compute negative score for a given text
def compute_negative_score(text):
    words = text.split()
    negative_score = -sum(1 for word in words if word in negative_words)
    return negative_score

# Function to read the folder of extracted text files and compute negative scores for each file
def read_folder_and_compute_negative_scores(folder_path):
    for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            negative_score = compute_negative_score(text)
            file_name = os.path.basename(file_path)
            print(f"File: {file_name} - Negative Score: {negative_score}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_negative_scores(folder_path)


File: 10282.6.txt - Negative Score: -21
File: 10744.4.txt - Negative Score: -18
File: 11206.2.txt - Negative Score: -11
File: 11668.0.txt - Negative Score: -2
File: 12129.8.txt - Negative Score: -11
File: 123.0.txt - Negative Score: -21
File: 12591.6.txt - Negative Score: -33
File: 13053.4.txt - Negative Score: -42
File: 13515.2.txt - Negative Score: -17
File: 13977.0.txt - Negative Score: -22
File: 14438.8.txt - Negative Score: -25
File: 14900.6.txt - Negative Score: -19
File: 15362.4.txt - Negative Score: -2
File: 15824.2.txt - Negative Score: -39
File: 16286.0.txt - Negative Score: -2
File: 16747.8.txt - Negative Score: -8
File: 17209.6.txt - Negative Score: -5
File: 17671.4.txt - Negative Score: -2
File: 18133.2.txt - Negative Score: -62
File: 18595.0.txt - Negative Score: -11
File: 19056.8.txt - Negative Score: -26
File: 19518.6.txt - Negative Score: -3
File: 19980.4.txt - Negative Score: -3
File: 20442.2.txt - Negative Score: -7
File: 20904.0.txt - Negative Score: -10
File: 21365

In [5]:
import os
import glob

# Load the positive and negative words from the master dictionary
master_dictionary_path = 'MasterDictionary'
positive_words = set()
negative_words = set()

with open(os.path.join(master_dictionary_path, 'positive-words.txt'), 'r') as f:
    for line in f:
        positive_words.add(line.strip())

with open(os.path.join(master_dictionary_path, 'negative-words.txt'), 'r') as f:
    for line in f:
        negative_words.add(line.strip())

# Function to compute positive and negative scores for a given text
def compute_scores_from_text(text):
    words = text.split()
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = -sum(1 for word in words if word in negative_words)
    return positive_score, negative_score

# Function to compute the polarity score with range limitation
def compute_polarity_score(positive_score, negative_score):
    raw_polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    polarity_score = max(-1, min(1, raw_polarity_score))  # Limiting the range to -1 to +1
    return polarity_score

# Function to read the folder of extracted text files, compute scores, and polarity scores for each file
def read_folder_compute_scores_and_polarity(folder_path):
    for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            positive_score, negative_score = compute_scores_from_text(text)
            polarity_score = compute_polarity_score(positive_score, negative_score)
            file_name = os.path.basename(file_path)
            print(f"File: {file_name} - Positive Score: {positive_score}, Negative Score: {negative_score}, Polarity Score: {polarity_score}")

# Replace 'folder_path' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_compute_scores_and_polarity(folder_path)



File: 10282.6.txt - Positive Score: 51, Negative Score: -21, Polarity Score: 1
File: 10744.4.txt - Positive Score: 42, Negative Score: -18, Polarity Score: 1
File: 11206.2.txt - Positive Score: 22, Negative Score: -11, Polarity Score: 1
File: 11668.0.txt - Positive Score: 2, Negative Score: -2, Polarity Score: 1
File: 12129.8.txt - Positive Score: 36, Negative Score: -11, Polarity Score: 1
File: 123.0.txt - Positive Score: 74, Negative Score: -21, Polarity Score: 1
File: 12591.6.txt - Positive Score: 70, Negative Score: -33, Polarity Score: 1
File: 13053.4.txt - Positive Score: 52, Negative Score: -42, Polarity Score: 1
File: 13515.2.txt - Positive Score: 41, Negative Score: -17, Polarity Score: 1
File: 13977.0.txt - Positive Score: 30, Negative Score: -22, Polarity Score: 1
File: 14438.8.txt - Positive Score: 63, Negative Score: -25, Polarity Score: 1
File: 14900.6.txt - Positive Score: 54, Negative Score: -19, Polarity Score: 1
File: 15362.4.txt - Positive Score: 22, Negative Score: 

In [4]:
import os
import glob
import re

# Load the positive and negative words from the master dictionary
master_dictionary_path = 'MasterDictionary'
positive_words = set()
negative_words = set()

with open(os.path.join(master_dictionary_path, 'positive-words.txt'), 'r') as f:
    for line in f:
        positive_words.add(line.strip())

with open(os.path.join(master_dictionary_path, 'negative-words.txt'), 'r') as f:
    for line in f:
        negative_words.add(line.strip())

# Function to compute positive and negative scores for a given text
def compute_scores_from_text(text):
    words = text.split()
    positive_score = sum(1 for word in words if word in positive_words)
    negative_score = -sum(1 for word in words if word in negative_words)
    return positive_score, negative_score

# Function to clean text and compute total words
def clean_text_and_count_words(text):
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = cleaned_text.split()
    total_words = len(words)
    return total_words

# Function to compute the subjectivity score with range limitation
def compute_subjectivity_score(positive_score, negative_score, total_words):
    subjectivity_score = min(1, (positive_score + negative_score) / (total_words + 0.000001))  # Limiting the range to 0 to +1
    return subjectivity_score

# Function to read the folder of extracted text files, compute scores, and subjectivity scores for each file
def read_folder_compute_scores_and_subjectivity(folder_path):
    for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            total_words = clean_text_and_count_words(text)
            positive_score, negative_score = compute_scores_from_text(text)
            subjectivity_score = compute_subjectivity_score(positive_score, negative_score, total_words)
            file_name = os.path.basename(file_path)
            print(f"File: {file_name} - Positive Score: {positive_score}, Negative Score: {negative_score}, Subjectivity Score: {subjectivity_score}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_compute_scores_and_subjectivity(folder_path)


File: 10282.6.txt - Positive Score: 51, Negative Score: -21, Subjectivity Score: 0.015948963308905387
File: 10744.4.txt - Positive Score: 42, Negative Score: -18, Subjectivity Score: 0.016961130730062803
File: 11206.2.txt - Positive Score: 22, Negative Score: -11, Subjectivity Score: 0.01199563793675503
File: 11668.0.txt - Positive Score: 2, Negative Score: -2, Subjectivity Score: 0.0
File: 12129.8.txt - Positive Score: 36, Negative Score: -11, Subjectivity Score: 0.028121484782765483
File: 123.0.txt - Positive Score: 74, Negative Score: -21, Subjectivity Score: 0.028726287247302824
File: 12591.6.txt - Positive Score: 70, Negative Score: -33, Subjectivity Score: 0.015832263579019144
File: 13053.4.txt - Positive Score: 52, Negative Score: -42, Subjectivity Score: 0.0049358341535361136
File: 13515.2.txt - Positive Score: 41, Negative Score: -17, Subjectivity Score: 0.01763409256602932
File: 13977.0.txt - Positive Score: 30, Negative Score: -22, Subjectivity Score: 0.005730659021682909
Fi

In [10]:
import os
import glob
import nltk.data
nltk.download('punkt')

# Function to compute the average sentence length for a given text
def compute_avg_sentence_length(text):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(text)
    words = text.split()
    num_sentences = len(sentences)
    num_words = len(words)
    if num_sentences > 0:
        avg_sentence_length = num_words / num_sentences
    else:
        avg_sentence_length = 0
    return avg_sentence_length

# Function to read the folder of extracted text files and compute the average sentence length for each file
def read_folder_and_compute_avg_sentence_length(folder_path):
    for file_path in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            avg_sentence_length = compute_avg_sentence_length(text)
            file_name = os.path.basename(file_path)
            print(f"File: {file_name} - Average Sentence Length: {avg_sentence_length}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_avg_sentence_length(folder_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


File: 10282.6.txt - Average Sentence Length: 23.25925925925926
File: 10744.4.txt - Average Sentence Length: 23.766666666666666
File: 11206.2.txt - Average Sentence Length: 18.775510204081634
File: 11668.0.txt - Average Sentence Length: 34.0
File: 12129.8.txt - Average Sentence Length: 22.3
File: 123.0.txt - Average Sentence Length: 22.0
File: 12591.6.txt - Average Sentence Length: 27.55294117647059
File: 13053.4.txt - Average Sentence Length: 21.585106382978722
File: 13515.2.txt - Average Sentence Length: 22.0
File: 13977.0.txt - Average Sentence Length: 16.48235294117647
File: 14438.8.txt - Average Sentence Length: 28.26153846153846
File: 14900.6.txt - Average Sentence Length: 20.270588235294117
File: 15362.4.txt - Average Sentence Length: 26.653846153846153
File: 15824.2.txt - Average Sentence Length: 153.15384615384616
File: 16286.0.txt - Average Sentence Length: 18.78
File: 16747.8.txt - Average Sentence Length: 19.88888888888889
File: 17209.6.txt - Average Sentence Length: 33.0
Fi

In [11]:
import os
import glob
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to compute the percentage of complex words for a given text
def compute_percentage_complex_words(text):
    words = nltk.word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    complex_word_count = sum(1 for word, tag in tagged_words if tag in ['JJ', 'JJR', 'JJS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS'])
    total_words = len(words)
    if total_words > 0:
        percentage_complex_words = (complex_word_count / total_words) * 100
    else:
        percentage_complex_words = 0
    return percentage_complex_words

# Function to read the folder of extracted text files and compute the percentage of complex words for each file
def read_folder_and_compute_percentage_complex_words(folder_path):
    results = []
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            percentage_complex_words = compute_percentage_complex_words(text)
            file_name = os.path.basename(file_path)
            results.append((file_name, percentage_complex_words))
    
    results.sort(key=lambda x: x[0])  # Sort results based on file names
    for result in results:
        print(f"File: {result[0]} - Percentage of Complex Words: {result[1]}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_percentage_complex_words(folder_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


File: 10282.6.txt - Percentage of Complex Words: 26.476014760147603
File: 10744.4.txt - Percentage of Complex Words: 27.432024169184288
File: 11206.2.txt - Percentage of Complex Words: 23.9292364990689
File: 11668.0.txt - Percentage of Complex Words: 11.284046692607005
File: 12129.8.txt - Percentage of Complex Words: 23.88781431334623
File: 123.0.txt - Percentage of Complex Words: 27.390677558865928
File: 12591.6.txt - Percentage of Complex Words: 28.4635711589279
File: 13053.4.txt - Percentage of Complex Words: 26.996351844345355
File: 13515.2.txt - Percentage of Complex Words: 25.620915032679736
File: 13977.0.txt - Percentage of Complex Words: 23.479318734793186
File: 14438.8.txt - Percentage of Complex Words: 28.40965041851305
File: 14900.6.txt - Percentage of Complex Words: 25.651302605210418
File: 15362.4.txt - Percentage of Complex Words: 23.19201995012469
File: 15824.2.txt - Percentage of Complex Words: 32.61176470588235
File: 16286.0.txt - Percentage of Complex Words: 21.915285

In [12]:
import os
import glob
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to compute the Fog Index for a given text
def compute_fog_index(average_sentence_length, percentage_complex_words):
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)
    return fog_index

# Function to read the folder of extracted text files and compute the Fog Index for each file
def read_folder_and_compute_fog_index(folder_path):
    results = []
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            average_sentence_length = compute_avg_sentence_length(text)
            percentage_complex_words = compute_percentage_complex_words(text)
            fog_index = compute_fog_index(average_sentence_length, percentage_complex_words)
            file_name = os.path.basename(file_path)
            results.append((file_name, fog_index))
    
    results.sort(key=lambda x: x[0])  # Sort results based on file names
    for result in results:
        print(f"File: {result[0]} - Fog Index: {result[1]}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_fog_index(folder_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


File: 10282.6.txt - Fog Index: 19.894109607762744
File: 10744.4.txt - Fog Index: 20.479476334340383
File: 11206.2.txt - Fog Index: 17.081898681260213
File: 11668.0.txt - Fog Index: 18.113618677042805
File: 12129.8.txt - Fog Index: 18.475125725338494
File: 123.0.txt - Fog Index: 19.756271023546375
File: 12591.6.txt - Fog Index: 22.406604934159397
File: 13053.4.txt - Fog Index: 19.43258329092963
File: 13515.2.txt - Fog Index: 19.048366013071895
File: 13977.0.txt - Fog Index: 15.984668670387862
File: 14438.8.txt - Fog Index: 22.668475552020606
File: 14900.6.txt - Fog Index: 18.368756336201812
File: 15362.4.txt - Fog Index: 19.938346441588337
File: 15824.2.txt - Fog Index: 74.30624434389141
File: 16286.0.txt - Fog Index: 16.27811418047882
File: 16747.8.txt - Fog Index: 16.895383635784782
File: 17209.6.txt - Fog Index: 20.44770642201835
File: 17671.4.txt - Fog Index: 18.113618677042805
File: 18133.2.txt - Fog Index: 20.962180579216355
File: 18595.0.txt - Fog Index: 20.05854884427691
File: 1

In [13]:
import os
import glob
import nltk.data
nltk.download('punkt')

# Function to compute the average number of words per sentence for a given text
def compute_avg_words_per_sentence(text):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenizer.tokenize(text)
    words = text.split()
    num_sentences = len(sentences)
    num_words = len(words)
    if num_sentences > 0:
        avg_words_per_sentence = num_words / num_sentences
    else:
        avg_words_per_sentence = 0
    return avg_words_per_sentence

# Function to read the folder of extracted text files and compute the average number of words per sentence for each file
def read_folder_and_compute_avg_words_per_sentence(folder_path):
    results = []
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            avg_words_per_sentence = compute_avg_words_per_sentence(text)
            file_name = os.path.basename(file_path)
            results.append((file_name, avg_words_per_sentence))

    results.sort(key=lambda x: x[0])  # Sort results based on file names
    for result in results:
        print(f"File: {result[0]} - Average Number of Words Per Sentence: {result[1]}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_avg_words_per_sentence(folder_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


File: 10282.6.txt - Average Number of Words Per Sentence: 23.25925925925926
File: 10744.4.txt - Average Number of Words Per Sentence: 23.766666666666666
File: 11206.2.txt - Average Number of Words Per Sentence: 18.775510204081634
File: 11668.0.txt - Average Number of Words Per Sentence: 34.0
File: 12129.8.txt - Average Number of Words Per Sentence: 22.3
File: 123.0.txt - Average Number of Words Per Sentence: 22.0
File: 12591.6.txt - Average Number of Words Per Sentence: 27.55294117647059
File: 13053.4.txt - Average Number of Words Per Sentence: 21.585106382978722
File: 13515.2.txt - Average Number of Words Per Sentence: 22.0
File: 13977.0.txt - Average Number of Words Per Sentence: 16.48235294117647
File: 14438.8.txt - Average Number of Words Per Sentence: 28.26153846153846
File: 14900.6.txt - Average Number of Words Per Sentence: 20.270588235294117
File: 15362.4.txt - Average Number of Words Per Sentence: 26.653846153846153
File: 15824.2.txt - Average Number of Words Per Sentence: 153

In [16]:
import os
import glob

# Function to calculate the number of syllables in a word based on its length
def count_syllables_simple(word):
    word = word.lower()
    if len(word) <= 3:
        return 1
    count = 0
    vowels = 'aeiouy'
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le') and len(word) > 2 and word[-3] not in vowels:
        count += 1
    return max(1, count)

# Function to compute the number of complex words in a given text
def compute_complex_words(text):
    words = text.split()
    complex_word_count = sum(1 for word in words if count_syllables_simple(word) > 2)
    return complex_word_count

# Function to read the folder of extracted text files and compute the number of complex words for each file
def read_folder_and_compute_complex_words(folder_path):
    results = []
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            complex_word_count = compute_complex_words(text)
            file_name = os.path.basename(file_path)
            results.append((file_name, complex_word_count))

    results.sort(key=lambda x: x[0])  # Sort results based on file names
    for result in results:
        print(f"File: {result[0]} - Number of Complex Words: {result[1]}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_complex_words(folder_path)


File: 10282.6.txt - Number of Complex Words: 384
File: 10744.4.txt - Number of Complex Words: 275
File: 11206.2.txt - Number of Complex Words: 213
File: 11668.0.txt - Number of Complex Words: 49
File: 12129.8.txt - Number of Complex Words: 156
File: 123.0.txt - Number of Complex Words: 442
File: 12591.6.txt - Number of Complex Words: 479
File: 13053.4.txt - Number of Complex Words: 430
File: 13515.2.txt - Number of Complex Words: 332
File: 13977.0.txt - Number of Complex Words: 291
File: 14438.8.txt - Number of Complex Words: 332
File: 14900.6.txt - Number of Complex Words: 365
File: 15362.4.txt - Number of Complex Words: 176
File: 15824.2.txt - Number of Complex Words: 327
File: 16286.0.txt - Number of Complex Words: 219
File: 16747.8.txt - Number of Complex Words: 240
File: 17209.6.txt - Number of Complex Words: 94
File: 17671.4.txt - Number of Complex Words: 49
File: 18133.2.txt - Number of Complex Words: 311
File: 18595.0.txt - Number of Complex Words: 188
File: 19056.8.txt - Numbe

In [19]:
import os
import glob
import nltk
import string
import io

# Function to load stop words from files in a directory
def load_stop_words(stopwords_folder_path):
    stop_words = set()
    for file_path in glob.glob(os.path.join(stopwords_folder_path, '*.txt')):
        with io.open(file_path, 'r', encoding="utf-8", errors='ignore') as file:
            for word in file:
                stop_words.add(word.strip())
    return stop_words

# Load the stop words from the folder
stop_words_folder_path = 'StopWords'  # Replace with the appropriate path to the stopwords folder
stop_words = load_stop_words(stop_words_folder_path)

# Function to clean text and count the words
def clean_text_and_count_words(text):
    cleaned_text = text.lower()  # Convert to lowercase
    cleaned_text = ''.join([char for char in cleaned_text if char not in string.punctuation])  # Remove punctuation
    words = cleaned_text.split()
    cleaned_words = [word for word in words if word not in stop_words]
    word_count = len(cleaned_words)
    return word_count

# Function to read the folder of extracted text files and compute the word count for each file
def read_folder_and_compute_word_count(folder_path):
    results = []
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        with io.open(file_path, 'r', encoding="utf-8", errors='ignore') as file:
            text = file.read()
            word_count = clean_text_and_count_words(text)
            file_name = os.path.basename(file_path)
            results.append((file_name, word_count))

    results.sort(key=lambda x: x[0])  # Sort results based on file names
    for result in results:
        print(f"File: {result[0]} - Word Count: {result[1]}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_word_count(folder_path)


File: 10282.6.txt - Word Count: 951
File: 10744.4.txt - Word Count: 700
File: 11206.2.txt - Word Count: 485
File: 11668.0.txt - Word Count: 141
File: 12129.8.txt - Word Count: 470
File: 123.0.txt - Word Count: 983
File: 12591.6.txt - Word Count: 1143
File: 13053.4.txt - Word Count: 1087
File: 13515.2.txt - Word Count: 732
File: 13977.0.txt - Word Count: 762
File: 14438.8.txt - Word Count: 813
File: 14900.6.txt - Word Count: 891
File: 15362.4.txt - Word Count: 403
File: 15824.2.txt - Word Count: 899
File: 16286.0.txt - Word Count: 500
File: 16747.8.txt - Word Count: 498
File: 17209.6.txt - Word Count: 212
File: 17671.4.txt - Word Count: 141
File: 18133.2.txt - Word Count: 816
File: 18595.0.txt - Word Count: 482
File: 19056.8.txt - Word Count: 932
File: 19518.6.txt - Word Count: 449
File: 19980.4.txt - Word Count: 327
File: 20442.2.txt - Word Count: 528
File: 20904.0.txt - Word Count: 389
File: 21365.8.txt - Word Count: 288
File: 21827.6.txt - Word Count: 220
File: 22289.4.txt - Word Cou

In [20]:
import os
import glob
import string

# Function to count the number of syllables in a word with exceptions for "es" and "ed"
def count_syllables_per_word(word):
    count = 0
    vowels = 'aeiouy'
    endings = ['es', 'ed']
    word = word.lower().strip(string.punctuation)
    if word.endswith(tuple(endings)):
        word = word[: -2]
    if len(word) <= 2:
        return 1
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    return max(1, count)

# Function to read the folder of extracted text files and compute the syllable count per word for each file
def read_folder_and_compute_syllable_count_per_word(folder_path):
    results = []
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            words = text.split()
            syllable_count_per_word = sum(count_syllables_per_word(word) for word in words)
            file_name = os.path.basename(file_path)
            results.append((file_name, syllable_count_per_word))

    results.sort(key=lambda x: x[0])  # Sort results based on file names
    for result in results:
        print(f"File: {result[0]} - Syllable Count Per Word: {result[1]}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_syllable_count_per_word(folder_path)


File: 10282.6.txt - Syllable Count Per Word: 3133
File: 10744.4.txt - Syllable Count Per Word: 2333
File: 11206.2.txt - Syllable Count Per Word: 1549
File: 11668.0.txt - Syllable Count Per Word: 382
File: 12129.8.txt - Syllable Count Per Word: 1410
File: 123.0.txt - Syllable Count Per Word: 3383
File: 12591.6.txt - Syllable Count Per Word: 3904
File: 13053.4.txt - Syllable Count Per Word: 3423
File: 13515.2.txt - Syllable Count Per Word: 2432
File: 13977.0.txt - Syllable Count Per Word: 2333
File: 14438.8.txt - Syllable Count Per Word: 2943
File: 14900.6.txt - Syllable Count Per Word: 2915
File: 15362.4.txt - Syllable Count Per Word: 1250
File: 15824.2.txt - Syllable Count Per Word: 3128
File: 16286.0.txt - Syllable Count Per Word: 1602
File: 16747.8.txt - Syllable Count Per Word: 1623
File: 17209.6.txt - Syllable Count Per Word: 675
File: 17671.4.txt - Syllable Count Per Word: 382
File: 18133.2.txt - Syllable Count Per Word: 2579
File: 18595.0.txt - Syllable Count Per Word: 1541
File:

In [21]:
import os
import glob
import re

# Function to compute the count of personal pronouns in the text
def compute_personal_pronouns(text):
    personal_pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, flags=re.IGNORECASE)
    count = len(personal_pronouns)
    return count

# Function to read the folder of extracted text files and compute the count of personal pronouns for each file
def read_folder_and_compute_personal_pronouns(folder_path):
    results = []
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            personal_pronouns_count = compute_personal_pronouns(text)
            file_name = os.path.basename(file_path)
            results.append((file_name, personal_pronouns_count))

    results.sort(key=lambda x: x[0])  # Sort results based on file names
    for result in results:
        print(f"File: {result[0]} - Personal Pronouns Count: {result[1]}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_personal_pronouns(folder_path)


File: 10282.6.txt - Personal Pronouns Count: 18
File: 10744.4.txt - Personal Pronouns Count: 23
File: 11206.2.txt - Personal Pronouns Count: 9
File: 11668.0.txt - Personal Pronouns Count: 2
File: 12129.8.txt - Personal Pronouns Count: 4
File: 123.0.txt - Personal Pronouns Count: 4
File: 12591.6.txt - Personal Pronouns Count: 13
File: 13053.4.txt - Personal Pronouns Count: 3
File: 13515.2.txt - Personal Pronouns Count: 12
File: 13977.0.txt - Personal Pronouns Count: 8
File: 14438.8.txt - Personal Pronouns Count: 34
File: 14900.6.txt - Personal Pronouns Count: 13
File: 15362.4.txt - Personal Pronouns Count: 2
File: 15824.2.txt - Personal Pronouns Count: 18
File: 16286.0.txt - Personal Pronouns Count: 2
File: 16747.8.txt - Personal Pronouns Count: 3
File: 17209.6.txt - Personal Pronouns Count: 4
File: 17671.4.txt - Personal Pronouns Count: 2
File: 18133.2.txt - Personal Pronouns Count: 4
File: 18595.0.txt - Personal Pronouns Count: 7
File: 19056.8.txt - Personal Pronouns Count: 12
File: 1

In [24]:
import os
import glob

# Function to compute the average word length in the text
def compute_average_word_length(text):
    words = text.split()
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    if total_words > 0:
        average_word_length = total_characters / total_words
    else:
        average_word_length = 0
    return average_word_length

# Function to read the folder of extracted text files and compute the average word length for each file
def read_folder_and_compute_average_word_length(folder_path):
    results = []
    for file_path in sorted(glob.glob(os.path.join(folder_path, '*.txt'))):
        with open(file_path, 'r', encoding="utf-8") as file:
            text = file.read()
            average_word_length = compute_average_word_length(text)
            file_name = os.path.basename(file_path)
            results.append((file_name, average_word_length))

    results.sort(key=lambda x: x[0])  # Sort results based on file names
    for result in results:
        print(f"File: {result[0]} - Average Word Length: {result[1]}")

# Replace 'path_to_your_folder' with the appropriate path to the folder containing the extracted text files
folder_path = 'ExtractedText URL'
read_folder_and_compute_average_word_length(folder_path)


File: 10282.6.txt - Average Word Length: 5.286624203821656
File: 10744.4.txt - Average Word Length: 5.222300140252455
File: 11206.2.txt - Average Word Length: 5.344565217391304
File: 11668.0.txt - Average Word Length: 5.799019607843137
File: 12129.8.txt - Average Word Length: 5.133408071748879
File: 123.0.txt - Average Word Length: 5.7229437229437226
File: 12591.6.txt - Average Word Length: 5.114432109308283
File: 13053.4.txt - Average Word Length: 5.387875800887136
File: 13515.2.txt - Average Word Length: 5.578445747800586
File: 13977.0.txt - Average Word Length: 5.269807280513919
File: 14438.8.txt - Average Word Length: 5.009798584648884
File: 14900.6.txt - Average Word Length: 5.32965757399884
File: 15362.4.txt - Average Word Length: 5.645021645021645
File: 15824.2.txt - Average Word Length: 4.9136112506278256
File: 16286.0.txt - Average Word Length: 5.3567625133120345
File: 16747.8.txt - Average Word Length: 5.64245810055866
File: 17209.6.txt - Average Word Length: 5.70523415977961