In [115]:
import nltk
from textblob import TextBlob
from nltk.corpus import cmudict
from nltk.tokenize import sent_tokenize, word_tokenize

# Load CMU pronouncing dictionary for syllable counting
d = cmudict.dict()

# Load text file with UTF-8 encoding
with open('httpsinsights.blackcoffer.comchallenges-and-opportunities-of-big-data-in-healthcare.txt', 'r', encoding='utf-8', errors='replace') as file:
    text = file.read()

# Tokenize text into sentences and words
sentences = sent_tokenize(text)
words = word_tokenize(text)

# Calculate metrics
positive_score = 0
negative_score = 0
polarity_score = 0
subjectivity_score = 0
avg_sentence_length = 0
percentage_complex_words = 0
fog_index = 0
avg_word_length = 0
syllables_per_word = 0
personal_pronouns = 0
complex_word_count = 0
word_count = len(words)

for sentence in sentences:
    # Calculate sentence-level metrics
    sentence_blob = TextBlob(sentence)
    positive_score += sentence_blob.sentiment.polarity if sentence_blob.sentiment.polarity > 0 else 0
    negative_score += abs(sentence_blob.sentiment.polarity) if sentence_blob.sentiment.polarity < 0 else 0
    polarity_score += sentence_blob.sentiment.polarity
    subjectivity_score += sentence_blob.sentiment.subjectivity
    words_in_sentence = word_tokenize(sentence)
    avg_sentence_length += len(words_in_sentence)
    complex_word_count += sum([1 for word in words_in_sentence if len(word) > 2 and d.get(word.lower()) and len(d.get(word.lower())[0]) > 2])

# Calculate document-level metrics
avg_sentence_length /= len(sentences)
percentage_complex_words = complex_word_count / word_count * 100
fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
avg_word_length = sum(len(word) for word in words) / word_count
syllables_per_word = sum([len(d.get(word.lower())[0]) for word in words if d.get(word.lower())]) / word_count
personal_pronouns = sum([1 for word in words if word.lower() in ['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']])

# Print results
print("Positive score: ", positive_score)
print("Negative score: ", negative_score)
print("Polarity score: ", polarity_score)
print("Subjectivity score: ", subjectivity_score)
print("Avg sentence length: ", avg_sentence_length)
print("Percentage of complex words: ", percentage_complex_words)
print("Fog index: ", fog_index)
print("Avg number of words per sentence: ", avg_sentence_length)
print("Complex word count: ", complex_word_count)
print("Word count: ", word_count)
print("Syllables per word: ", syllables_per_word)
print("Personal pronouns: ", personal_pronouns)
print("Avg word length: ", avg_word_length)


Positive score:  9.248870078334365
Negative score:  1.5699867724867724
Polarity score:  7.678883305847592
Subjectivity score:  24.60919655741084
Avg sentence length:  20.873015873015873
Percentage of complex words:  61.21673003802282
Fog index:  32.83589836441548
Avg number of words per sentence:  20.873015873015873
Complex word count:  805
Word count:  1315
Syllables per word:  3.7300380228136882
Personal pronouns:  11
Avg word length:  4.844866920152091
