## Import Library

In [35]:
from bs4 import BeautifulSoup
import requests
import string
import re
import nltk
import textstat

## Text Analysis 


In [587]:
url="""https://insights.blackcoffer.com/telemedicine-what-patients-like-and-dislike-about-it//"""
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0"}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html >
<!--[if IE 8]>    <html class="ie8" lang="en"> <![endif]-->
<!--[if IE 9]>    <html class="ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!-->
<html lang="en-US">
 <!--<![endif]-->
 <head>
  <link href="https://fonts.googleapis.com/css?family=Open+Sans%3A300italic%2C400%2C400italic%2C600%2C600italic%2C700%7CRoboto%3A300%2C400%2C400italic%2C500%2C500italic%2C700%2C900" rel="stylesheet"/>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <link href="https://insights.blackcoffer.com/xmlrpc.php" rel="pingback"/>
  <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
   <meta content="https://insights.blackcoffer.com/wp-content/uploads/2021/01/03-1-760x428-1.jpg" property="og:image"/>
   <!-- This site is optimized with the Yoast SEO plugin v19.13 - https://yoast.com/wordpress/plugins/seo/ -->
   <title>
    telemedicine - What patients like and dislike about it | blac

## Extract title from article

In [588]:
title=soup.find('h1',class_="entry-title")
title.get_text()

'What patients like and dislike about telemedicine?'

## Extract content from article

In [589]:
cont=soup.findAll(class_='td-post-content')
cont=cont[0].text.replace('\n'," ")
cont[0:200]

' In today’s world, telemedicine technology is one of those technologies which has brought about a change. Compared to the early days there have been remarkable differences in the methods of consultati'

## Remove punctuation from the content

In [590]:
content = cont.translate(str.maketrans('', '', string.punctuation))
print(content)

 In today’s world telemedicine technology is one of those technologies which has brought about a change Compared to the early days there have been remarkable differences in the methods of consultation with a doctor In the years that have passed by consultation for a disease with a doctor was quite hectic It involved waiting traveling etc But with the advent of telemedicine opportunities this has completely changed It is a rural area that has been completely blessed with the invention of telemedicine Today a considerable amount of people are able to consult doctors remotely Not just doctors but specialists in various fields of medicine This has been of great importance as far as rural people are concerned There are a lot of telemedicine tools that have been found There are a lot of areas like ophthalmology oncology dermatology etc where the facility of telemedicine has been practiced Most of the patients are truly benefitting from telemedicine Patients are pretty satisfied with the cons

## Convert alphanumeric characters as tokens and drop everything else

### Import RegexpTokenizer

In [591]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
text_tokens=tokenizer.tokenize(content)
print(text_tokens[0:20])

['In', 'today', 's', 'world', 'telemedicine', 'technology', 'is', 'one', 'of', 'those', 'technologies', 'which', 'has', 'brought', 'about', 'a', 'change', 'Compared', 'to', 'the']


### Change all token to uppercase(for comparison)

In [592]:
text_tokens=[j.upper() for j in text_tokens]
print(text_tokens[0:20])

['IN', 'TODAY', 'S', 'WORLD', 'TELEMEDICINE', 'TECHNOLOGY', 'IS', 'ONE', 'OF', 'THOSE', 'TECHNOLOGIES', 'WHICH', 'HAS', 'BROUGHT', 'ABOUT', 'A', 'CHANGE', 'COMPARED', 'TO', 'THE']


### Length of tokens before removing stopwords

In [593]:
len(text_tokens)

390

## Remove stopwords from the tokens

### Import Stopwords(only english)

In [594]:
import nltk
from nltk.corpus import stopwords
stop_words=stopwords.words('english')
print(stop_words[0:20])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


### Change all stopwords to uppercase(for comparison)

In [595]:
stop_words=[j.upper() for j in stop_words]
print(stop_words[0:20])

['I', 'ME', 'MY', 'MYSELF', 'WE', 'OUR', 'OURS', 'OURSELVES', 'YOU', "YOU'RE", "YOU'VE", "YOU'LL", "YOU'D", 'YOUR', 'YOURS', 'YOURSELF', 'YOURSELVES', 'HE', 'HIM', 'HIS']


### Remove Stopwords

In [596]:
no_stop_tokens = [word for word in text_tokens if not word in stop_words]
print(no_stop_tokens[0:40])

['TODAY', 'WORLD', 'TELEMEDICINE', 'TECHNOLOGY', 'ONE', 'TECHNOLOGIES', 'BROUGHT', 'CHANGE', 'COMPARED', 'EARLY', 'DAYS', 'REMARKABLE', 'DIFFERENCES', 'METHODS', 'CONSULTATION', 'DOCTOR', 'YEARS', 'PASSED', 'CONSULTATION', 'DISEASE', 'DOCTOR', 'QUITE', 'HECTIC', 'INVOLVED', 'WAITING', 'TRAVELING', 'ETC', 'ADVENT', 'TELEMEDICINE', 'OPPORTUNITIES', 'COMPLETELY', 'CHANGED', 'RURAL', 'AREA', 'COMPLETELY', 'BLESSED', 'INVENTION', 'TELEMEDICINE', 'TODAY', 'CONSIDERABLE']


## Word Count( Total Words after cleaning(with duplicates))

In [597]:
word_after_clean=len(no_stop_tokens)
print(word_after_clean)

194


### Remove duplicates from token

In [598]:
result1 = []
[result1.append(m) for m in no_stop_tokens if m not in result1]
print(result1[0:40])

['TODAY', 'WORLD', 'TELEMEDICINE', 'TECHNOLOGY', 'ONE', 'TECHNOLOGIES', 'BROUGHT', 'CHANGE', 'COMPARED', 'EARLY', 'DAYS', 'REMARKABLE', 'DIFFERENCES', 'METHODS', 'CONSULTATION', 'DOCTOR', 'YEARS', 'PASSED', 'DISEASE', 'QUITE', 'HECTIC', 'INVOLVED', 'WAITING', 'TRAVELING', 'ETC', 'ADVENT', 'OPPORTUNITIES', 'COMPLETELY', 'CHANGED', 'RURAL', 'AREA', 'BLESSED', 'INVENTION', 'CONSIDERABLE', 'AMOUNT', 'PEOPLE', 'ABLE', 'CONSULT', 'DOCTORS', 'REMOTELY']


### Total Words after cleaning(without duplicates)

In [599]:
len(result1)

135

## Check for positive words

In [600]:
filename1 = 'positive-words.txt'
file1 = open(filename1, 'rt')
text1 = file1.read()
file1.close()
# split into words by white space
pword = text1.split()
print(pword[0:20])

['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation', 'accolade', 'accolades', 'accommodative', 'accomodative', 'accomplish', 'accomplished', 'accomplishment', 'accomplishments', 'accurate', 'accurately']


### Change all positive words to uppercase(for comparison)

In [601]:
pword=[j.upper() for j in pword]
print(pword[0:20])

['A+', 'ABOUND', 'ABOUNDS', 'ABUNDANCE', 'ABUNDANT', 'ACCESSABLE', 'ACCESSIBLE', 'ACCLAIM', 'ACCLAIMED', 'ACCLAMATION', 'ACCOLADE', 'ACCOLADES', 'ACCOMMODATIVE', 'ACCOMODATIVE', 'ACCOMPLISH', 'ACCOMPLISHED', 'ACCOMPLISHMENT', 'ACCOMPLISHMENTS', 'ACCURATE', 'ACCURATELY']


### Check positive score

In [602]:
p_score = []
[p_score.append(m) for m in result1 if m in pword]
print(p_score)

['REMARKABLE', 'GREAT', 'LIKE', 'PRETTY', 'SATISFIED', 'AFFORDABLE', 'EFFICIENT', 'QUALIFIED', 'BOOST', 'CONFIDENCE', 'BETTER', 'PATIENT', 'TRUST', 'TRUSTED', 'WORTHY']


In [603]:
positive_score=len(p_score)
print(positive_score)

15


## Check for negative words

In [604]:
filename2 = 'negative-words.txt'
file2 = open(filename2, 'rt')
text2 = file2.read()
file2.close()
# split into words by white space
nword = text2.split()
print(nword[0:20])

['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted', 'aborts', 'abrade', 'abrasive', 'abrupt', 'abruptly', 'abscond', 'absence', 'absent-minded', 'absentee', 'absurd']


### Change all negative words to uppercase(for comparison)

In [605]:
nword=[j.upper() for j in nword]
print(nword[0:20])

['2-FACED', '2-FACES', 'ABNORMAL', 'ABOLISH', 'ABOMINABLE', 'ABOMINABLY', 'ABOMINATE', 'ABOMINATION', 'ABORT', 'ABORTED', 'ABORTS', 'ABRADE', 'ABRASIVE', 'ABRUPT', 'ABRUPTLY', 'ABSCOND', 'ABSENCE', 'ABSENT-MINDED', 'ABSENTEE', 'ABSURD']


### Check negative score

In [606]:
n_score = []
[n_score.append(m) for m in result1 if m in nword]
print(n_score)

['HECTIC', 'CONCERNED', 'DOUBT', 'DISTRUST', 'GLITCHES', 'HINDER', 'EMERGENCY', 'LIMIT']


In [607]:
negative_score=len(n_score)
print(negative_score)

8


## Polarity Score

In [608]:
polarity_score = (positive_score-negative_score)/((positive_score+negative_score) + 0.000001)
print(polarity_score)

0.3043478128544429


## Subjectivity Score

In [609]:
subjectivity_score = (positive_score+negative_score)/((word_after_clean) + 0.000001)
print(subjectivity_score)

0.11855670041981083


### Total number of sentences

In [610]:
total_sentences=textstat.sentence_count(cont)
print(total_sentences)

28


## Average sentence length

In [611]:
Avg_sentence_length = (word_after_clean)/total_sentences
print('Average Sentence length =', Avg_sentence_length)

Average Sentence length = 6.928571428571429


## Complex word count

In [612]:
def syllable_count(word):
    count = 0
    vowels = "AEIOUYaeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)): 
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
            if word.endswith("es"or "ed"):
                count -= 1
    if count == 0:
        count += 1
    return count
complex_words=0
total_syllable=0
for i in range(0,len(text_tokens)):
    c=syllable_count(text_tokens[i])
    total_syllable=c+total_syllable
    if(c>2):
        complex_words+=1

print("total complex words=",complex_words)
print("total syllable=",total_syllable)

total complex words= 70
total syllable= 711


## Percentage of complex Words

In [613]:
Percentage_complex=(complex_words)/(word_after_clean)
print("percentage of complex words=",Percentage_complex)

percentage of complex words= 0.36082474226804123


## Fog Index

In [614]:
print("Fog index=",0.4 * (Avg_sentence_length + Percentage_complex ))

Fog index= 2.9157584683357882


## Average Number of Words Per Sentence

In [615]:
Avg_words_per_sentence = (word_after_clean)/total_sentences
print('Average number of words per sentence =', Avg_words_per_sentence)

Average number of words per sentence = 6.928571428571429


## Average Word Length

In [616]:
Average_word_length=len(content.replace(' ',''))/len(cont.split())
print("Average word length=",Average_word_length)

Average word length= 5.149484536082475


## Syllable Count Per Word

In [617]:
print("Syllable count per word=",total_syllable/len(cont.split()))

Syllable count per word= 1.8324742268041236


## Count Personal Pronouns

In [618]:
import re
pronoun='I|We|my|us|ours'

In [619]:
count_pronoun=len(re.findall(pronoun,cont, flags=re.IGNORECASE))
print(count_pronoun)

156
