# Data scraping

In [1]:
import nltk
import requests
from bs4 import BeautifulSoup

In [2]:
req = requests.get("https://insights.blackcoffer.com/what-is-the-chance-homo-sapiens-will-survive-for-the-next-500-years/")

In [3]:
soup = BeautifulSoup(req.content, "html.parser")
res = soup.title
paras = soup.find_all('p')

In [4]:
texts = " ".join([paragraph.text.strip() for paragraph in paras])
print(texts)

We’ve really done it this year. Like an insatiable glutton, the law of averages has come home to roost. We should’ve taken the hint when on the 1st of January, 66 people lost their lives in the Jakarta floods. What followed was like the highlights reel of a disaster movie franchise – a volcanic eruption in The Philippines, irrepressible bushfires in Australia, earthquakes in Russia, Iran, Turkey, India, and China. And speaking of China. 2020 has brought home the fragile mortality of the human race into sharp focus. As global Covid-19 deaths stoutly push past the grim 1 million marks, we have no choice but to question our place in the universe – are we the all-conquering masters of our domain, or mere tourists in a ruthlessly apathetic ecosystem? Is the human race on the ubiquitous three-part literary arc that defines every story, every life, every civilization – ascent, apex, and descent? Maybe when Michael Jackson unveiled his moonwalk in 1983, or when Barack Obama stepped into the Wh

In [5]:
len(texts)

8652

Removing Punctuation

In [6]:
import re
import string
texts = re.sub(r'[\'\“\”\()\%\,\-\'\’\?\ ]', ' ', texts)
texts[0:200]
len(texts)

8652

# Removing Stopwords

In [7]:
from nltk.tokenize import word_tokenize

In [8]:
token = word_tokenize(texts)
token

['We',
 've',
 'really',
 'done',
 'it',
 'this',
 'year',
 '.',
 'Like',
 'an',
 'insatiable',
 'glutton',
 'the',
 'law',
 'of',
 'averages',
 'has',
 'come',
 'home',
 'to',
 'roost',
 '.',
 'We',
 'should',
 've',
 'taken',
 'the',
 'hint',
 'when',
 'on',
 'the',
 '1st',
 'of',
 'January',
 '66',
 'people',
 'lost',
 'their',
 'lives',
 'in',
 'the',
 'Jakarta',
 'floods',
 '.',
 'What',
 'followed',
 'was',
 'like',
 'the',
 'highlights',
 'reel',
 'of',
 'a',
 'disaster',
 'movie',
 'franchise',
 '–',
 'a',
 'volcanic',
 'eruption',
 'in',
 'The',
 'Philippines',
 'irrepressible',
 'bushfires',
 'in',
 'Australia',
 'earthquakes',
 'in',
 'Russia',
 'Iran',
 'Turkey',
 'India',
 'and',
 'China',
 '.',
 'And',
 'speaking',
 'of',
 'China',
 '.',
 '2020',
 'has',
 'brought',
 'home',
 'the',
 'fragile',
 'mortality',
 'of',
 'the',
 'human',
 'race',
 'into',
 'sharp',
 'focus',
 '.',
 'As',
 'global',
 'Covid',
 '19',
 'deaths',
 'stoutly',
 'push',
 'past',
 'the',
 'grim',
 '1'

In [9]:
with open("D:\lab\Dataset\Stopwords_Blackcoffer.txt", 'r') as file:
    data = file.read()

In [10]:
data[:100]

'ERNST\nYOUNG\nDELOITTE\nTOUCHE\nKPMG\nPRICEWATERHOUSECOOPERS\nPRICEWATERHOUSE\nCOOPERS\nAFGHANI\nARIARY\nBAHT\n'

In [11]:
data = data.replace('\n', ',')
data[:100]

'ERNST,YOUNG,DELOITTE,TOUCHE,KPMG,PRICEWATERHOUSECOOPERS,PRICEWATERHOUSE,COOPERS,AFGHANI,ARIARY,BAHT,'

In [12]:
new_texts = " ".join([i for i in token if i not in data])
new_texts[:200]

'We year . Like insatiable glutton law averages home roost . We hint 1st January 66 people lost lives Jakarta floods . What highlights reel disaster movie franchise – volcanic eruption The Philippines '

In [13]:
print("length before cleaning: ", len(texts))
print("length after cleaning: ", len(new_texts))

length before cleaning:  8652
length after cleaning:  5472


# Negative words

In [14]:
with open("D:\\lab\\Dataset\\Negative_words.txt", 'r') as file:
    negative = file.read()

In [15]:
negative[:206]

'2-faced\n2-faces\nabnormal\nabolish\nabominable\nabominably\nabominate\nabomination\nabort\naborted\naborts\nabrade\nabrasive\nabrupt\nabruptly\nabscond\nabsence\nabsent-minded\nabsentee\nabsurd\nabsurdity\nabsurdly\nabsurdness\n'

In [16]:
negative = negative.replace('\n', ',')
negative[:206]

'2-faced,2-faces,abnormal,abolish,abominable,abominably,abominate,abomination,abort,aborted,aborts,abrade,abrasive,abrupt,abruptly,abscond,absence,absent-minded,absentee,absurd,absurdity,absurdly,absurdness,'

In [17]:
new_texts = word_tokenize(new_texts)
negative_words = [i for i in new_texts if i in negative]
print(negative_words)
len(negative_words)

['insatiable', 'law', 'hint', 'lost', 'disaster', 'eruption', 'irrepressible', 'fragile', 'human', 'race', 'sharp', 'grim', 'question', 'ruthlessly', 'apathetic', 'human', 'race', 'arc', 'life', 'steady', 'tip', 'rates', 'grave', 'pertinent', 'catastrophe', 'threat', 'check', 'check', 'back', 'wall', 'make', 'disruptive', 'bio', 'human', 'doubt', 'critical', 'human', 'pin', 'nature', 'destructive', 'throws', 'developed', 'weakening', 'advantage', 'back', 'length', 'front', 'scary', 'stuff', 'left', 'bio', 'lead', 'agile', 'human', 'small', 'brain', 'hear', 'reaction', 'rest', 'blood', 'person', 'determined', 'run', 'feverish', 'key', 'manipulation', 'left', 'adjust', 'water', 'air', 'human', 'led', 'earth', 'believer', 'concept', 'owns', 'make', 'small', 'inevitably', 'concerted', 'factor', 'inevitable', 'factor', 'eruption', 'toll', 'human', 'concept', 'fiction', 'station', 'ago', 'flag', 'peak', 'reasonable', 'urge', 'earth', 'forgiving', 'list', 'struck', 'surge', 'social', 'human',

117

# Positive words

In [18]:
with open("D:\\lab\\Dataset\\Positive_words.txt", 'r') as file:
    positive = file.read()
positive[:210]    

'a+\nabound\nabounds\nabundance\nabundant\naccessable\naccessible\nacclaim\nacclaimed\nacclamation\naccolade\naccolades\naccommodative\naccomodative\naccomplish\naccomplished\naccomplishment\naccomplishments\naccurate\naccurately\n'

In [19]:
positive = positive.replace('\n',',')
positive[:210]

'a+,abound,abounds,abundance,abundant,accessable,accessible,acclaim,acclaimed,acclamation,accolade,accolades,accommodative,accomodative,accomplish,accomplished,accomplishment,accomplishments,accurate,accurately,'

In [20]:
positive_words = [i for i in new_texts if i in positive]
print(positive_words)
len(positive_words)

['law', 'human', 'race', 'sharp', 'marks', 'question', 'masters', 'human', 'race', 'life', 'steady', 'world', 'dropping', 'optimistic', 'famed', 'warming', 'back', 'advent', 'intelligence', 'enhancement', 'good', 'human', 'doubt', 'human', 'survival', 'pin', 'enterprising', 'advantage', 'survival', 'gift', 'back', 'world', 'smart', 'thrilling', 'intelligence', 'fable', 'enhancement', 'lead', 'evolution', 'smarter', 'faster', 'agile', 'adapt', 'promises', 'world', 'human', 'brain', 'intelligence', 'enhanced', 'hear', 'rest', 'person', 'run', 'faster', 'future', 'breakthrough', 'adjust', 'air', 'human', 'led', 'simple', 'fervent', 'world', 'cheaper', 'tantalizing', 'factor', 'factor', 'warming', 'toll', 'feasible', 'human', 'straight', 'reasonable', 'survival', 'list', 'struck', 'adapt', 'world', 'world', 'human', 'race', 'world', 'future', 'rest', 'vision', 'future', 'smart', 'product', 'catch', 'fast', 'held', 'back', 'gene', 'luxury', 'human', 'spirit', 'thrive', 'human', 'race', 'dou

105

#### positive Score


In [21]:
pos_score = len(positive_words)
print(pos_score)

105


#### Negative Score

In [22]:
neg_score=len(negative_words)
print(neg_score)

117


#### Polarity Score

In [23]:
Polarity_Score = (pos_score - neg_score)/((pos_score + neg_score) + 0.000001)
round(Polarity_Score,2)

-0.05

In [24]:
word_count = len(new_texts)
print("number of words after cleaning :",word_count)

number of words after cleaning : 790


#### Subjectivity Score

In [25]:
Subjectivity_Score = (pos_score + neg_score)/ ((word_count) + 0.000001)
round(Subjectivity_Score,2)

0.28

# Analysis of Readability

In [26]:
import nltk
from nltk.tokenize import word_tokenize
word_tokens = nltk.word_tokenize(texts)
No_of_words = len(word_tokens)
No_of_words

1542

In [27]:
sent_tokens = nltk.sent_tokenize(texts)
No_of_sents = len(sent_tokens)
No_of_sents 

67

### Average sentence Length


In [28]:
Avg_Sents_Length = No_of_words / No_of_sents
round(Avg_Sents_Length,2)

23.01

### Percentage of Complex words
Complex words: words with more than 2 syllable are called complex words

In [29]:
from nltk.corpus import cmudict
cmud = cmudict.dict()

# Defining a function to count syllables in a word
def count_syllables(word):
    phonemes = cmud[word.lower()][0] 
    return len([s for s in phonemes if s[-1].isdigit()])

# Identifing complex words
w_tokens = [i for i in word_tokens if i in cmud]
complex_words = [word for word in w_tokens if count_syllables(word) > 2]

# Calculatingnumber of complex words
num_complex_words = len(complex_words)

print("Number of complex words:", num_complex_words)
print("Total number of words:", len(texts))

Number of complex words: 195
Total number of words: 8652


In [30]:
Perc_of_Complex_words = len(texts) / num_complex_words
round(Perc_of_Complex_words,2)

44.37

### Fog Index

In [31]:
# Tokenize each sentence into words
words = [word_tokens for sentence in sent_tokens]

# Calculate the average sentence length
avg_sent_len = sum(No_of_sents for sentence in words) / No_of_sents

print("Average sentence length:", avg_sent_len)


Average sentence length: 67.0


In [32]:
Fog_index = 0.4 * (avg_sent_len + Perc_of_Complex_words)
round(Fog_index,2)

44.55

# Average Number of Words Per Sentence

In [33]:
avg_no_of_words_per_sent = No_of_words / No_of_sents
round(avg_no_of_words_per_sent,2)

23.01

# Complex Word Count

In [34]:
from nltk.corpus import cmudict
cmud = cmudict.dict()

# Defining a function to count syllables in a word
def count_syllables(word):
    phonemes = cmud[word.lower()][0] 
    return len([s for s in phonemes if s[-1].isdigit()])

# Identifing complex words
w_tokens = [i for i in word_tokens if i in cmud]
complex_words = [word for word in w_tokens if count_syllables(word) > 2]

# Calculatingnumber of complex words
num_complex_words = len(complex_words)

print("complex word count:", num_complex_words)

complex word count: 195


# Word Count
number of words after cleaning

In [35]:
len(new_texts)

790

# Syllable Count Per Word

In [36]:
word = input("Enter a word to get syllable: ")
def count_syllables(word):
    phonemes = cmud[word.lower()][0] 
    return len([s for s in phonemes if s[-1].isdigit()])

print("Number of syllable in a word :",count_syllables(word))

Enter a word to get syllable: question
Number of syllable in a word : 2


# Personal Pronouns

In [37]:
import re

# Define a regex pattern to match personal pronouns
pattern = r'\b(I|you|he|she|it|we|they|me|him|her|us|them)\b'

# Count the frequency of personal pronouns
pronoun_freq = {}
for pronoun in re.findall(pattern, texts, re.IGNORECASE):
    pronoun = pronoun.lower()
    if pronoun in pronoun_freq:
        pronoun_freq[pronoun] += 1
    else:
        pronoun_freq[pronoun] = 1

print("Personal pronoun frequency:", pronoun_freq)
def returnSum(dict):
 
    sum = 0
    for i in pronoun_freq.values():
        sum = sum + i
 
    return sum
print("total number of pronouns in a article :",returnSum(pronoun_freq))

Personal pronoun frequency: {'we': 13, 'it': 8, 'they': 4, 'us': 5, 'them': 1}
total number of pronouns in a article : 31


# Average Word Length

In [38]:
# Calculate the total number of characters in all words
total_chars = sum(len(word) for word in word_tokens)

avg_word_length = total_chars / No_of_words

print("Average word length:", round(avg_word_length,2))


Average word length: 4.58
