In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import os
import string
import nltk
import re
from bs4 import BeautifulSoup
import requests

Download required packages

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [4]:
#To store scraped texts 
dir = "Extracted_Texts"
os.mkdir(dir)

In [5]:
#reading the urls from Input.xlsx which will be web scraped
df = pd.read_excel("/content/drive/MyDrive/Blackcoffer/Input.xlsx", dtype={'URL_ID': 'Int32'})
df.head(5)

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


Scraping texts from multiple urls and storing them in directory Extracted_Texts

In [6]:
for index, url in enumerate(list(df['URL'])):
  #to enable mod security access for scraping from websites
  headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
  #extract raw texts from urls
  res = requests.get(url,headers=headers)
  html_page = res.content
  soup = BeautifulSoup(html_page, 'html.parser')
  text = soup.find_all(text=True)
  output = ''
  #include only the required texts and exclude other contents
  blacklist = ['[document]','a','article','aside','body','div','footer','form','h1','head','header','html','li','meta','pre','script','span','style','time','title','ul']
  for t in text:
      if t.parent.name not in blacklist:
          output += '{} '.format(t)
  #saving the scraped texts in individual files
  file_path = dir + "/" + str(df['URL_ID'][index]) + ".txt"
  with open(file_path, 'w') as file:
    file.write(output)
    file.close()


# Sentimental Analysis

**Creating Stop Words corpus**

In [7]:
stop_word_files = [filename for filename in os.listdir('/content/drive/MyDrive/Blackcoffer') if filename.startswith("StopWords")]
stop_words_corpus = []
for file in stop_word_files:
  fpath = '/content/drive/MyDrive/Blackcoffer/'+ file
  with open(fpath, 'r', encoding='latin-1') as f:
    text = f.readlines()
    for t in text:
      stop_words_corpus.append(t.split(" ")[0])
    

with open("StopWords_corpus.txt",'w') as f:
  for word in stop_words_corpus:
    f.write(word)
    


**Cleaning using Stop Words Lists**

In [8]:
#Create directory Clean_text to store the cleaned texts
dir = "/content/drive/MyDrive/Blackcoffer/Clean_Text"
os.mkdir(dir)

In [9]:
#Cleaning the raw texts using the list of stop words mentioned
from nltk.tokenize import TweetTokenizer
tweet = TweetTokenizer()

for fpath in os.listdir("/content/Extracted_Texts"):
  with open("/content/Extracted_Texts/"+fpath, 'r', encoding = 'utf-8') as f:
    text = f.read()
  words = tweet.tokenize(text)
  words_new = [i for i in words if i not in stop_words_corpus]
  with open("/content/drive/MyDrive/Blackcoffer/Clean_Text/"+ fpath, 'w') as f:
    for i in words_new:
      f.write(i+" ")

IDs 44, 57 & 144 presently shows PageNotFound error.

In [10]:
error = [44,54,57,114,144]

**Extracting positive and negative words**

In [11]:
pos_words = []
neg_words = []
with open('/content/drive/MyDrive/Blackcoffer/MasterDictionary/positive-words.txt','r+',encoding='latin-1') as f:
  for i in f.readlines():
    pos_words.append(i.split('\n')[0])
   
with open('/content/drive/MyDrive/Blackcoffer/MasterDictionary/negative-words.txt','r+',encoding='latin-1') as f:
  for i in f.readlines():
    neg_words.append(i.split('\n')[0])


**Positive Score, Negative Score, Word Count**

In [12]:
positive_score = []
negative_score = []
total_words = []
for url_id in list(df['URL_ID']):
  if url_id in error:
    positive_score.append("NA")
    negative_score.append("NA")
    total_words.append("NA")
  else:
    pos = 0
    neg = 0
    with open("/content/drive/MyDrive/Blackcoffer/Clean_Text/"+str(url_id)+".txt", 'r', encoding = 'utf-8') as f:
      text = f.read()
    token=nltk.tokenize.word_tokenize(text)
    token_new = [i for i in token if i not in string.punctuation]
    total_words.append(len(token_new))
    for i in token_new:
      if i in pos_words:
        pos+=1
      if i in neg_words:
        neg-=1
    positive_score.append(pos)
    negative_score.append(neg)


**Polarity & Subjectivity Score**

In [13]:
polarity_score = ["NA" if positive_score[i]=="NA" else (positive_score[i] - negative_score[i])/(positive_score[i] + negative_score[i]) for i in range(len(list(df['URL_ID'])))]
subjectivity_score = ["NA" if positive_score[i]=="NA" else (positive_score[i] + negative_score[i])/(total_words[i]+0.000001) for i in range(len(list(df['URL_ID'])))]

# Analysis of Readability

**Average Number of Words Per Sentence**

In [14]:
num_sentences = []
avg_words_per_sent = []
for url_id in list(df['URL_ID']):
  if url_id in error:
    num_sentences.append("NA")
  else:
    with open("/content/drive/MyDrive/Blackcoffer/Clean_Text/"+str(url_id)+".txt", 'r', encoding = 'utf-8') as f:
      text = f.read()
    tokenized_sentences = nltk.sent_tokenize(text)
    num_sentences.append(len(tokenized_sentences))
avg_words_per_sent = ["NA" if total_words[i]=="NA" else int(total_words[i]/num_sentences[i]) for i in range(len(list(df['URL'])))]

**Average Word Length**

In [15]:
avg_word_len = []
for url_id in list(df['URL_ID']):
  if url_id in error:
    avg_word_len.append("NA")
  else:
    char_count = 0
    with open("/content/drive/MyDrive/Blackcoffer/Clean_Text/"+str(url_id)+".txt", 'r', encoding = 'utf-8') as f:
      text = f.read()
    token=nltk.tokenize.word_tokenize(text)
    token_new = [i for i in token if i not in string.punctuation]
    for i in token_new:
      char_count+= len(i)
    avg_word_len.append(int(char_count/len(token_new)))

**Average Sentence length**

In [16]:
avg_sent_len = []
for url_id in list(df['URL_ID']):
  if url_id in error:
    avg_sent_len.append("NA")
  else:
    sent_len = 0
    with open("/content/drive/MyDrive/Blackcoffer/Clean_Text/"+str(url_id)+".txt", 'r', encoding = 'utf-8') as f:
      text = f.read()
    tokenized_sentences = nltk.sent_tokenize(text)
    token=nltk.tokenize.word_tokenize(text)
    token_new = [i for i in token if i not in string.punctuation]
    avg_sent_len.append(int(len(token_new)/len(tokenized_sentences))) 

**Syllable Count Per Word**

In [17]:
from curses.ascii import isdigit
from nltk.corpus import cmudict

d = cmudict.dict()
def nsyl(word):
  lowercase = word.lower()
  if lowercase not in d:
     return 0
  else:
     return max([len([y for y in x if isdigit(y[-1])]) for x in d[lowercase]])
syllable_count = []
for url_id in list(df['URL_ID']):
  if url_id in error:
    syllable_count.append("NA")
  else:
    count=0
    with open("/content/drive/MyDrive/Blackcoffer/Clean_Text/"+str(url_id)+".txt", 'r', encoding = 'utf-8') as f:
      text = f.read()
    token = nltk.wordpunct_tokenize(text)
    token_new = [i for i in token if i not in string.punctuation]
    regexp = "[A-Za-z]+"
    exp = re.compile(regexp)

    for a in token_new:
      if exp.match(a):
          count+=nsyl(a)
    syllable_count.append(count)


**Percentage of Complex words**

In [18]:
complex_words_percentage = []
complex_words_count = []
for url_id in list(df['URL_ID']):
  if url_id in error:
    complex_words_percentage.append("NA")
    complex_words_count.append("NA")
  else:
    complex_count = 0
    with open("/content/drive/MyDrive/Blackcoffer/Clean_Text/"+str(url_id)+".txt", 'r', encoding = 'utf-8') as f:
      text = f.read()
    token = nltk.wordpunct_tokenize(text)
    token_new = [i for i in token if i not in string.punctuation]
    regexp = "[A-Za-z]+"
    exp = re.compile(regexp)
    for a in token_new:
      if exp.match(a):
        if nsyl(a)>2:
          complex_count+=1
    complex_words_count.append(complex_count)
    complex_words_percentage.append(np.round(complex_count*100/len(token_new),2))

**Personal Pronouns**

In [19]:
pp = ['i','we','my','ours','us']
pp_count = []
for url_id in list(df['URL_ID']):
  if url_id in error:
    pp_count.append("NA")
  else:
    count = 0
    with open("/content/drive/MyDrive/Blackcoffer/Clean_Text/"+str(url_id)+".txt", 'r', encoding = 'utf-8') as f:
      text = f.read()
    token = nltk.wordpunct_tokenize(text)
    token_new = [i for i in token if i not in string.punctuation]
    for i in token_new:
      if i!="US":
        if i.lower() in pp:
          count+=1
    pp_count.append(count)



 **Fog Index = 0.4 * (Average Sentence Length + Percentage of Complex words)**


In [20]:
fog_index = ["NA" if avg_sent_len[i]=="NA" else 0.4*(avg_sent_len[i]+complex_words_percentage[i]) for i in range(len(list(df['URL_ID'])))]

Finalizing the output scorecard

In [21]:
output = {'URL_ID':list(df['URL_ID']),
          'URL':list(df['URL']),
          'POSITIVE SCORE':positive_score,
          'NEGATIVE SCORE':negative_score,
          'POLARITY SCORE':polarity_score,
          'SUBJECTIVITY SCORE':subjectivity_score,
          'AVG SENTENCE LENGTH':avg_sent_len,
          'PERCENTAGE OF COMPLEX WORDS':complex_words_percentage,
          'FOG INDEX':fog_index,
          'AVG NUMBER OF WORDS PER SENTENCE':avg_words_per_sent,
          'COMPLEX WORD COUNT':complex_words_count,
          'WORD COUNT':total_words,
          'SYLLABLE PER WORD':syllable_count,
          'PERSONAL PRONOUNS':pp_count,
          'AVG WORD LENGTH':avg_word_len}
output_df = pd.DataFrame(output)
output_df.head(10)

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,76.0,-33.0,2.534884,0.023836,23.0,22.65,18.26,23.0,414.0,1804.0,3270.0,1.0,5.0
1,38,https://insights.blackcoffer.com/what-if-the-c...,65.0,-38.0,3.814815,0.018711,18.0,14.13,12.852,18.0,206.0,1443.0,2258.0,7.0,4.0
2,39,https://insights.blackcoffer.com/what-jobs-wil...,73.0,-35.0,2.842105,0.022209,20.0,21.37,16.548,20.0,367.0,1711.0,3037.0,3.0,5.0
3,40,https://insights.blackcoffer.com/will-machine-...,72.0,-25.0,2.06383,0.028416,17.0,15.37,12.948,17.0,255.0,1654.0,2652.0,17.0,4.0
4,41,https://insights.blackcoffer.com/will-ai-repla...,59.0,-25.0,2.470588,0.019155,21.0,17.65,15.46,21.0,317.0,1775.0,2881.0,16.0,4.0
5,42,https://insights.blackcoffer.com/man-and-machi...,55.0,-24.0,2.548387,0.023773,21.0,15.78,14.712,21.0,207.0,1304.0,2088.0,16.0,4.0
6,43,https://insights.blackcoffer.com/in-future-or-...,27.0,-12.0,2.6,0.020492,16.0,17.05,13.22,16.0,126.0,732.0,1212.0,7.0,5.0
7,44,https://insights.blackcoffer.com/how-neural-ne...,,,,,,,,,,,,,
8,45,https://insights.blackcoffer.com/how-machine-l...,43.0,-13.0,1.866667,0.040323,20.0,11.6,12.64,20.0,87.0,744.0,1096.0,2.0,4.0
9,46,https://insights.blackcoffer.com/deep-learning...,77.0,-40.0,3.162162,0.016772,27.0,17.27,17.708,27.0,386.0,2206.0,3584.0,7.0,4.0


In [22]:
#storing the output in excel sheet
output_df.to_excel("Output.xlsx")