## Load the folder from drive

In [55]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [56]:
!ls

 articles	    Objective.docx		  StopWords
 Input.xlsx	   'Output Data Structure.xlsx'  'Text Analysis.docx'
 MasterDictionary   output.xlsx


In [2]:
cd drive/MyDrive/Test_completion/Black_Coffer

/content/drive/MyDrive/Test_completion/Black_Coffer


## Load data into structures needful for scrapping

In [57]:
import pandas as pd
dataset = pd.read_excel('Input.xlsx')
mylist = dataset['URL'].tolist()

['https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/', 'https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/', 'https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/', 'https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/', 'https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/', 'https://insights.blackcoffer.com/man-and-machines-together-machines-are-more-diligent-than-humans-blackcoffe/', 'https://insights.blackcoffer.com/in-future-or-in-upcoming-years-humans-and-machines-are-going-to-work-together-in-every-field-of-work/', 'https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/', 'https://insights.blackcoffer.com/how-machine-learning-will-affect-your-business/', 'https://insights.blackcoffer.com/deep-learning-impact-on-areas-of-e-learning/', 'https://insights.blackcoffer.com/how-to-protect-future-data-

In [None]:
url_ids = dataset["URL_ID"].tolist()

In [58]:
dataset

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...
...,...,...
109,146,https://insights.blackcoffer.com/blockchain-fo...
110,147,https://insights.blackcoffer.com/the-future-of...
111,148,https://insights.blackcoffer.com/big-data-anal...
112,149,https://insights.blackcoffer.com/business-anal...


## Scarpping data

In [60]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

In [12]:
# Scrapes transcript data
def url_to_transcript(url):
    try:
      page = requests.get(url).text
      soup = BeautifulSoup(page, "html.parser")
      doc = [p.text for p in soup.find(class_="td-post-content").find_all('p')]
      header = soup.find("h1").text
      doc.insert(0,f"{header}")
      print(url)
      return doc
    except :
      return "NA"

In [61]:
# URLs of transcripts in scope
urls = mylist


In [13]:
transcripts = [url_to_transcript(u) for u in urls]

https://insights.blackcoffer.com/ai-in-healthcare-to-improve-patient-outcomes/
https://insights.blackcoffer.com/what-if-the-creation-is-taking-over-the-creator/
https://insights.blackcoffer.com/what-jobs-will-robots-take-from-humans-in-the-future/
https://insights.blackcoffer.com/will-machine-replace-the-human-in-the-future-of-work/
https://insights.blackcoffer.com/will-ai-replace-us-or-work-with-us/
https://insights.blackcoffer.com/man-and-machines-together-machines-are-more-diligent-than-humans-blackcoffe/
https://insights.blackcoffer.com/in-future-or-in-upcoming-years-humans-and-machines-are-going-to-work-together-in-every-field-of-work/
https://insights.blackcoffer.com/how-machine-learning-will-affect-your-business/
https://insights.blackcoffer.com/deep-learning-impact-on-areas-of-e-learning/
https://insights.blackcoffer.com/how-to-protect-future-data-and-its-privacy-blackcoffer/
https://insights.blackcoffer.com/how-machines-ai-automations-and-robo-human-are-effective-in-finance-an

In [81]:
!mkdir articles

In [82]:
!ls

 articles	    Objective.docx		 'Text Analysis.docx'
 Input.xlsx	   'Output Data Structure.xlsx'
 MasterDictionary   StopWords


## Store data into pickel file and use that for future retrieval

In [14]:
for i, c in enumerate(url_ids):
    with open("articles/" + str(c) + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

In [62]:
# Load pickled files
data = {}
for i, c in enumerate(url_ids):
    with open("articles/" + str(c) + ".txt", "rb") as file:
        data[c] = pickle.load(file)

## Filter out all the 'NA' values

In [13]:
data.keys()

dict_keys([37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150])

In [91]:
data[44]

'NA'

In [63]:
def filterOutEmpty(kval):
  filterOut = 'NA'
  key, value = kval
  if(value==filterOut):
    return False
  else:
    return True


In [64]:
mainData = dict(filter(filterOutEmpty, data.items()))

In [14]:
mainData.keys()

dict_keys([37, 38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 145, 146, 147, 148, 149, 150])

In [89]:
mainData[37]

['AI in healthcare to Improve Patient Outcomes',
 'Introduction',
 '“If anything kills over 10 million people in the next few decades, it will be a highly infectious virus rather than a war. Not missiles but microbes.” Bill Gates’s remarks at a TED conference in 2014, right after the world had avoided the Ebola outbreak. When the new, unprecedented, invisible virus hit us, it met an overwhelmed and unprepared healthcare system and oblivious population. This public health emergency demonstrated our lack of scientific consideration and underlined the alarming need for robust innovations in our health and medical facilities. For the past few years, artificial intelligence has proven to be of tangible potential in the healthcare sectors, clinical practices, translational medical and biomedical research.',
 'After the first case was detected in China on December 31st 2019, it was an AI program developed by BlueDot that alerted the world about the pandemic. It was quick to realise AI’s abili

## Cleaning the text data

In [65]:
import nltk
from nltk.tokenize import sent_tokenize

In [66]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Tokenize sentences (as we will remove punctuations later on)

In [67]:
for key, value_list in mainData.items():
    new_list = []
    for value in value_list:
        sentences = sent_tokenize(value)
        new_list.extend(sentences)
    mainData[key] = new_list

In [93]:
mainData[37]

['AI in healthcare to Improve Patient Outcomes',
 'Introduction',
 '“If anything kills over 10 million people in the next few decades, it will be a highly infectious virus rather than a war.',
 'Not missiles but microbes.” Bill Gates’s remarks at a TED conference in 2014, right after the world had avoided the Ebola outbreak.',
 'When the new, unprecedented, invisible virus hit us, it met an overwhelmed and unprepared healthcare system and oblivious population.',
 'This public health emergency demonstrated our lack of scientific consideration and underlined the alarming need for robust innovations in our health and medical facilities.',
 'For the past few years, artificial intelligence has proven to be of tangible potential in the healthcare sectors, clinical practices, translational medical and biomedical research.',
 'After the first case was detected in China on December 31st 2019, it was an AI program developed by BlueDot that alerted the world about the pandemic.',
 'It was quick t

### Remove punctuations and create personal pronoun list (as next step is stop word removal which will also eliminate pronouns)

In [68]:
import string

In [69]:
custom_punctuations = ['“','.','”',"?",]
punctuation_marks = set(custom_punctuations)

In [70]:
personal_pronouns = ["i", "we", "my", "ours", "us"]

In [71]:
personal_pronouns_count_dict = {}

In [72]:
for key, value_list in mainData.items():
  personal_pronouns_count = 0
  clean_list = []
  for value in value_list:
    words = value.split()
    cleaned_words = []
    for word in words:
      clean_word = ''.join(char for char in word if char not in punctuation_marks)
      if clean_word == "US":
        cleaned_words.append(word)
      else:
        cleaned_words.append(clean_word.lower())

      if clean_word.lower() in personal_pronouns:
          personal_pronouns_count +=1

    sentence = ' '.join(cleaned_words)     
    clean_list.append(sentence)
  
  personal_pronouns_count_dict[key] = personal_pronouns_count
  mainData[key] = clean_list
    

In [16]:
mainData[37][0:10]

['ai in healthcare to improve patient outcomes',
 'introduction',
 'if anything kills over 10 million people in the next few decades, it will be a highly infectious virus rather than a war',
 'not missiles but microbes bill gates’s remarks at a ted conference in 2014, right after the world had avoided the ebola outbreak',
 'when the new, unprecedented, invisible virus hit us, it met an overwhelmed and unprepared healthcare system and oblivious population',
 'this public health emergency demonstrated our lack of scientific consideration and underlined the alarming need for robust innovations in our health and medical facilities',
 'for the past few years, artificial intelligence has proven to be of tangible potential in the healthcare sectors, clinical practices, translational medical and biomedical research',
 'after the first case was detected in china on december 31st 2019, it was an ai program developed by bluedot that alerted the world about the pandemic',
 'it was quick to realise

### Removal of stop words

In [73]:
import os
stop_words_dir = 'StopWords'

In [75]:
stop_words = []

for filename in os.listdir(stop_words_dir):
  if filename.endswith('.txt'):
    file_path = os.path.join(stop_words_dir, filename)
    with open(file_path, 'r', encoding='latin-1') as f:
      stop_words += [word.strip().lower() for word in f.readlines()]

In [100]:
stop_words[0:10]

['ernst',
 'young',
 'deloitte',
 'touche',
 'kpmg',
 'pricewaterhousecoopers',
 'pricewaterhouse',
 'coopers',
 'afghani  | afghanistan',
 'ariary | madagascar']

In [76]:
for key, value_list in mainData.items():
  cleaned_list = []
  for value in value_list:
    words = value.split()
    filtered_word = [word for word in words if word not in stop_words]
    sentence = ' '.join(filtered_word)
    cleaned_list.append(sentence)
  mainData[key] = cleaned_list

In [102]:
mainData[37]

['healthcare improve patient outcomes',
 'introduction',
 'kills 10 people decades, highly infectious virus war',
 'missiles microbes gates’s remarks conference 2014, world avoided ebola outbreak',
 'new, unprecedented, invisible virus hit us, met overwhelmed unprepared healthcare system oblivious population',
 'public health emergency demonstrated lack scientific consideration underlined alarming robust innovations health medical facilities',
 'past years, artificial intelligence proven tangible potential healthcare sectors, clinical practices, translational medical biomedical research',
 'detected 31st 2019, program developed bluedot alerted world pandemic',
 'realise ai’s ability analyse chunks data detecting patterns identifying tracking carriers virus',
 'tracing apps tabs people infected prevent risk cross-infection algorithms track patterns extract features classify categorise',
 '',
 'ibm watson, sophisticated works computing natural language processing, prominently contributed

## Preparing for computing attributes of each article

In [77]:
import nltk
from nltk.stem import WordNetLemmatizer
import syllables

In [78]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
!pip install syllables

In [79]:
positive_words = []
with open('MasterDictionary/positive-words.txt', 'r') as file:    
    for line in file:
        word = line.strip()
        positive_words.append(word)

In [107]:
positive_words[0:5]

['a+', 'abound', 'abounds', 'abundance', 'abundant']

In [80]:
negative_words = []
with open('MasterDictionary/negative-words.txt', 'r', encoding='latin-1') as file:    
    for line in file:
        word = line.strip()
        negative_words.append(word)

In [109]:
negative_words[0:5]

['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable']

In [82]:

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to normalize a word by removing apostrophes and getting its singular form
def normalize_word(word):
    word = word.replace("'", "")
    return lemmatizer.lemmatize(word.lower())

# Function to count syllables in a word while handling exceptions
def count_syllables(word):
    if word.endswith("es") or word.endswith("ed"):
        word = word[:-2]  # Remove "es" or "ed" from the end of the word
    return syllables.estimate(word)

In [83]:
positive_scores_dict = {}
negative_scores_dict = {}
word_count_dict = {}
polarity_scores_dict = {}
subjectivity_score_dict = {}
avg_sentence_length_dict = {}
avg_words_per_sentence_dict = {}
avg_word_length_dict = {}
syllable_count_dict = {}
complex_word_count_dict = {}
percentage_of_complex_words_dict = {}
fog_index_dict = {}


In [84]:
def check_range(number, min, max):
    if number >= min and number <= max:
        return number
    elif number < min:
        return min
    else:
        return max

## Computing the final attributes for each article

In [91]:
for key, lst in mainData.items():
    positive_score = 0
    negative_score = 0
    word_count = 0
    sentence_count = len(lst)
    total_word_length = 0
    personal_pronouns_count = 0
    total_syllable_count = 0
    complex_word_count = 0

    for sentence in lst:
      for word in sentence.split():
        normalized_word = normalize_word(word)
        if normalized_word in positive_words:
          positive_score += 1
        elif normalized_word in negative_words:
          negative_score += -1

        word_count+=1
        total_word_length += len(word)
        total_syllable_count += count_syllables(normalized_word)

        if total_syllable_count > 2:
          complex_word_count += 1
          

    positive_scores_dict[key] = positive_score
    negative_scores_dict[key] = negative_score

    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    polarity_score = check_range(polarity_score, -1, 1)
    polarity_scores_dict[key] = polarity_score

    subjectivity_score = (positive_score - negative_score)/(word_count + 0.000001)
    subjectivity_score = check_range(subjectivity_score, 0, 1)
    subjectivity_score_dict[key] = subjectivity_score

    avg_sentence_len = word_count/sentence_count
    avg_sentence_length_dict[key] = avg_sentence_len

    percentage_complex_word = complex_word_count / word_count
    percentage_of_complex_words_dict[key] = percentage_complex_word

    fog_index_dict[key] = 0.4 * (avg_sentence_len + percentage_complex_word)

    avg_words_per_sentence = word_count/sentence_count
    avg_words_per_sentence_dict[key] = avg_words_per_sentence


    complex_word_count_dict[key] = complex_word_count

    word_count_dict[key] = word_count

    syllable_per_word = total_syllable_count/word_count
    syllable_count_dict[key] = syllable_per_word


    avg_word_len = total_word_length/word_count
    avg_word_length_dict[key] = avg_word_len
    



In [None]:
positive_scores_dict

## Combining the data frames and exporting to output as excel file

In [92]:
# Create a list to store the data for each row
data = []

# Iterate over the keys in the positive_scores_dict (assuming both dictionaries have the same keys)
for key in positive_scores_dict.keys():
    data.append([key, positive_scores_dict[key], negative_scores_dict[key], polarity_scores_dict[key],subjectivity_score_dict[key],avg_sentence_length_dict[key], percentage_of_complex_words_dict[key], fog_index_dict[key],avg_words_per_sentence_dict[key],complex_word_count_dict[key],word_count_dict[key], syllable_count_dict[key], personal_pronouns_count_dict[key], avg_word_length_dict[key]])

# Create a dataframe using the data list and specify the column names
df = pd.DataFrame(data, columns=['URL_ID', 'POSITIVE_SCORE', 'NEGATIVE_SCORE',"POLARITY_SCORE","SUBJECTIVITY_SCORE","AVG SENTENCE LENGTH","PERCENTAGE OF COMPLEX WORDS", "FOG INDEX", "AVG NUMBER OF WORDS PER SENTENCE","COMPLEX WORD COUNT","WORD COUNT", "SYLLABLE PER WORD", "PERSONAL PRONOUNS", "AVG WORD LENGTH"])

In [93]:
df

Unnamed: 0,URL_ID,POSITIVE_SCORE,NEGATIVE_SCORE,POLARITY_SCORE,SUBJECTIVITY_SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,69,-32,1.0,0.104016,11.841463,1.000000,5.136585,11.841463,971,971,2.571576,0,7.807415
1,38,65,-35,1.0,0.166945,7.487500,0.998331,3.394332,7.487500,598,599,2.277129,6,7.193656
2,39,62,-40,1.0,0.122596,9.142857,0.998798,4.056662,9.142857,831,832,2.631010,2,7.822115
3,40,56,-28,1.0,0.131661,6.194175,1.000000,2.877670,6.194175,638,638,2.387147,18,7.126959
4,41,59,-25,1.0,0.106870,9.469880,1.000000,4.187952,9.469880,786,786,2.391858,15,7.318066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,146,22,-24,-1.0,0.108235,8.173077,0.997647,3.668290,8.173077,424,425,2.458824,11,7.797647
107,147,30,-9,1.0,0.073585,10.600000,1.000000,4.640000,10.600000,530,530,2.460377,2,7.390566
108,148,36,-45,-1.0,0.139655,8.656716,0.998276,3.861997,8.656716,579,580,2.396552,2,7.084483
109,149,33,-3,1.0,0.125000,10.666667,1.000000,4.666667,10.666667,288,288,2.649306,0,7.927083


In [94]:
final_data = pd.merge(dataset, df, on='URL_ID', how='left')

In [95]:
final_data[0:10]

Unnamed: 0,URL_ID,URL,POSITIVE_SCORE,NEGATIVE_SCORE,POLARITY_SCORE,SUBJECTIVITY_SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,37,https://insights.blackcoffer.com/ai-in-healthc...,69.0,-32.0,1.0,0.104016,11.841463,1.0,5.136585,11.841463,971.0,971.0,2.571576,0.0,7.807415
1,38,https://insights.blackcoffer.com/what-if-the-c...,65.0,-35.0,1.0,0.166945,7.4875,0.998331,3.394332,7.4875,598.0,599.0,2.277129,6.0,7.193656
2,39,https://insights.blackcoffer.com/what-jobs-wil...,62.0,-40.0,1.0,0.122596,9.142857,0.998798,4.056662,9.142857,831.0,832.0,2.63101,2.0,7.822115
3,40,https://insights.blackcoffer.com/will-machine-...,56.0,-28.0,1.0,0.131661,6.194175,1.0,2.87767,6.194175,638.0,638.0,2.387147,18.0,7.126959
4,41,https://insights.blackcoffer.com/will-ai-repla...,59.0,-25.0,1.0,0.10687,9.46988,1.0,4.187952,9.46988,786.0,786.0,2.391858,15.0,7.318066
5,42,https://insights.blackcoffer.com/man-and-machi...,37.0,-20.0,1.0,0.101968,9.637931,1.0,4.255172,9.637931,559.0,559.0,2.32737,18.0,7.194991
6,43,https://insights.blackcoffer.com/in-future-or-...,24.0,-10.0,1.0,0.099707,7.413043,0.997067,3.364044,7.413043,340.0,341.0,2.451613,7.0,7.228739
7,44,https://insights.blackcoffer.com/how-neural-ne...,,,,,,,,,,,,,
8,45,https://insights.blackcoffer.com/how-machine-l...,30.0,-11.0,1.0,0.128931,8.833333,1.0,3.933333,8.833333,318.0,318.0,2.198113,0.0,6.874214
9,46,https://insights.blackcoffer.com/deep-learning...,64.0,-30.0,1.0,0.099788,9.915789,0.998938,4.365891,9.915789,941.0,942.0,2.453291,8.0,7.326964


In [96]:
final_data.to_excel('output.xlsx', index=False)