# Data Collection

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import concurrent.futures

In [None]:
# Load the input workbook; its URL column is what the scraping cells below fetch.
input_df = pd.read_excel('Input.xlsx')
input_df

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...
...,...,...
109,50921.0,https://insights.blackcoffer.com/coronavirus-i...
110,51382.8,https://insights.blackcoffer.com/coronavirus-i...
111,51844.6,https://insights.blackcoffer.com/what-are-the-...
112,52306.4,https://insights.blackcoffer.com/marketing-dri...


In [None]:
def extract_data(url, timeout=30):
    """Download an article page and return its title and body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block its worker forever.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is ``None`` when the page
        has no ``<title>`` tag. ``(None, None)`` is returned when the page
        cannot be fetched or parsed; the failure is reported on stdout.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # A page without <title> should still yield its body text.
        title_tag = soup.find("title")
        title = title_tag.get_text() if title_tag is not None else None

        divs = soup.find_all('div', class_=["td-post-content tagdiv-type", "tdb-block-inner td-fix-index"])

        pieces = []
        for div in divs:
            # "li" already matches every list item, so "ul" is not searched
            # separately: doing both added each list item's text twice.
            for element in div.find_all(["p", "li"]):
                pieces.append(element.get_text())

        # Join with a space so the last word of one element is not fused with
        # the first word of the next.
        return title, " ".join(pieces).strip()

    except Exception as e:
        # Best-effort scrape: report the failure instead of hiding it, and
        # keep the (None, None) contract callers rely on.
        print(f"Failed to extract {url}: {e}")
        return None, None

In [None]:
def process_row(row):
    """Scrape the page referenced by ``row.URL`` and return its (title, text) pair."""
    return extract_data(row.URL)

In [None]:
import concurrent.futures

# Scrape the rows with up to five worker threads. executor.map yields results
# in the order of its input, so `results` lines up row-for-row with input_df.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(process_row, input_df.itertuples(index=False)))

In [None]:
# Unzip the (title, text) pairs into two new columns.
input_df["Title"], input_df["Text"] = zip(*results)

In [None]:
input_df

Unnamed: 0,URL_ID,URL,Title,Text
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"Telemedicine, the use of technology to diagnos..."
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,"The rise of e-health, or the use of electronic..."
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,2020 was the year the world was ravaged by the...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"“More gains on quality, affordability and acce..."
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"“More gains on quality, affordability and acce..."
...,...,...,...,...
109,50921.0,https://insights.blackcoffer.com/coronavirus-i...,Coronavirus: Impact on the Hospitality Industr...,Before jumping on the topic I would like to gi...
110,51382.8,https://insights.blackcoffer.com/coronavirus-i...,Coronavirus impact on energy markets - Blackco...,As the coronavirus spreads around the world an...
111,51844.6,https://insights.blackcoffer.com/what-are-the-...,impacts of COVID-19 on the world of work - Wha...,"From Alibaba to Ping An and Google to Ford, co..."
112,52306.4,https://insights.blackcoffer.com/marketing-dri...,Marketing Drives Results With A Focus On Probl...,"When\nthe British ruled India, many Indians\na..."


# Data Preprocessing

In [None]:
# Remove the rows labelled 24 and 37, then renumber the index from 0.
# NOTE(review): presumably these are the rows whose scrape failed - confirm.
# The labels are hard-coded, so re-running this cell drops two further rows.
input_df = input_df.drop([24, 37]).reset_index(drop=True)

In [None]:
input_df['Text'].isna().values

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

Lowercasing

In [None]:
# Convert every article's text to lower case.
# NOTE(review): this discards capitalisation, which the "US" exclusion in
# count_personal_pronouns presumably needs - confirm the intended order.
input_df['Text'] = input_df['Text'].str.lower()

Remove HTML tags

In [None]:
# Function for removing HTML tags
def remove_html_tags(text):
  """Return `text` with any HTML markup stripped, keeping only the text content."""
  soup = BeautifulSoup(text, "html.parser")
  return soup.get_text()

# This statement used to be indented after the `return` above, which made it
# unreachable: the column was never actually cleaned.
input_df['Text'] = input_df['Text'].apply(remove_html_tags)

Remove Punctuation

In [None]:
import string

# Function for removing punctuation
def remove_punctuations(text):
    """Return `text` without ASCII punctuation, en dashes or curly double quotes."""
    unwanted = string.punctuation + ':–“”'
    # Mapping a code point to None tells str.translate to delete that character.
    deletion_table = dict.fromkeys(map(ord, unwanted))
    return text.translate(deletion_table)

# Strip punctuation from every article.
# NOTE(review): this also removes sentence-ending punctuation, which the later
# sent_tokenize-based metrics presumably depend on - confirm.
input_df['Text'] = input_df['Text'].apply(remove_punctuations)

# (1)Sentiment Analysis

(1.1) Cleaning using Stop Words Lists

In [None]:
# Stop-word files to merge into a single lookup set
stopword_files = [
    'StopWords_DatesandNumbers.txt',
    'StopWords_Geographic.txt',
    'StopWords_Generic.txt',
    'StopWords_Currencies.txt',
    'StopWords_Auditor.txt',
    'StopWords_Names.txt',
    'StopWords_GenericLong.txt',
]

custom_stop_words = set()

# Function for merging all stop words
def merge_stop_words_from_file(file_path):
    """Add every whitespace-separated word in `file_path` to custom_stop_words.

    Words are stored lower-cased because remove_stop_words looks them up with
    `word.lower()`; an entry kept in upper or mixed case could never match.
    A file that cannot be read is reported on stdout and skipped.
    """
    try:
        with open(file_path, 'r', encoding='ISO-8859-1', errors='ignore') as file:
            stop_words_in_file = file.read().split()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return
    custom_stop_words.update(word.lower() for word in stop_words_in_file)

for file_path in stopword_files:
    merge_stop_words_from_file(file_path)

# Function for removing stop words
def remove_stop_words(text):
    """Return `text` without the words whose lower-cased form is in custom_stop_words."""
    kept = (token for token in text.split() if token.lower() not in custom_stop_words)
    return ' '.join(kept)

# Apply the "remove_stop_words" function
input_df['Text'] = input_df['Text'].apply(remove_stop_words)

(1.2) Creating a dictionary of Positive and Negative words

In [None]:
# Build the positive and negative word dictionaries, leaving out any word that
# is also present in the stop-word set.

positive_words = dict()
negative_words = dict()

with open('positive-words.txt', 'r', encoding='utf-8') as file:
    positive_words_list = file.read().split()
# Every retained positive word maps to +1.
positive_words = dict.fromkeys(
    (word for word in positive_words_list if word not in custom_stop_words), 1
)

with open('negative-words.txt', 'r', encoding='latin-1') as file:
    negative_words_list = file.read().split()
# Every retained negative word maps to -1.
negative_words = dict.fromkeys(
    (word for word in negative_words_list if word not in custom_stop_words), -1
)

Tokenization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch the Punkt tokenizer data before tokenizing
nltk.download('punkt')

# Split every article into word tokens
input_df['Tokens'] = input_df['Text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Preprocess Output file

In [None]:
# Load the output template and drop the same row labels (24 and 37) that were
# removed from input_df, so both frames share the same 0-based index.
output_df = (
    pd.read_excel('Output Data Structure.xlsx')
    .drop([24, 37])
    .reset_index(drop=True)
)
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,,,,,,,,,,,,,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,,,,,,,,,,,,,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,,,,,,,,,,,,,
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,,,,,,,,,,,,,
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,,,,,,,,,,,,,
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,,,,,,,,,,,,,


(1.3) Extracting Derived variables

In [None]:
# Function to calculate Positive Score
def calculate_positive_score(tokens, positive_words):
    """Count how many entries of `tokens` appear in `positive_words`."""
    # Each membership test is a bool; summing bools counts the True ones.
    return sum(token in positive_words for token in tokens)

# Function to calculate Negative Score
def calculate_negative_score(tokens, negative_words):
    """Count how many entries of `tokens` appear in `negative_words`."""
    # Each membership test is a bool; summing bools counts the True ones.
    return sum(token in negative_words for token in tokens)

# Function to calculate Polarity Score
def calculate_polarity_score(positive_score, negative_score):
    """Return (positive - negative) / (positive + negative + 0.000001)."""
    difference = positive_score - negative_score
    # The small constant keeps the denominator non-zero when both scores are 0.
    denominator = positive_score + negative_score + 0.000001
    return difference / denominator

# Function to calculate Subjectivity Score
def calculate_subjectivity_score(positive_score, negative_score, total_words):
    """Return (positive + negative) / (total_words + 0.000001)."""
    sentiment_hits = positive_score + negative_score
    # The small constant keeps the denominator non-zero for an empty article.
    denominator = total_words + 0.000001
    return sentiment_hits / denominator

In [None]:
# Score every article and write the four sentiment metrics into output_df.
# output_df is addressed by input_df's index label, so both frames must share
# the same index.
for index, tokens in input_df['Tokens'].items():
    total_words = len(tokens)

    positive_score = calculate_positive_score(tokens, positive_words)
    negative_score = calculate_negative_score(tokens, negative_words)

    output_df.at[index, 'POSITIVE SCORE'] = positive_score
    output_df.at[index, 'NEGATIVE SCORE'] = negative_score
    output_df.at[index, 'POLARITY SCORE'] = calculate_polarity_score(positive_score, negative_score)
    output_df.at[index, 'SUBJECTIVITY SCORE'] = calculate_subjectivity_score(
        positive_score, negative_score, total_words
    )

In [None]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,75.0,22.0,0.546392,0.115339,,,,,,,,,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.183453,,,,,,,,,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,23.0,27.0,-0.080000,0.090090,,,,,,,,,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,,,,,,,,,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,42.0,-0.787234,0.109302,,,,,,,,,
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,23.0,52.0,-0.386667,0.080645,,,,,,,,,
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,90.0,32.0,0.475410,0.125000,,,,,,,,,
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,24.0,21.0,0.066667,0.066568,,,,,,,,,


# (2) Analysis of Readability

In [None]:
! pip install textstat

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from textstat import syllable_count

# Function to calculate Average Sentence Length
def calculate_average_sentence_length(text):
    """Return the mean number of word tokens per sentence in `text`.

    Sentences come from sent_tokenize and words from word_tokenize. Empty or
    whitespace-only text yields 0.0 instead of raising ZeroDivisionError.
    """
    if not text or not text.strip():
        return 0.0
    sentences = sent_tokenize(text)
    if not sentences:
        return 0.0
    total_words = sum(len(word_tokenize(sentence)) for sentence in sentences)
    return total_words / len(sentences)

# Function to calculate Percentage of Complex Words
def calculate_percentage_complex_words(text):
    """Return the percentage of word tokens in `text` with three or more syllables.

    Syllables are counted with textstat's syllable_count. Empty or
    whitespace-only text yields 0.0 instead of raising ZeroDivisionError.
    """
    if not text or not text.strip():
        return 0.0
    words = word_tokenize(text)
    if not words:
        return 0.0
    complex_word_count = sum(1 for word in words if syllable_count(word) >= 3)
    return (complex_word_count / len(words)) * 100

# Function to calculate Fog Index
def calculate_gunning_fog_index(average_sentence_length, percentage_complex_words):
    """Return 0.4 * (average sentence length + percentage of complex words)."""
    combined = average_sentence_length + percentage_complex_words
    return 0.4 * combined

# Compute the readability metrics for every article and store them in output_df.
# NOTE(review): in the recorded output AVG SENTENCE LENGTH is a whole number
# close to WORD COUNT for every row, i.e. each article is treated as a single
# sentence - presumably because punctuation was stripped earlier. Confirm.
for index, text in input_df['Text'].items():
    average_sentence_length = calculate_average_sentence_length(text)
    percentage_complex_words = calculate_percentage_complex_words(text)

    output_df.at[index, 'AVG SENTENCE LENGTH'] = average_sentence_length
    output_df.at[index, 'PERCENTAGE OF COMPLEX WORDS'] = percentage_complex_words
    output_df.at[index, 'FOG INDEX'] = calculate_gunning_fog_index(
        average_sentence_length, percentage_complex_words
    )

In [None]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,75.0,22.0,0.546392,0.115339,841.0,29.369798,348.147919,,,,,,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.183453,278.0,42.446043,128.178417,,,,,,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,23.0,27.0,-0.080000,0.090090,555.0,34.414414,235.765766,,,,,,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,,,,,,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,42.0,-0.787234,0.109302,430.0,26.046512,182.418605,,,,,,
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,23.0,52.0,-0.386667,0.080645,930.0,29.032258,383.612903,,,,,,
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,90.0,32.0,0.475410,0.125000,976.0,31.250000,402.900000,,,,,,
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,24.0,21.0,0.066667,0.066568,676.0,27.071006,281.228402,,,,,,


# (3) Average Number of Words Per Sentence

In [None]:
# Function to calculate the average number of words per sentence
def calculate_average_words_per_sentence(tokenized_sentences):
    """Return the mean number of word tokens per sentence.

    Args:
        tokenized_sentences: Sequence of sentence strings.

    Returns:
        Total word_tokenize tokens divided by the number of sentences, or 0.0
        for an empty sequence (mirroring the zero guard in
        calculate_average_word_length).
    """
    total_sentences = len(tokenized_sentences)
    if total_sentences == 0:
        return 0.0  # Avoid division by zero
    total_words = sum(len(word_tokenize(sentence)) for sentence in tokenized_sentences)
    return total_words / total_sentences

# Split each article into sentences and record its average words per sentence
for index, text in input_df['Text'].items():
    tokenized_sentences = sent_tokenize(text)
    output_df.at[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = (
        calculate_average_words_per_sentence(tokenized_sentences)
    )

In [None]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,75.0,22.0,0.546392,0.115339,841.0,29.369798,348.147919,841.0,,,,,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.183453,278.0,42.446043,128.178417,278.0,,,,,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,23.0,27.0,-0.080000,0.090090,555.0,34.414414,235.765766,555.0,,,,,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,,,,,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,42.0,-0.787234,0.109302,430.0,26.046512,182.418605,430.0,,,,,
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,23.0,52.0,-0.386667,0.080645,930.0,29.032258,383.612903,930.0,,,,,
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,90.0,32.0,0.475410,0.125000,976.0,31.250000,402.900000,976.0,,,,,
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,24.0,21.0,0.066667,0.066568,676.0,27.071006,281.228402,676.0,,,,,


# (4) Complex Word Count

In [None]:
from nltk.corpus import cmudict

# Function to count syllables in a word using the CMU Pronouncing Dictionary
def count_syllables(word, pronouncing_dict):
    """Return the largest syllable count among the word's listed pronunciations.

    A phoneme counts as a syllable when its last character is a digit. Words
    missing from `pronouncing_dict` are assumed to have 1 syllable.
    """
    key = word.lower()
    if key not in pronouncing_dict:
        return 1
    return max(
        sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
        for pronunciation in pronouncing_dict[key]
    )

In [None]:
# Load the CMU Pronouncing Dictionary
# nltk.download fetches the corpus; cmudict.dict() builds the lookup that the
# two-argument count_syllables indexes by lower-cased word.
nltk.download('cmudict')
pronouncing_dict = cmudict.dict()

[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


In [None]:
# Function to calculate the complex word count
def calculate_complex_word_count(text):
    """Count the word tokens of `text` that have more than two syllables.

    Syllables are looked up through count_syllables with the module-level
    pronouncing_dict.
    """
    # Each comparison is a bool; summing bools counts the True ones.
    return sum(count_syllables(token, pronouncing_dict) > 2 for token in word_tokenize(text))

# Record the complex word count of every article in output_df
for index, text in input_df['Text'].items():
    output_df.at[index, 'COMPLEX WORD COUNT'] = calculate_complex_word_count(text)

In [None]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,75.0,22.0,0.546392,0.115339,841.0,29.369798,348.147919,841.0,328.0,,,,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.183453,278.0,42.446043,128.178417,278.0,126.0,,,,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,23.0,27.0,-0.080000,0.090090,555.0,34.414414,235.765766,555.0,202.0,,,,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,,,,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,42.0,-0.787234,0.109302,430.0,26.046512,182.418605,430.0,117.0,,,,
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,23.0,52.0,-0.386667,0.080645,930.0,29.032258,383.612903,930.0,205.0,,,,
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,90.0,32.0,0.475410,0.125000,976.0,31.250000,402.900000,976.0,316.0,,,,
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,24.0,21.0,0.066667,0.066568,676.0,27.071006,281.228402,676.0,218.0,,,,


In [None]:
input_df.columns

Index(['URL_ID', 'URL', 'Title', 'Text', 'Tokens'], dtype='object')

# (5) Word Count

In [None]:
from nltk.corpus import stopwords
# Download the NLTK stop-word corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

# Function to calculate the word count after removing stopwords and punctuation
def calculate_word_count(tokenized_words, stop_words=None):
    """Count the tokens that are neither stop words nor pure punctuation.

    Args:
        tokenized_words: Iterable of word tokens.
        stop_words: Optional collection of stop words. Defaults to NLTK's
            English list; pass a prebuilt set to avoid rebuilding it per call.

    Returns:
        The number of remaining tokens.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    def is_punctuation(token):
        # `token in string.punctuation` is a substring test, so tokens such as
        # "--" or "..." slipped through; check every character instead.
        return all(char in punctuation_chars for char in token)

    return sum(
        1 for word in tokenized_words
        if word not in stop_words and not is_punctuation(word)
    )

# Record the cleaned word count of every article, using its token list
for index, tokenized_words in input_df['Tokens'].items():
    output_df.at[index, 'WORD COUNT'] = calculate_word_count(tokenized_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,75.0,22.0,0.546392,0.115339,841.0,29.369798,348.147919,841.0,328.0,838.0,,,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.183453,278.0,42.446043,128.178417,278.0,126.0,278.0,,,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,23.0,27.0,-0.080000,0.090090,555.0,34.414414,235.765766,555.0,202.0,553.0,,,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,640.0,,,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,640.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,42.0,-0.787234,0.109302,430.0,26.046512,182.418605,430.0,117.0,427.0,,,
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,23.0,52.0,-0.386667,0.080645,930.0,29.032258,383.612903,930.0,205.0,919.0,,,
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,90.0,32.0,0.475410,0.125000,976.0,31.250000,402.900000,976.0,316.0,942.0,,,
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,24.0,21.0,0.066667,0.066568,676.0,27.071006,281.228402,676.0,218.0,662.0,,,


# (6) Syllable Count Per Word

In [None]:
def count_syllables(word, pronouncing_dict=None):
    """Return the number of syllables in `word`.

    This definition replaces the earlier two-argument count_syllables once the
    cell has run. The optional `pronouncing_dict` keeps that earlier call
    signature working, so re-running the complex-word cell no longer raises
    TypeError.

    Args:
        word: The word to analyse; case is ignored.
        pronouncing_dict: Optional mapping of lower-cased word to a list of
            pronunciations (phoneme lists). When given, the result is the
            largest number of digit-terminated phonemes among the
            pronunciations, or 1 if the word is not in the mapping.

    Returns:
        The syllable count, always at least 1 for the heuristic path.
    """
    # Convert the word to lowercase
    word = word.lower()

    if pronouncing_dict is not None:
        if word in pronouncing_dict:
            return max(
                sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
                for pronunciation in pronouncing_dict[word]
            )
        # Unknown words are assumed to have one syllable.
        return 1

    # Remove trailing 'es' and 'ed' (common exceptions)
    if word.endswith(('es', 'ed')):
        word = word[:-2]

    # Count vowel groups: a vowel starts a new syllable only when the previous
    # character was not a vowel.
    vowels = "aeiouy"
    count = 0
    prev_char = ""

    for char in word:
        if char in vowels and prev_char not in vowels:
            count += 1
        prev_char = char

    # Handle words with no vowels
    return count if count > 0 else 1

# Store, for each article, the list of per-token syllable counts in output_df
output_df['SYLLABLE PER WORD'] = input_df['Tokens'].apply(
    lambda tokens: list(map(count_syllables, tokens))
)

In [None]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,75.0,22.0,0.546392,0.115339,841.0,29.369798,348.147919,841.0,328.0,838.0,"[6, 4, 3, 1, 2, 4, 2, 2, 1, 1, 2, 2, 1, 2, 4, ...",,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.183453,278.0,42.446043,128.178417,278.0,126.0,278.0,"[2, 1, 3, 1, 5, 1, 2, 2, 4, 3, 2, 2, 1, 4, 2, ...",,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,23.0,27.0,-0.080000,0.090090,555.0,34.414414,235.765766,555.0,202.0,553.0,"[1, 1, 1, 2, 2, 2, 3, 2, 1, 3, 2, 4, 1, 2, 1, ...",,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,640.0,"[1, 3, 5, 5, 2, 2, 2, 1, 1, 1, 4, 1, 1, 2, 1, ...",,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,640.0,"[1, 3, 5, 5, 2, 2, 2, 1, 1, 1, 4, 1, 1, 2, 1, ...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,42.0,-0.787234,0.109302,430.0,26.046512,182.418605,430.0,117.0,427.0,"[2, 2, 2, 2, 5, 2, 1, 7, 5, 2, 3, 2, 1, 2, 2, ...",,
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,23.0,52.0,-0.386667,0.080645,930.0,29.032258,383.612903,930.0,205.0,919.0,"[5, 1, 1, 2, 2, 3, 2, 2, 2, 2, 2, 1, 4, 2, 2, ...",,
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,90.0,32.0,0.475410,0.125000,976.0,31.250000,402.900000,976.0,316.0,942.0,"[3, 1, 2, 1, 3, 2, 2, 1, 3, 3, 2, 1, 1, 1, 2, ...",,
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,24.0,21.0,0.066667,0.066568,676.0,27.071006,281.228402,676.0,218.0,662.0,"[2, 1, 1, 1, 1, 1, 3, 2, 2, 3, 1, 1, 3, 3, 2, ...",,


# (7) Personal Pronouns

In [None]:
import re

# Define a function to count personal pronouns
def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and on whole words. Only the exact upper-case
    token "US" is excluded. The previous check lower-cased the match first,
    which discarded every "us" and so never counted that pronoun.

    NOTE(review): the exclusion can only work on text that still has its
    original capitalisation - confirm what the caller passes in.
    """
    # Regex pattern for the specified personal pronouns
    pattern = r'\b(I|we|my|ours|us)\b'

    # Find all whole-word matches, ignoring case
    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    # Count everything except the exact token "US"
    return sum(1 for match in matches if match != "US")

# Store the count of Personal Pronouns in the corresponding column of output_df
# NOTE(review): every row shown in the recorded output is 0. By this point
# 'Text' has been lower-cased and stop-word filtered, which presumably removed
# the pronouns - confirm whether the count should run on the original text.
output_df['PERSONAL PRONOUNS'] = input_df['Text'].apply(count_personal_pronouns)

In [None]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,75.0,22.0,0.546392,0.115339,841.0,29.369798,348.147919,841.0,328.0,838.0,"[6, 4, 3, 1, 2, 4, 2, 2, 1, 1, 2, 2, 1, 2, 4, ...",0,
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.183453,278.0,42.446043,128.178417,278.0,126.0,278.0,"[2, 1, 3, 1, 5, 1, 2, 2, 4, 3, 2, 2, 1, 4, 2, ...",0,
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,23.0,27.0,-0.080000,0.090090,555.0,34.414414,235.765766,555.0,202.0,553.0,"[1, 1, 1, 2, 2, 2, 3, 2, 1, 3, 2, 4, 1, 2, 1, ...",0,
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,640.0,"[1, 3, 5, 5, 2, 2, 2, 1, 1, 1, 4, 1, 1, 2, 1, ...",0,
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,640.0,"[1, 3, 5, 5, 2, 2, 2, 1, 1, 1, 4, 1, 1, 2, 1, ...",0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,42.0,-0.787234,0.109302,430.0,26.046512,182.418605,430.0,117.0,427.0,"[2, 2, 2, 2, 5, 2, 1, 7, 5, 2, 3, 2, 1, 2, 2, ...",0,
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,23.0,52.0,-0.386667,0.080645,930.0,29.032258,383.612903,930.0,205.0,919.0,"[5, 1, 1, 2, 2, 3, 2, 2, 2, 2, 2, 1, 4, 2, 2, ...",0,
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,90.0,32.0,0.475410,0.125000,976.0,31.250000,402.900000,976.0,316.0,942.0,"[3, 1, 2, 1, 3, 2, 2, 1, 3, 3, 2, 1, 1, 1, 2, ...",0,
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,24.0,21.0,0.066667,0.066568,676.0,27.071006,281.228402,676.0,218.0,662.0,"[2, 1, 1, 1, 1, 1, 3, 2, 2, 3, 1, 1, 3, 3, 2, ...",0,


# (8) Average Word Length

In [None]:
# Define a function to calculate average word length
def calculate_average_word_length(text):
    """Return the mean character length of the whitespace-separated words in `text`.

    Returns 0 when `text` contains no words.
    """
    words = text.split()
    if not words:
        return 0  # Avoid division by zero
    return sum(map(len, words)) / len(words)

# Store the calculated Avg Word Length in the corresponding column of output_df
# The assignment aligns on index labels, so input_df and output_df must share
# the same index.
output_df['AVG WORD LENGTH'] = input_df['Text'].apply(calculate_average_word_length)

In [None]:
output_df

Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,75.0,22.0,0.546392,0.115339,841.0,29.369798,348.147919,841.0,328.0,838.0,"[6, 4, 3, 1, 2, 4, 2, 2, 1, 1, 2, 2, 1, 2, 4, ...",0,8.031212
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,38.0,13.0,0.490196,0.183453,278.0,42.446043,128.178417,278.0,126.0,278.0,"[2, 1, 3, 1, 5, 1, 2, 2, 4, 3, 2, 2, 1, 4, 2, ...",0,8.154676
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,23.0,27.0,-0.080000,0.090090,555.0,34.414414,235.765766,555.0,202.0,553.0,"[1, 1, 1, 2, 2, 2, 3, 2, 1, 3, 2, 4, 1, 2, 1, ...",0,7.681319
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,640.0,"[1, 3, 5, 5, 2, 2, 2, 1, 1, 1, 4, 1, 1, 2, 1, ...",0,7.642520
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,33.0,27.0,0.100000,0.092593,648.0,32.098765,272.039506,648.0,243.0,640.0,"[1, 3, 5, 5, 2, 2, 2, 1, 1, 1, 4, 1, 1, 2, 1, ...",0,7.642520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,50921.0,https://insights.blackcoffer.com/coronavirus-i...,5.0,42.0,-0.787234,0.109302,430.0,26.046512,182.418605,430.0,117.0,427.0,"[2, 2, 2, 2, 5, 2, 1, 7, 5, 2, 3, 2, 1, 2, 2, ...",0,7.076010
108,51382.8,https://insights.blackcoffer.com/coronavirus-i...,23.0,52.0,-0.386667,0.080645,930.0,29.032258,383.612903,930.0,205.0,919.0,"[5, 1, 1, 2, 2, 3, 2, 2, 2, 2, 2, 1, 4, 2, 2, ...",0,7.177313
109,51844.6,https://insights.blackcoffer.com/what-are-the-...,90.0,32.0,0.475410,0.125000,976.0,31.250000,402.900000,976.0,316.0,942.0,"[3, 1, 2, 1, 3, 2, 2, 1, 3, 3, 2, 1, 1, 1, 2, ...",0,7.207304
110,52306.4,https://insights.blackcoffer.com/marketing-dri...,24.0,21.0,0.066667,0.066568,676.0,27.071006,281.228402,676.0,218.0,662.0,"[2, 1, 1, 1, 1, 1, 3, 2, 2, 3, 1, 1, 3, 3, 2, ...",0,7.421384


# Save and Download the output file

In [None]:
# Save the output file
# NOTE(review): this path is the same workbook the template was loaded from,
# so saving overwrites the empty template - confirm that is intended.

output_file_path = 'Output Data Structure.xlsx'
# index=False keeps the DataFrame's row index out of the workbook; otherwise it
# is written as an extra unnamed first column.
output_df.to_excel(output_file_path, index=False)

# Offer the file as a browser download when running in Google Colab; in any
# other environment the file simply stays on disk.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download(output_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>