In [50]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import nltk
nltk.download('punkt')


## Example of Extracting Text

In [51]:
def extract_article_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        # Attempt to extract the title
        title = soup.find("h1", class_="tdb-title-text")
        title_text = title.text.strip() if title else None
        if title_text is None:
            alt_title = soup.find("h1", class_="entry-title")
            title_text = alt_title.text.strip() if alt_title else None

        # Attempt to extract the article text
        article_content = soup.find("div", class_="td-post-content")
        article_text = ""
        if article_content:
            paragraphs = article_content.find_all("p")
            for paragraph in paragraphs:
                article_text += paragraph.get_text() + "\n"
            # Remove <pre> element and its contents
            for pre in article_content.find_all("pre"):
                pre.decompose()

        return title_text, article_text

    except Exception as e:
        print(f"Error extracting data from {url}: {e}")
        return None, None

# URLs to extract data from
urls = [
    'https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-3-2/',
    'https://insights.blackcoffer.com/rise-of-e-health-and-its-imapct-on-humans-by-the-year-2030-2/',
    'https://insights.blackcoffer.com/how-advertisement-increase-your-market-value/'
]

# Extracting data from URLs
data = []
for url in urls:
    title, text = extract_article_text(url)
    data.append({'URL': url, 'Title': title, 'Article Text': text})

# Printing the extracted data
for item in data:
    print(f"URL: {item['URL']}\nTitle: {item['Title']}\nArticle Text: {item['Article Text']}\n")


URL: https://insights.blackcoffer.com/rise-of-telemedicine-and-its-impact-on-livelihood-by-2040-3-2/
Title: Rise of telemedicine and its Impact on Livelihood by 2040
Article Text: Telemedicine, the use of technology to diagnose and treat patients remotely, has been rising in recent years. With the advent of high-speed internet and improved video conferencing tools, healthcare providers are increasingly turning to telemedicine to provide care to patients in remote or underserved areas.
Telemedicine, using technology to provide healthcare services remotely, has recently gained popularity. With advancements in communication and medical technology, it has become increasingly possible for doctors and patients to connect and interact from anywhere in the world. This has led to the rise of telemedicine, which has the potential to revolutionize the way healthcare is delivered.#Telemedicine
The increasing focus on preventative healthcare has also driven the rise of telemedicine. As more and mor

## Importing Input File

In [78]:
df=pd.read_excel('Input.xlsx')

In [79]:
df

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...
...,...,...
109,50921.0,https://insights.blackcoffer.com/coronavirus-i...
110,51382.8,https://insights.blackcoffer.com/coronavirus-i...
111,51844.6,https://insights.blackcoffer.com/what-are-the-...
112,52306.4,https://insights.blackcoffer.com/marketing-dri...


In [80]:
df.head()

Unnamed: 0,URL_ID,URL
0,123.0,https://insights.blackcoffer.com/rise-of-telem...
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...
4,432.0,https://insights.blackcoffer.com/rise-of-telem...


## Extracting 

In [81]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Define the extraction function
def extract_article_text(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        # Attempt to extract the title
        title = soup.find("h1", class_="tdb-title-text")
        title_text = title.text.strip() if title else None
        if title_text is None:
            alt_title = soup.find("h1", class_="entry-title")
            title_text = alt_title.text.strip() if alt_title else None


        # Attempt to extract the article text
        article_content = soup.find("div", class_="td-post-content")
        article_text = ""
        if article_content:
            paragraphs = article_content.find_all("p")
            for paragraph in paragraphs:
                article_text += paragraph.get_text() + "\n"
        # Remove <pre> element and its contents
            for pre in article_content.find_all("pre"):
                pre.decompose()

        return title_text, article_text

    except Exception as e:
        print(f"Error extracting data from {url}: {e}")
        return None, None


# URLs are in the 'URLs' column of your DataFrame
urls = df['URL']

# Extracting data from URLs and creating new DataFrame columns
df[['Title', 'Article Text']] = urls.apply(lambda url: pd.Series(extract_article_text(url)))

# Display the DataFrame with extracted data
print(df)


      URL_ID                                                URL  \
0      123.0  https://insights.blackcoffer.com/rise-of-telem...   
1      321.0  https://insights.blackcoffer.com/rise-of-e-hea...   
2     2345.0  https://insights.blackcoffer.com/rise-of-e-hea...   
3     4321.0  https://insights.blackcoffer.com/rise-of-telem...   
4      432.0  https://insights.blackcoffer.com/rise-of-telem...   
..       ...                                                ...   
109  50921.0  https://insights.blackcoffer.com/coronavirus-i...   
110  51382.8  https://insights.blackcoffer.com/coronavirus-i...   
111  51844.6  https://insights.blackcoffer.com/what-are-the-...   
112  52306.4  https://insights.blackcoffer.com/marketing-dri...   
113  52768.2  https://insights.blackcoffer.com/continued-dem...   

                                                 Title  \
0    Rise of telemedicine and its Impact on Livelih...   
1    Rise of e-health and its impact on humans by t...   
2    Rise of e-health

In [82]:
# View the first 3 rows of the DataFrame
df.head(3)


Unnamed: 0,URL_ID,URL,Title,Article Text
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"Telemedicine, the use of technology to diagnos..."
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,"The rise of e-health, or the use of electronic..."
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,2020 was the year the world was ravaged by the...


In [83]:
# View rows with empty values in 'Article Text' column
empty_rows = df[df['Article Text'].apply(lambda x: not x)]
print(empty_rows)

     URL_ID                                                URL Title  \
24  11668.0  https://insights.blackcoffer.com/how-neural-ne...  None   
37  17671.4  https://insights.blackcoffer.com/covid-19-envi...  None   

   Article Text  
24               
37               


In [84]:
# Drop rows with empty values in 'Article Text' column
df.drop(df[df['Article Text'].apply(lambda x: not x)].index, inplace=True)


In [85]:
df.to_csv('output2.csv', index=False)

## Cleaning using Stop Words Lists


In [86]:
stopword_files = [
    "StopWords_Generic.txt",
    "StopWords_Auditor.txt",
    "StopWords_Currencies.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Geographic.txt",
    "StopWords_Names.txt"
]
stop_words = set()

# Read stop words from each file and add them to the set
for file_name in stopword_files:
    with open(file_name, 'r') as file:
        words = file.read().splitlines()
        stop_words.update(words)


In [87]:
df['Article Text'] = df['Article Text'].apply(lambda text: ' '.join([word for word in text.split() if word.lower() not in stop_words]))
print(df['Article Text'])

0      Telemedicine, technology diagnose treat patien...
1      rise e-health, electronic means facilitate hea...
2      2020 year world ravaged SarsCov2 virus. notori...
3      “More gains quality, affordability accessibili...
4      “More gains quality, affordability accessibili...
                             ...                        
109    jumping topic give overview Coronavirus, Covid...
110    coronavirus spreads world countries implement ...
111    Alibaba Ping Google Ford, companies globe tell...
112    British ruled India, Indians accepted work pol...
113    business business longer business increase bot...
Name: Article Text, Length: 112, dtype: object


## Creating a dictionary of Positive and Negative words

In [88]:
# Read the positive and negative words from files
positive_words = []
negative_words = []

# Define file paths of positive and negative words
positive_file = 'positive-words.txt'
negative_file = 'negative-words.txt'

# Read positive words from file
with open(positive_file, 'r') as file:
    positive_words = file.read().splitlines()

# Read negative words from file
with open(negative_file, 'r') as file:
    negative_words = file.read().splitlines()

# Filtering out positive and negative words not present in the stop words list
positive_words = [word for word in positive_words if word.lower() not in stop_words]
negative_words = [word for word in negative_words if word.lower() not in stop_words]


## Extracting Derived variables 

In [89]:
# Function to calculate the positive score for each article text
def calculate_positive_score(text):
    tokens = text.split()  # Tokenize the text (you can use more sophisticated tokenizers)
    positive_score = sum(1 for word in tokens if word.lower() in positive_words)
    return positive_score

# Apply the function to create a 'Positive Score' column in the DataFrame
df['Positive Score'] = df['Article Text'].apply(calculate_positive_score)


In [90]:
# Function to calculate the negative score for each article text
def calculate_negative_score(text):
    tokens = text.split()  # Tokenize the text (you can use more sophisticated tokenizers)
    negative_score = sum(-1 for word in tokens if word.lower() in negative_words)
    return -1 * negative_score  # Multiplying by -1 to make the score positive

# Apply the function to create a 'Negative Score' column in the DataFrame
df['Negative Score'] = df['Article Text'].apply(calculate_negative_score)


In [91]:
# Calculate the Polarity Score for each article text
df['Polarity Score'] = (df['Positive Score'] - df['Negative Score']) / (df['Positive Score'] + df['Negative Score'] + 0.000001)

# Calculate the total words after cleaning
df['Total Words'] = df['Article Text'].apply(lambda x: len(x.split()))

# Calculate the Subjectivity Score for each article text
df['Subjectivity Score'] = (df['Positive Score'] + df['Negative Score']) / (df['Total Words'] + 0.000001)


In [92]:
## Checking the Range
numeric_cols = df.select_dtypes(include=['number'])
range_of_numeric_cols = numeric_cols.agg(['min', 'max'])
print("Range (Min, Max) of each numeric column:")
print(range_of_numeric_cols)

Range (Min, Max) of each numeric column:
      URL_ID  Positive Score  Negative Score  Polarity Score  Total Words  \
min    123.0               0               0            -1.0           62   
max  52768.2              78              80             1.0         2220   

     Subjectivity Score  
min            0.000000  
max            0.195946  


## Average Number of Words Per Sentence

In [93]:
# Calculate the total number of sentences
df['Total Sentences'] = df['Article Text'].apply(lambda x: len(nltk.sent_tokenize(x)))

# Calculate the average number of words per sentence
df['Average Number of Words Per Sentence'] = df['Total Words'] / df['Total Sentences']

## Word Count

In [95]:
# Function to count the words after text cleaning using custom stop words
def count_cleaned_words(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    
    # Filter out punctuation and make words lowercase
    words = [word.lower() for word in words if word.isalpha()]
    
    # Remove custom stop words
    words = [word for word in words if word not in stop_words]
    
    # Return the count of cleaned words
    return len(words)

# Apply the function to create a 'Cleaned Word Count' column in the DataFrame
df['Cleaned Word Count'] = df['Article Text'].apply(count_cleaned_words)


## Syllable Count Per Word & Complex Word Count

In [94]:
# Define a function to count syllables in a word
def count_syllables(word):
    vowels = 'aeiouy'
    count = 0
    for i in range(len(word)):
        if word[i].lower() in vowels:
            count += 1
            if i < len(word)-1 and word[i+1].lower() in vowels:
                count -= 1
    if word.endswith(('es', 'ed')):
        count -= 1
    if count == 0:
        count = 1
    return count

# Calculate syllable count for each word in the DataFrame
df['Syllable Count'] = df['Article Text'].apply(lambda text: sum(count_syllables(word) for word in text.split()))

# Function to count complex words in a text
def count_complex_words(text):
    words = text.split()
    complex_word_count = sum(1 for word in words if count_syllables(word) > 2)
    return complex_word_count

# Calculate the complex word count for each article text in the DataFrame
df['Complex Word Count'] = df['Article Text'].apply(count_complex_words)

## Personal Pronouns

In [96]:
def count_personal_pronouns(text):
    # Define the regex pattern to match the personal pronouns
    pattern = r'\b(?<!\bUS\b)(?:I|we|my|ours|us)\b'

    # Find all matches while ignoring 'US' as a whole word
    matches = re.findall(pattern, text, flags=re.IGNORECASE)

    return len(matches)

# Apply the function to count personal pronouns in the DataFrame
df['Personal Pronouns'] = df['Article Text'].apply(count_personal_pronouns)


## Average Word Length


In [97]:
# Define a function to calculate the average word length
def average_word_length(text):
    words = text.split()
    total_chars = sum(len(word) for word in words)
    return total_chars / len(words) if len(words) > 0 else 0

# Apply the function to create an 'Average Word Length' column in the DataFrame
df['Average Word Length'] = df['Article Text'].apply(average_word_length)


## Analysis of Readability

In [99]:
# Calculate Average Sentence Length
df['Average Sentence Length'] = df['Total Words'] / df['Total Sentences']
# Percentage of Complex words
df['Percentage of Complex words'] = df['Complex Word Count'] / df['Total Words']
# Calculate Fog Index
df['Fog Index'] = 0.4 * (df['Average Sentence Length'] + df['Percentage of Complex words'])


## Display

In [100]:
df.head()

Unnamed: 0,URL_ID,URL,Title,Article Text,Positive Score,Negative Score,Polarity Score,Total Words,Subjectivity Score,Total Sentences,Average Number of Words Per Sentence,Syllable Count,Complex Word Count,Cleaned Word Count,Personal Pronouns,Average Word Length,Average Sentence Length,Percentage of Complex words,Fog Index
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"Telemedicine, technology diagnose treat patien...",68,19,0.563218,859,0.101281,79,10.873418,2399,446,834,0,8.009313,10.873418,0.519208,4.55705
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,"rise e-health, electronic means facilitate hea...",37,11,0.541667,281,0.170819,24,11.708333,780,159,259,0,8.291815,11.708333,0.565836,4.909668
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,Rise of e-health and its impact on humans by t...,2020 year world ravaged SarsCov2 virus. notori...,19,20,-0.025641,547,0.071298,67,8.164179,1311,226,492,0,7.559415,8.164179,0.413163,3.430937
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"“More gains quality, affordability accessibili...",32,24,0.142857,662,0.084592,59,11.220339,1623,291,602,0,7.60574,11.220339,0.439577,4.663966
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,Rise of telemedicine and its Impact on Livelih...,"“More gains quality, affordability accessibili...",32,24,0.142857,662,0.084592,59,11.220339,1623,291,602,0,7.60574,11.220339,0.439577,4.663966


In [101]:
# Drop 'Total Words' and 'Total Sentences' columns
df.drop(['Total Words', 'Total Sentences'], axis=1, inplace=True)

# Change column name 'Cleaned Word Count' to 'Word Count'
df.rename(columns={'Cleaned Word Count': 'Word Count'}, inplace=True)


In [102]:
df.drop(['Title', 'Article Text'], axis=1, inplace=True)

In [103]:
df.head()

Unnamed: 0,URL_ID,URL,Positive Score,Negative Score,Polarity Score,Subjectivity Score,Average Number of Words Per Sentence,Syllable Count,Complex Word Count,Word Count,Personal Pronouns,Average Word Length,Average Sentence Length,Percentage of Complex words,Fog Index
0,123.0,https://insights.blackcoffer.com/rise-of-telem...,68,19,0.563218,0.101281,10.873418,2399,446,834,0,8.009313,10.873418,0.519208,4.55705
1,321.0,https://insights.blackcoffer.com/rise-of-e-hea...,37,11,0.541667,0.170819,11.708333,780,159,259,0,8.291815,11.708333,0.565836,4.909668
2,2345.0,https://insights.blackcoffer.com/rise-of-e-hea...,19,20,-0.025641,0.071298,8.164179,1311,226,492,0,7.559415,8.164179,0.413163,3.430937
3,4321.0,https://insights.blackcoffer.com/rise-of-telem...,32,24,0.142857,0.084592,11.220339,1623,291,602,0,7.60574,11.220339,0.439577,4.663966
4,432.0,https://insights.blackcoffer.com/rise-of-telem...,32,24,0.142857,0.084592,11.220339,1623,291,602,0,7.60574,11.220339,0.439577,4.663966


In [None]:
df.to_csv('output2.csv', index=False)