In [69]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Download the NLTK resources this script relies on
import nltk
nltk.download('stopwords')
# word_tokenize / sent_tokenize load the Punkt sentence tokenizer models and
# raise LookupError when they are missing, so fetch them up front.
# Older NLTK releases look the models up as 'punkt', newer ones as
# 'punkt_tab'; requesting both covers either version (an id the installed
# index does not know is reported by the downloader, not raised).
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the input URLs and the custom stopword lists

# Load the URLs to scrape from an Excel sheet
excel_path = '/content/drive/MyDrive/bookmarks/Input.xlsx'  # Update this path
df = pd.read_excel(excel_path)

# Load stopwords: every list below is merged into one lookup set
file_paths = [
    '/content/drive/MyDrive/bookmarks/StopWords_Auditor.txt',
    '/content/drive/MyDrive/bookmarks/StopWords_DatesandNumbers.txt',
    '/content/drive/MyDrive/bookmarks/StopWords_GenericLong.txt',
    '/content/drive/MyDrive/bookmarks/StopWords_Geographic.txt',
    '/content/drive/MyDrive/bookmarks/StopWords_Names.txt',
    '/content/drive/MyDrive/bookmarks/StopWords_Currencies.txt'
]

all_words = []

for file_path in file_paths:
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        lines = file.read().splitlines()
    # clean_text() lowercases every token before the stopword lookup, so the
    # entries are lowercased here too; otherwise an upper-case entry in a
    # list file could never match.
    # NOTE(review): some stopword lists appear to annotate entries as
    # "WORD | comment"; only the text before the first pipe is kept.
    # Confirm against the actual files.
    words = [line.split('|')[0].strip().lower() for line in lines]
    # Blank lines would otherwise put '' into the stopword set.
    words = [word for word in words if word]
    all_words.extend(words)
stopwords_df = pd.DataFrame(all_words, columns=['stopword'])
# Convert stopwords dataframe to a set for efficient lookup
stopwords_set = set(stopwords_df['stopword'])

# Sentiment word counters
def calculate_positive_score(tokens):
    """Return how many entries of *tokens* are in ``positive_dict``.

    Each occurrence is counted, so a positive word that appears twice adds
    two to the score. ``positive_dict`` is looked up in module scope when the
    function is called.
    """
    matches = [token for token in tokens if token in positive_dict]
    return len(matches)

def calculate_negative_score(tokens):
    """Return how many entries of *tokens* are in ``negative_dict``.

    Each occurrence is counted, so a negative word that appears twice adds
    two to the score. ``negative_dict`` is looked up in module scope when the
    function is called.
    """
    matches = [token for token in tokens if token in negative_dict]
    return len(matches)
urls = df['URL']
# Create an empty list to store data from all URLs
all_data = []

for url in urls:
    # A timeout keeps one unresponsive server from stalling the whole run;
    # without it requests waits indefinitely.
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    title_tags = soup.find_all("title")

    # Reset the title for every URL: a page without a <title> tag would
    # otherwise reuse the previous page's title (or raise NameError on the
    # first iteration).
    main_title = ''
    if title_tags:
        full_title = title_tags[0].get_text()
        # Keep only the part of the title before the first '|'.
        main_title = full_title.split('|')[0].strip()
        print(main_title)

    # Collect the text of every element carrying the article-body class and
    # flatten all newlines to double spaces so each article is one line.
    s1 = soup.find_all(attrs={'class': 'td-post-content tagdiv-type'})
    article_text = "\n".join([item.get_text() for item in s1]).replace('\n', "  ")

    # Append the scraped data to the all_data list
    all_data.append({'URL': url,'main_title': main_title, 'article_text': article_text})

# Create a DataFrame from the accumulated data. Naming the columns keeps them
# present even when no URL was processed, so later column lookups still work.
combined_df = pd.DataFrame(all_data, columns=['URL', 'main_title', 'article_text'])

# Text cleaning and scoring of the scraped articles held in combined_df


# Clean the text data
def clean_text(text):
    """Tokenize *text*, lowercase it, and drop punctuation and stopwords.

    Args:
        text: Raw article text. Non-string values are accepted.

    Returns:
        The surviving lowercase tokens joined by single spaces, or an empty
        string when *text* is not a string.
    """
    if not isinstance(text, str):
        return ''  # Return an empty string for non-string values

    kept = []
    for word in word_tokenize(text):
        # Drop tokens that contain no letter or digit. The previous check,
        # `word not in string.punctuation`, was a substring test against one
        # string, so multi-character punctuation tokens (such as the `` and
        # '' quote tokens NLTK emits, or "...") and non-ASCII punctuation
        # were kept and counted as words.
        if not any(ch.isalnum() for ch in word):
            continue
        lowered = word.lower()
        if lowered not in stopwords_set:
            kept.append(lowered)
    return ' '.join(kept)

# Clean every article once; the same statement used to run twice, repeating
# the tokenization of every article for an identical result.
combined_df['cleaned_text'] = combined_df['article_text'].apply(clean_text)

# Read the positive and negative word lists into lookup sets
positive_file_path = '/content/drive/MyDrive/bookmarks/positive-words.txt'
negative_file_path = '/content/drive/MyDrive/bookmarks/negative-words.txt'

# Each line of a file becomes one set entry; both files are decoded as
# ISO-8859-1.
with open(positive_file_path, 'r', encoding='ISO-8859-1') as file:
    positive_words = file.read().splitlines()
positive_dict = set(positive_words)

with open(negative_file_path, 'r', encoding='ISO-8859-1') as file:
    negative_words = file.read().splitlines()
negative_dict = set(negative_words)

# Per-article sentiment counts, computed on the re-tokenized cleaned text
combined_df['tokenized_text'] = combined_df['cleaned_text'].apply(word_tokenize)
combined_df['positive_score'] = combined_df['tokenized_text'].apply(calculate_positive_score)
combined_df['negative_score'] = combined_df['tokenized_text'].apply(calculate_negative_score)

# Polarity = (positive - negative) / (positive + negative); the small constant
# in the denominator avoids division by zero when no sentiment word is found.
combined_df['polarity_score'] = (
    combined_df['positive_score'].sub(combined_df['negative_score'])
    .div(combined_df['positive_score'].add(combined_df['negative_score']).add(0.000001))
)

# Whitespace-separated word count of the cleaned text
combined_df['total_words'] = combined_df['cleaned_text'].map(lambda text: len(text.split()))

# Subjectivity = sentiment words / total words, with the same zero guard
combined_df['subjectivity_score'] = (
    combined_df['positive_score'].add(combined_df['negative_score'])
    .div(combined_df['total_words'].add(0.000001))
)



# Readability metrics and export.
# NOTE(review): calculate_complex_words, calculate_word_count,
# calculate_average_word_length, calculate_syllable_count and
# calculate_personal_pronouns are not defined in this cell. Presumably they
# come from an earlier cell; confirm before running on a fresh kernel.

# Calculate complex words count
combined_df['complex_words'] = combined_df['tokenized_text'].apply(calculate_complex_words)

# Calculate the percentage of complex words
combined_df['percentage_complex_words'] = (combined_df['complex_words'] / combined_df['total_words']) * 100

# Calculate average sentence length.
# Sentences are split from the raw article text: clean_text() removes '.',
# '?' and '!' tokens, so splitting cleaned_text would lose the sentence
# boundaries and inflate the average. An article with no text has zero
# sentences and yields NaN here.
combined_df['sentences'] = combined_df['article_text'].apply(sent_tokenize)
combined_df['average_sentence_length'] = combined_df['tokenized_text'].apply(len) / combined_df['sentences'].apply(len)

# Calculate Fog Index
combined_df['fog_index'] = 0.4 * (combined_df['average_sentence_length'] + combined_df['percentage_complex_words'])

# Calculate word count.
# Two stray statements were dropped here: they filtered the leftover `words`
# variable from the stopword-loading loop against an undefined name
# `punctuation`, and their result was never used.
combined_df['word_count'] = combined_df['cleaned_text'].apply(calculate_word_count)

# Calculate average number of words per sentence (same ratio as
# average_sentence_length, so reuse it instead of recomputing)
combined_df['avg_words_per_sentence'] = combined_df['average_sentence_length']

# Calculate average word length
combined_df['average_word_length'] = combined_df['cleaned_text'].apply(calculate_average_word_length)

# Calculate syllable count per word
combined_df['syllable_count_per_word'] = combined_df['cleaned_text'].apply(calculate_syllable_count)

# Calculate personal pronoun counts
combined_df['personal_pronouns'] = combined_df['cleaned_text'].apply(calculate_personal_pronouns)

# Define the desired column order; columns not listed are dropped from the export
column_order = [
    'URL',
    'positive_score',
    'negative_score',
    'polarity_score',
    'subjectivity_score',
    'average_sentence_length',
    'percentage_complex_words',
    'fog_index',
    'word_count',
    'avg_words_per_sentence',
    'complex_words',
    'syllable_count_per_word',
    'personal_pronouns',
    'average_word_length',
]

# Reorder the columns in the DataFrame
combined_df = combined_df[column_order]

# Save the results to a CSV file
combined_df.to_csv("result_data.csv", index=False)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Rise of telemedicine and its Impact on Livelihood by 2040
Rise of e-health and its impact on humans by the year 2030
Rise of e-health and its impact on humans by the year 2030
Rise of telemedicine and its Impact on Livelihood by 2040
Rise of telemedicine and its Impact on Livelihood by 2040
Rise of Chatbots and its impact on customer support by the year 2040
Rise of e-health and its impact on humans by the year 2030
How does marketing influence businesses and consumers?
How advertisement increase your market value?
Negative effects of marketing on society
How advertisement/marketing affects business.
Rising IT cities will impact the economy, environment, infrastructure, and city life by the year 2035
Rise of OTT platform and its impact on entertainment industry by the year 2030
Rise of Electric Vehicles and its Impact on Livelihood by 2040
Rise of electric vehicle and its impact on livelihood by the year 2040.
Oil prices by the year 2040, and how it will impact the world economy.
An ou