In [1]:
import pandas as pd

# Load the list of articles to process from the spreadsheet in the working
# directory; the rendered frame below shows URL_ID and URL columns.
input_data = pd.read_excel(r"Input.xlsx")
# Bare last expression so the notebook renders the frame as the cell output.
input_data

Unnamed: 0,URL_ID,URL
0,blackassign0001,https://insights.blackcoffer.com/rising-it-cit...
1,blackassign0002,https://insights.blackcoffer.com/rising-it-cit...
2,blackassign0003,https://insights.blackcoffer.com/internet-dema...
3,blackassign0004,https://insights.blackcoffer.com/rise-of-cyber...
4,blackassign0005,https://insights.blackcoffer.com/ott-platform-...
...,...,...
93,blackassign0096,https://insights.blackcoffer.com/what-is-the-r...
94,blackassign0097,https://insights.blackcoffer.com/impact-of-cov...
95,blackassign0098,https://insights.blackcoffer.com/contribution-...
96,blackassign0099,https://insights.blackcoffer.com/how-covid-19-...


In [2]:
from bs4 import BeautifulSoup
import requests
import re

def extract_article_text(html):
    """Return the readable text of an HTML page as one whitespace-normalized string.

    ``<script>`` and ``<style>`` elements are removed before extraction so that
    inline code and CSS do not end up in the result.

    Args:
        html: Raw HTML markup of the page (e.g. ``response.text``).

    Returns:
        The text of the ``<body>`` element with every run of whitespace
        collapsed to a single space. When the markup has no ``<body>`` (an HTML
        fragment or an empty response) the text of the whole document is used
        instead, which yields ``''`` for empty input.
    """
    soup = BeautifulSoup(html or '', 'html.parser')

    # Drop elements whose contents are code/CSS rather than readable text.
    for tag in soup(['script', 'style']):
        tag.decompose()

    # soup.body is None when the markup contains no <body> tag; fall back to
    # the document root instead of raising AttributeError.
    root = soup.body if soup.body is not None else soup
    body_text = root.get_text(separator=' ', strip=True)

    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', body_text).strip()

# Download every article listed in input_data and save its text to
# "<URL_ID>.txt" in the working directory. Failures are reported and skipped
# so that one bad URL does not stop the rest of the crawl.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout, requests can wait forever

for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        # Connection errors, timeouts, invalid URLs, ... : report and move on.
        print(f"Failed to retrieve the page for URL_ID {url_id}. Error: {error}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the page for URL_ID {url_id}. Status code: {response.status_code}")
        continue

    article_text = extract_article_text(response.text)
    if not article_text:
        print(f"No article text found for URL_ID {url_id}")
        continue

    # Save to text file
    with open(f"{url_id}.txt", 'w', encoding='utf-8') as file:
        file.write(article_text)


In [3]:
pip install nltk



In [4]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [5]:
pip install syllables

Collecting syllables
  Downloading syllables-1.0.9-py3-none-any.whl (15 kB)
Collecting cmudict<2.0.0,>=1.0.11 (from syllables)
  Downloading cmudict-1.0.15-py3-none-any.whl (939 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 939.4/939.4 kB 10.8 MB/s eta 0:00:00
Collecting importlib-metadata<7.0,>=5.1 (from syllables)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl (23 kB)
Collecting importlib-resources<6.0.0,>=5.10.1 (from cmudict<2.0.0,>=1.0.11->syllables)
  Downloading importlib_resources-5.13.0-py3-none-any.whl (32 kB)
Installing collected packages: importlib-resources, importlib-metadata, cmudict, syllables
  Attempting uninstall: importlib-resources
    Found existing installation: importlib-resources 6.1.1
    Uninstalling importlib-resources-6.1.1:
      Successfully uninstalled importlib-resources-6.1.1
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 7.0.0
    Uninstalling i

In [6]:
import pandas as pd
import nltk.data
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import syllables


# Fetch the NLTK data packages this notebook asks for, in the original order.
for resource_name in ('popular', 'punkt', 'stopwords'):
    nltk.download(resource_name)

# Lowercase personal pronouns counted by analyze_text (a set gives O(1) lookups).
_PERSONAL_PRONOUNS = frozenset({
    'i', 'me', 'my', 'mine', 'myself',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'we', 'us', 'our', 'ours', 'ourselves',
    'they', 'them', 'their', 'theirs', 'themselves',
})


# Function for text analysis
def analyze_text(text):
    """Compute readability and sentiment metrics for one article.

    Args:
        text: The article text as a single string.

    Returns:
        A dict with the keys 'WORD COUNT', 'AVG SENTENCE LENGTH',
        'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'POSITIVE SCORE',
        'NEGATIVE SCORE', 'POLARITY SCORE', 'SYLLABLE PER WORD',
        'PERSONAL PRONOUNS' and 'AVG WORD LENGTH'. Every word-based metric
        except 'PERSONAL PRONOUNS' is computed on the lowercase alphabetic
        tokens that remain after English stop words are removed; ratio metrics
        are 0 when there is nothing to divide by.
    """
    # Tokenize the text into words and sentences
    tokens = word_tokenize(text)
    sentences = sent_tokenize(text)

    # Keep purely alphabetic tokens only (drops punctuation and numbers).
    alpha_tokens = [token.lower() for token in tokens if token.isalpha()]

    # Personal pronouns must be counted BEFORE stop-word removal: most of them
    # are NLTK English stop words, so counting afterwards misses nearly all.
    # NOTE(review): lowercasing means the token "US" is counted as "us".
    personal_pronouns = sum(1 for token in alpha_tokens if token in _PERSONAL_PRONOUNS)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [token for token in alpha_tokens if token not in stop_words]

    # Estimate syllables once per word and reuse for both syllable metrics.
    syllable_counts = [syllables.estimate(word) for word in words]

    # Calculate metrics
    word_count = len(words)
    sentence_count = len(sentences)
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0

    # A word is treated as complex when it has more than two syllables.
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    percentage_of_complex_words = complex_word_count / word_count if word_count > 0 else 0

    # NOTE(review): percentage_of_complex_words is a 0-1 fraction here, not a
    # 0-100 percentage, so it adds at most 0.4 to the index - confirm that this
    # is the intended definition before changing it.
    fog_index = 0.4 * (avg_sentence_length + percentage_of_complex_words)

    # Sentiment scores from VADER (vaderSentiment package), run on the raw text
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)
    positive_score = sentiment_scores['pos']
    negative_score = sentiment_scores['neg']
    polarity_score = sentiment_scores['compound']

    # Syllables per word, using the 'syllables' library estimates
    syllables_per_word = sum(syllable_counts) / word_count if word_count > 0 else 0

    # Average word length
    avg_word_length = sum(len(word) for word in words) / word_count if word_count > 0 else 0

    # Return the analysis results as a dictionary
    return {
        'WORD COUNT': word_count,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_of_complex_words,
        'FOG INDEX': fog_index,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length,
    }

# Read input data
input_data = pd.read_excel(r"Input.xlsx")

# Collect one record per article; the DataFrame is built once after the loop.
output_data = []

# Loop through each row in the input data
for index, row in input_data.iterrows():
    url_id = row['URL_ID']

    # The scraping step only writes "<URL_ID>.txt" for pages it fetched
    # successfully, so a missing file is expected and must not abort the run.
    try:
        with open(f"{url_id}.txt", 'r', encoding='utf-8') as file:
            article_text = file.read()
    except FileNotFoundError:
        print(f"No saved article text for URL_ID {url_id}; metrics left blank")
        # Keep a row for this URL_ID so the output still has one row per input.
        output_data.append({'URL_ID': url_id})
        continue

    # URL_ID first, followed by every metric in the order analyze_text returns it.
    output_data.append({'URL_ID': url_id, **analyze_text(article_text)})

# Save the output data to Excel
output_df = pd.DataFrame(output_data)
output_df.to_excel('Output_Data.xlsx', index=False)


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt