In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
from textblob import TextBlob
import syllapy


In [2]:
# Step 1: Extracting Article Texts
def extract_article_text(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a single stalled server would hang the whole run.

    Returns
    -------
    tuple[str, str]
        ``(title, article_text)``. ``title`` is the text of the page's
        ``<title>`` tag (empty string when the tag is absent) and
        ``article_text`` is the text of every ``<p>`` inside the article
        container, joined with newlines.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    ValueError
        If the page has neither an ``<article>`` element nor an element with
        class ``article-content``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx responses instead of storing an error page as an article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # A page without a <title> tag gives an empty title, not an AttributeError.
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag is not None else ''

    # Prefer the semantic <article> element, then fall back to a class lookup.
    article = soup.find('article')
    if article is None:
        article = soup.find(attrs={"class": "article-content"})  # Example class, change as per actual site structure
    if article is None:
        # Name the real problem instead of failing later on None.find_all().
        raise ValueError(f"No article container found at {url}")

    article_text = '\n'.join(p.get_text() for p in article.find_all('p'))
    return title, article_text

In [3]:
# Read the input Excel file.
# The extraction loop below reads the 'URL_ID' and 'URL' columns of every row.
input_df = pd.read_excel('Input.xlsx')


In [4]:
# Create the directory that will hold one text file per scraped article.
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs('articles', exist_ok=True)

In [5]:
# Download every article listed in the input sheet and store it as
# articles/<URL_ID>.txt (title, blank line, body text). A failure on one URL
# is reported and the loop carries on with the next row.
for url_id, url in zip(input_df['URL_ID'], input_df['URL']):
    try:
        title, text = extract_article_text(url)
        with open(f'articles/{url_id}.txt', 'w', encoding='utf-8') as out_file:
            out_file.write("\n\n".join((title, text)))
        print(f"Article {url_id} extracted successfully.")
    except Exception as err:
        print(f"Failed to extract article {url_id}: {err}")

Article blackassign0001 extracted successfully.
Article blackassign0002 extracted successfully.
Article blackassign0003 extracted successfully.
Article blackassign0004 extracted successfully.
Article blackassign0005 extracted successfully.
Article blackassign0006 extracted successfully.
Article blackassign0007 extracted successfully.
Article blackassign0008 extracted successfully.
Article blackassign0009 extracted successfully.
Article blackassign0010 extracted successfully.
Article blackassign0011 extracted successfully.
Article blackassign0012 extracted successfully.
Article blackassign0013 extracted successfully.
Article blackassign0014 extracted successfully.
Article blackassign0015 extracted successfully.
Article blackassign0016 extracted successfully.
Article blackassign0017 extracted successfully.
Article blackassign0018 extracted successfully.
Article blackassign0019 extracted successfully.
Article blackassign0020 extracted successfully.
Article blackassign0021 extracted succes

In [6]:
# Step 2: Performing Text Analysis
# Load positive and negative words
def _load_word_set(path):
    """Return the whitespace-separated tokens of the file at ``path`` as a set.

    Tokens are lowercased because the analysis looks words up with
    ``word.lower()``; an entry containing an uppercase letter could otherwise
    never match. Bytes that are not valid UTF-8 are dropped (errors='ignore')
    so that a stray byte in either list does not abort the load.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as file:
        return {token.lower() for token in file.read().split()}


# Both lists go through the same loader so they are decoded and normalized alike.
positive_words = _load_word_set('positive-words.txt')
negative_words = _load_word_set('negative-words.txt')

In [7]:
# Matches the pronouns I / we / my / ours / us as whole words, in any letter case.
_PERSONAL_PRONOUN_RE = re.compile(r'\b(I|we|my|ours|us)\b', re.I)


def _count_personal_pronouns(text):
    """Count personal pronouns in ``text``, skipping the all-caps token ``US``."""
    # The case-insensitive pattern also hits "US"; written in capitals it is
    # far more likely the country abbreviation than the pronoun "us".
    return sum(
        1 for match in _PERSONAL_PRONOUN_RE.finditer(text)
        if match.group(0) != 'US'
    )


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def analyze_text(text):
    """Compute sentiment and readability metrics for one article.

    Parameters
    ----------
    text : str
        Full article text (title and body).

    Returns
    -------
    dict
        Maps each metric name to its value:

        * ``POSITIVE SCORE`` / ``NEGATIVE SCORE`` - number of words found in
          the module-level ``positive_words`` / ``negative_words`` sets.
        * ``POLARITY SCORE`` - (positive - negative) / (positive + negative).
        * ``SUBJECTIVITY SCORE`` - TextBlob's subjectivity for the whole text.
        * ``AVG SENTENCE LENGTH`` and ``AVG NUMBER OF WORDS PER SENTENCE`` -
          both hold words / sentences.
        * ``COMPLEX WORD COUNT`` - words with more than two syllables.
        * ``PERCENTAGE OF COMPLEX WORDS`` - complex words / words. This is a
          fraction between 0 and 1; it is not multiplied by 100.
        * ``FOG INDEX`` - 0.4 * (average sentence length + that fraction).
        * ``WORD COUNT``, ``SYLLABLE PER WORD``, ``PERSONAL PRONOUNS``,
          ``AVG WORD LENGTH``.

        For text without any words or sentences every ratio is 0.0 instead of
        raising ZeroDivisionError.
    """
    blob = TextBlob(text)
    words = blob.words

    word_count = len(words)
    sentence_count = len(blob.sentences)

    # Lowercase once and reuse the result for both dictionary lookups.
    lowered_words = [word.lower() for word in words]
    positive_score = sum(1 for word in lowered_words if word in positive_words)
    negative_score = sum(1 for word in lowered_words if word in negative_words)

    # The small constant keeps the division defined when neither list matches.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = blob.sentiment.subjectivity

    avg_sentence_length = _ratio(word_count, sentence_count)

    # Count syllables once per word; both the complex-word count and the
    # syllables-per-word average are derived from the same numbers.
    syllable_counts = [syllapy.count(word) for word in words]
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    percentage_complex_words = _ratio(complex_word_count, word_count)

    # NOTE(review): the complex-word share enters as a 0..1 fraction, so it
    # barely moves the index; the textbook Gunning fog formula uses a 0..100
    # percentage. Left unchanged - confirm which scale is intended.
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    syllables_per_word = _ratio(sum(syllable_counts), word_count)

    personal_pronouns = _count_personal_pronouns(text)

    avg_word_length = _ratio(sum(len(word) for word in words), word_count)

    return {
        "POSITIVE SCORE": positive_score,
        "NEGATIVE SCORE": negative_score,
        "POLARITY SCORE": polarity_score,
        "SUBJECTIVITY SCORE": subjectivity_score,
        "AVG SENTENCE LENGTH": avg_sentence_length,
        "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words,
        "FOG INDEX": fog_index,
        "AVG NUMBER OF WORDS PER SENTENCE": avg_sentence_length,
        "COMPLEX WORD COUNT": complex_word_count,
        "WORD COUNT": word_count,
        "SYLLABLE PER WORD": syllables_per_word,
        "PERSONAL PRONOUNS": personal_pronouns,
        "AVG WORD LENGTH": avg_word_length
    }



In [8]:
# Step 3: Saving the Results
# Load the two workbooks used by Step 3; the reads are independent of each other.
#   output_structure - its column headers define the layout of the result table
#   input_df         - the URL list, read again here (it was also loaded for Step 1)
output_structure = pd.read_excel(io='Output Data Structure.xlsx')
input_df = pd.read_excel(io='Input.xlsx')

In [9]:
# Initialize the output DataFrame: an empty frame with the template's columns.
# NOTE(review): output_df is reassigned from the results list in a later cell
# before anything reads this empty frame, so this cell looks redundant.
output_df = pd.DataFrame(columns=output_structure.columns)


In [10]:
# Accumulator for the per-article analysis: the loop in the next cell appends
# one dict per article, and the list is turned into the output DataFrame afterwards.
results = []  # To store each row of the results


In [11]:
# Analyze every saved article and collect one result row per URL_ID.
# Start from an empty list so that re-running this cell does not append
# duplicate rows to results left over from an earlier run.
results = []
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    try:
        with open(f'articles/{url_id}.txt', 'r', encoding='utf-8') as file:
            text = file.read()
        # The file is already closed here; the analysis only needs the text.
        analysis_results = analyze_text(text)
        # Merge in the input sheet's columns so each row carries its own URL_ID.
        analysis_results.update(row.to_dict())
        results.append(analysis_results)
        print(f"Analysis for article {url_id} completed.")
    except Exception as e:
        # Best effort: a missing or unreadable article is reported and skipped.
        print(f"Failed to analyze article {url_id}: {e}")

Analysis for article blackassign0001 completed.
Analysis for article blackassign0002 completed.
Analysis for article blackassign0003 completed.
Analysis for article blackassign0004 completed.
Analysis for article blackassign0005 completed.
Analysis for article blackassign0006 completed.
Analysis for article blackassign0007 completed.
Analysis for article blackassign0008 completed.
Analysis for article blackassign0009 completed.
Analysis for article blackassign0010 completed.
Analysis for article blackassign0011 completed.
Analysis for article blackassign0012 completed.
Analysis for article blackassign0013 completed.
Analysis for article blackassign0014 completed.
Analysis for article blackassign0015 completed.
Analysis for article blackassign0016 completed.
Analysis for article blackassign0017 completed.
Analysis for article blackassign0018 completed.
Analysis for article blackassign0019 completed.
Analysis for article blackassign0020 completed.
Analysis for article blackassign0021 com

In [12]:
# Create a DataFrame from the results list.
# columns= keeps only the columns named in the output template, in the
# template's order: result keys that are not listed there are left out, and
# template columns without a matching key are filled with NaN.
output_df = pd.DataFrame(results, columns=output_structure.columns)


In [17]:
# Ensure there are no issues with data: print the first rows (at most 50) of
# the result table for a visual check.
print(output_df.head(50))


             URL_ID                                                URL  \
0   blackassign0001  https://insights.blackcoffer.com/rising-it-cit...   
1   blackassign0002  https://insights.blackcoffer.com/rising-it-cit...   
2   blackassign0003  https://insights.blackcoffer.com/internet-dema...   
3   blackassign0004  https://insights.blackcoffer.com/rise-of-cyber...   
4   blackassign0005  https://insights.blackcoffer.com/ott-platform-...   
5   blackassign0006  https://insights.blackcoffer.com/the-rise-of-t...   
6   blackassign0007  https://insights.blackcoffer.com/rise-of-cyber...   
7   blackassign0008  https://insights.blackcoffer.com/rise-of-inter...   
8   blackassign0009  https://insights.blackcoffer.com/rise-of-cyber...   
9   blackassign0010  https://insights.blackcoffer.com/rise-of-cyber...   
10  blackassign0011  https://insights.blackcoffer.com/rise-of-inter...   
11  blackassign0012  https://insights.blackcoffer.com/rise-of-telem...   
12  blackassign0013  https://insights.

In [18]:
# Print the last rows (at most 50) of the result table as a second visual check.
print(output_df.tail(50))


             URL_ID                                                URL  \
48  blackassign0051  https://insights.blackcoffer.com/how-data-anal...   
49  blackassign0052  https://insights.blackcoffer.com/difference-be...   
50  blackassign0053  https://insights.blackcoffer.com/how-python-be...   
51  blackassign0054  https://insights.blackcoffer.com/how-google-fi...   
52  blackassign0055  https://insights.blackcoffer.com/what-is-the-f...   
53  blackassign0056  https://insights.blackcoffer.com/impact-of-ai-...   
54  blackassign0057  https://insights.blackcoffer.com/telemedicine-...   
55  blackassign0058  https://insights.blackcoffer.com/how-we-foreca...   
56  blackassign0059  https://insights.blackcoffer.com/can-robots-ta...   
57  blackassign0060  https://insights.blackcoffer.com/embedding-car...   
58  blackassign0061  https://insights.blackcoffer.com/management-ch...   
59  blackassign0062  https://insights.blackcoffer.com/are-we-any-cl...   
60  blackassign0063  https://insights.