In [1]:
# !pip install beautifulsoup4

In [2]:
# !pip install openpyxl

In [3]:
# !pip install nltk

In [4]:
# !pip install textstat

In [5]:
# nltk.downloader popular
# nltk.download('popular')

In [6]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests
from textblob import TextBlob
import textstat
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load input data
input_data = pd.read_excel('Input.xlsx')

# Create a directory for saving text files
if not os.path.exists('article_texts'):
    os.makedirs('article_texts')

# Extract article text from URLs
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    article_tag = soup.find('article')
    
    if article_tag:
        article_text = article_tag.text
        with open(f'article_texts/{url_id}.txt', 'w', encoding='utf-8') as f:
            f.write(article_text)
    else:
        print(f"Article content not found for URL_ID: {url_id}")
        
# Initialize a list to store results
results = []

# Iterate through extracted texts and perform analysis
for index, row in input_data.iterrows():
    url_id = row['URL_ID']
    
    with open(f'article_texts/{url_id}.txt', 'r', encoding='utf-8') as f:
        article_text = f.read()
    
    blob = TextBlob(article_text)
    
    # TextBlob sentiment analysis
    sentiment = blob.sentiment
    positive_score = sentiment.polarity
    negative_score = -sentiment.polarity
    polarity_score = sentiment.subjectivity
    subjectivity_score = sentiment.subjectivity
    
    # Text statistics analysis
    avg_sentence_length = textstat.lexicon_count(article_text) / len(blob.sentences)
    complex_words = textstat.difficult_words(article_text)
    word_count = len(blob.words)
    avg_word_length = sum(len(word) for word in blob.words) / word_count
    
    # Append the computed variables to the results list
    results.append([
        url_id,
        positive_score,
        negative_score,
        polarity_score,
        subjectivity_score,
        avg_sentence_length,
        complex_words / word_count * 100,
        textstat.gunning_fog(article_text),
        textstat.avg_sentence_length(article_text),
        complex_words,
        word_count,
        textstat.syllable_count(article_text) / word_count,
        blob.words.count('I') + blob.words.count('me') + blob.words.count('my'),
        avg_word_length
    ])

# Create a DataFrame from the results
output_columns = ['URL_ID', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
                  'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
                  'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH']

output_data = pd.DataFrame(results, columns=output_columns)

# Save the output DataFrame to an Excel file
output_data.to_excel('Output_Data_Structure.xlsx', index=False)


[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [WinError 10060] A connection attempt failed because
[nltk_data]     the connected party did not properly respond after a
[nltk_data]     period of time, or established connection failed
[nltk_data]     because connected host has failed to respond>


Article content not found for URL_ID: 44
Article content not found for URL_ID: 57
Article content not found for URL_ID: 144

**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\jayac/nltk_data'
    - 'C:\\Users\\jayac\\AppData\\Local\\anaconda3\\nltk_data'
    - 'C:\\Users\\jayac\\AppData\\Local\\anaconda3\\share\\nltk_data'
    - 'C:\\Users\\jayac\\AppData\\Local\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\jayac\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************



MissingCorpusError: 
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.


In [7]:
# scrapper.py

import os
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Read input data
input_df = pd.read_excel('input.xlsx')

# Create a directory for saving text files
if not os.path.exists('output'):
    os.makedirs('output')

# Scrape article text from URLs
for index, row in input_df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract title and article content
    title = soup.title.text.strip()
    article_content = soup.find('article').text.strip()
    
    # Save article content to a text file
    with open(f'output/{url_id}.txt', 'w', encoding='utf-8') as f:
        f.write(f'{title}\n\n{article_content}')


AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
# text_analysis.py

import os
import pandas as pd
import nltk
from textblob import TextBlob
from textstat import flesch_reading_ease

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Read input data
input_df = pd.read_excel('Input.xlsx')

# Initialize variables for storing results
results = []

# Text analysis
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    with open(f'output/{url_id}.txt', 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Perform text analysis
    blob = TextBlob(content)
    num_words = len(blob.words)
    num_sentences = len(blob.sentences)
    num_syllables = textstat.syllable_count(content)
    # ... other calculations
    
    # Append results to the list
    results.append({
        'URL_ID': url_id,
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        # ... other calculated variables
    })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Save results to Output Data Structure.xlsx
results_df.to_excel('Output Data Structure.xlsx', index=False)


[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
