In [1]:
!pip install nltk textblob
!python -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
Finished.


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read the input Excel file; the loops further down read its 'URL_ID' and
# 'URL' columns, one article per row.
input_file = 'Input.xlsx'
df = pd.read_excel(input_file)

# Function to extract text from a URL
def extract_article_text(url, timeout=30):
    """Download a web page and return its title and paragraph text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a single unresponsive host would block the
            whole extraction run indefinitely.

    Returns:
        A ``(title, article_text)`` tuple. ``title`` is the text of the
        page's ``<title>`` element, or ``''`` when the page has none.
        ``article_text`` is the text of every ``<p>`` element, joined with
        single spaces.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # NOTE: the HTTP status is not checked, so the HTML of an error page
    # (404, 500, ...) is parsed exactly like a successful response.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look the <title> element up once and reuse the result.
    title_tag = soup.find('title')
    title = title_tag.get_text() if title_tag is not None else ''

    article_text = ' '.join(para.get_text() for para in soup.find_all('p'))

    return title, article_text

# Fetch every article listed in the input sheet and store it as
# '<URL_ID>.txt': the page title on the first line, the paragraph text after it.
for row_idx, record in df.iterrows():
    url_id, url = record['URL_ID'], record['URL']
    title, article_text = extract_article_text(url)

    with open(f'{url_id}.txt', 'w', encoding='utf-8') as out_file:
        out_file.write("\n".join((title, article_text)))

print("Article extraction completed.")


Article extraction completed.


In [3]:
import nltk
# Fetch the opinion lexicon corpus; its positive/negative word lists are
# loaded later through nltk.corpus.opinion_lexicon.
nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.


True

In [7]:
# Fetch the punkt tokenizer package and the stop-word corpus used when the
# article text is tokenized and cleaned.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import opinion_lexicon, stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
import pandas as pd
import nltk
from textblob import TextBlob
import re

In [9]:
# Load the English stop-word list and the positive/negative word lists of the
# NLTK opinion lexicon. They are stored as sets because the analysis only
# performs membership tests against them.
stop_words = set(stopwords.words('english'))
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

In [10]:
# Helper function to count syllables in a word
def syllable_count(word):
    """Estimate the number of syllables in ``word`` with a vowel-group heuristic.

    Every run of consecutive vowels (``a e i o u y``) counts as one syllable.
    A trailing "e" that directly follows a consonant is treated as silent and
    is not counted. Any non-empty word has at least one syllable.

    Args:
        word: The word to analyse; letter case is ignored.

    Returns:
        The estimated syllable count as an ``int``; ``0`` for an empty string.
    """
    word = word.lower()
    if not word:
        # Nothing to count; also avoids indexing into an empty string.
        return 0

    vowels = "aeiouy"
    syllables = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a vowel run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            syllables += 1
        previous_is_vowel = is_vowel

    # Discount a silent final "e" only when it was counted above, i.e. when it
    # follows a consonant. After a vowel ("agree", "continue") the "e" is part
    # of the preceding vowel run and was never counted on its own.
    if len(word) > 1 and word.endswith("e") and word[-2] not in vowels:
        syllables -= 1

    return max(syllables, 1)

In [11]:
# Function to clean text and remove stop words
def clean_text(text):
    """Tokenize ``text`` and drop stop words and non-alphanumeric tokens.

    Args:
        text: Raw text to tokenize with ``nltk.word_tokenize``.

    Returns:
        A list of the remaining tokens in their original order and case.
        A token is kept when it is purely alphanumeric and its lower-cased
        form is not in ``stop_words``.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        # Punctuation and tokens with mixed symbols are discarded.
        if not token.isalnum():
            continue
        # Stop words are matched case-insensitively.
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [12]:
# Function to compute the variables
def compute_variables(text):
    """Compute sentiment and readability metrics for one article.

    Word-based metrics are calculated on the tokens returned by
    ``clean_text`` (stop words and non-alphanumeric tokens removed). The
    sentence count and the personal-pronoun count use the raw ``text``.

    Args:
        text: Full text of the article.

    Returns:
        A dict keyed by output column name:

        * ``POSITIVE SCORE`` / ``NEGATIVE SCORE``: number of cleaned words
          found in the positive / negative word list.
        * ``POLARITY SCORE``: (positive - negative) / (positive + negative).
        * ``SUBJECTIVITY SCORE``: (positive + negative) / word count.
        * ``AVG SENTENCE LENGTH`` and ``AVG NUMBER OF WORDS PER SENTENCE``:
          cleaned word count divided by sentence count (same value).
        * ``PERCENTAGE OF COMPLEX WORDS``: complex words / word count, as a
          fraction between 0 and 1 (not multiplied by 100).
        * ``FOG INDEX``: 0.4 * (avg sentence length + fraction of complex
          words).
        * ``COMPLEX WORD COUNT``: words with more than two syllables.
        * ``WORD COUNT``: number of cleaned words.
        * ``SYLLABLE PER WORD``: mean syllable count per cleaned word.
        * ``PERSONAL PRONOUNS``: occurrences of I, we, my, ours, us.
        * ``AVG WORD LENGTH``: mean character count per cleaned word.

        Ratios whose denominator would be zero are reported as 0.
    """
    # Clean text and tokenize
    cleaned_words = clean_text(text)
    num_words = len(cleaned_words)

    # Tokenize sentences
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)

    # Positive and Negative Scores
    # clean_text keeps the original letter case, while the lexicon lists its
    # words in lower case; compare lower-cased so capitalised words (e.g. at
    # the start of a sentence) are not missed.
    lowered_words = [word.lower() for word in cleaned_words]
    positive_score = sum(1 for word in lowered_words if word in positive_words)
    negative_score = sum(1 for word in lowered_words if word in negative_words)

    # Polarity and Subjectivity Scores
    # The small constant keeps the division defined when a count is zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (num_words + 0.000001)

    # Avg Sentence Length
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0

    # Syllable counts are needed for two metrics; compute them once per word.
    syllable_counts = [syllable_count(word) for word in cleaned_words]

    # Complex Word Count (more than two syllables) and its share of all words
    complex_word_count = sum(1 for count in syllable_counts if count > 2)
    percentage_complex_words = complex_word_count / num_words if num_words > 0 else 0

    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Avg Number of Words Per Sentence (same definition as avg sentence length)
    avg_number_of_words_per_sentence = avg_sentence_length

    # Syllables Per Word
    syllables_per_word = sum(syllable_counts) / num_words if num_words > 0 else 0

    # Personal Pronouns
    # The match is case-insensitive, so the all-caps abbreviation "US" would
    # be counted as the pronoun "us"; skip exactly that spelling.
    pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')

    # Avg Word Length
    avg_word_length = sum(len(word) for word in cleaned_words) / num_words if num_words > 0 else 0

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_number_of_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': num_words,
        'SYLLABLE PER WORD': syllables_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }

In [13]:
# Column order of the output sheet: the two identifier columns first, then
# the metrics under the same names that compute_variables uses as dict keys.
output_columns = [
    'URL_ID', 'URL', 'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
]

In [14]:
# Empty results frame that already carries the output columns in final order.
output_df = pd.DataFrame(columns=output_columns)

In [16]:
# Loop through the URLs and perform text analysis
# Records are collected in a list and turned into a DataFrame once at the
# end: concatenating inside the loop copies every earlier row on each
# iteration, and re-running the cell would keep appending duplicate rows.
records = []
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']

    # Read the article text saved earlier as '<URL_ID>.txt'
    with open(f'{url_id}.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Compute variables
    variables = compute_variables(text)

    records.append({'URL_ID': url_id, 'URL': url, **variables})

# Passing the column list fixes the column order and still yields a frame
# with the right headers when the input sheet has no rows.
output_df = pd.DataFrame(records, columns=output_columns)

In [17]:
# Save the output DataFrame to an Excel file; index=False keeps the
# DataFrame's row index out of the sheet so only the output columns are written.
output_file = 'Output Data Structure.xlsx'
output_df.to_excel(output_file, index=False)

print("Text analysis completed.")

Text analysis completed.
