In [1]:
import pandas as pd
import os

In [17]:
# Load the input spreadsheet; later cells read its 'URL_ID' and 'URL' columns.
data = pd.read_excel('Input.xlsx')

In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
# Function to extract article text from a URL
def extract_article_text(url, timeout=30):
    """Download ``url`` and return the text of its article body.

    The page is fetched once and searched for the known article containers
    in order. The ``<p>`` elements of the first container found are stripped
    and joined with newlines.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The article text as a string, or ``None`` when no known container is
        present or when any error occurs while fetching or parsing. The
        failure value is always ``None`` (never a tuple) so callers can rely
        on a simple truthiness check.
    """
    # ``class`` attribute values of the <div> elements that may wrap the
    # article, tried in this order.
    container_classes = (
        'td-post-content tagdiv-type',
        'tdb-block-inner td-fix-index',
    )

    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        for container_class in container_classes:
            article_div = soup.find('div', class_=container_class)
            if article_div is not None:
                paragraphs = article_div.find_all('p')
                return '\n'.join(paragraph.text.strip() for paragraph in paragraphs)

        print(f"No article content found at {url}")
        return None
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller move on
        # to the next URL instead of aborting the whole run.
        print(f"Error occurred while extracting data from {url}: {e}")
        return None

In [5]:
# Directory that receives one text file per extracted article.
output_dir = 'extracted_articles'
# exist_ok=True creates the directory only when it is missing, without the
# separate "check, then create" step that can race with another process.
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Download every article listed in the spreadsheet and store it as
# "<URL_ID>.txt" inside output_dir.
for url_id, url in zip(data['URL_ID'], data['URL']):
    # Extract article text from the URL
    article_text = extract_article_text(url)

    # Only a non-empty string is a successful extraction. A failed extraction
    # can come back as a non-string value, which must not reach file.write().
    if not isinstance(article_text, str) or not article_text:
        continue

    # Save the extracted article text in a text file
    file_name = os.path.join(output_dir, f"{url_id}.txt")
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(article_text)
    print(f"Article extracted and saved for URL_ID {url_id}")

Article extracted and saved for URL_ID blackassign0001
Article extracted and saved for URL_ID blackassign0002
Article extracted and saved for URL_ID blackassign0003
Article extracted and saved for URL_ID blackassign0004
Article extracted and saved for URL_ID blackassign0005
Article extracted and saved for URL_ID blackassign0006
Article extracted and saved for URL_ID blackassign0007
Article extracted and saved for URL_ID blackassign0008
Article extracted and saved for URL_ID blackassign0009
Article extracted and saved for URL_ID blackassign0010
Article extracted and saved for URL_ID blackassign0011
Article extracted and saved for URL_ID blackassign0012
Article extracted and saved for URL_ID blackassign0013
Article extracted and saved for URL_ID blackassign0015
Article extracted and saved for URL_ID blackassign0016
Article extracted and saved for URL_ID blackassign0017
Article extracted and saved for URL_ID blackassign0018
Article extracted and saved for URL_ID blackassign0019
Article ex

In [20]:
# Text of every saved article, one list entry per ".txt" file in output_dir.
article_data = []

# os.listdir() returns names in arbitrary order; sorting makes the order of
# article_data reproducible from run to run.
for file_name in sorted(os.listdir(output_dir)):
    if not file_name.endswith('.txt'):
        continue
    # Read the content of the file
    with open(os.path.join(output_dir, file_name), 'r', encoding='utf-8') as file:
        article_data.append(file.read())

In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from pyphen import Pyphen

In [8]:
# Hyphenation dictionary; its hyphenation points are used later to
# approximate syllable counts.
p = Pyphen(lang='en_US')

# Fetch the NLTK resources this notebook needs. word_tokenize / sent_tokenize
# rely on the Punkt tokenizer models, which are packaged as 'punkt' in older
# NLTK releases and 'punkt_tab' in newer ones; without them tokenizing raises
# LookupError on a machine that has never downloaded them.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words used to filter tokens and lexicon entries.
stopWords = stopwords.words('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mridu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mridu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Load the positive-word lexicon (one word per line).
# NOTE(review): the file is opened with the platform default encoding, as
# before; confirm the lexicon's actual encoding and pass it explicitly.
with open("MasterDictionary//positive-words.txt") as pos:
    # Strip the line ending *before* the stop-word check: the raw line still
    # carries '\n', so comparing it directly can never match a stop word.
    # Entries are lower-cased because they are matched against lower-cased
    # tokens; blank lines are dropped.
    positive_words = [
        word
        for word in (line.strip().lower() for line in pos)
        if word and word not in stopWords
    ]

In [11]:
# Load the negative-word lexicon (one word per line).
# NOTE(review): the file is opened with the platform default encoding, as
# before; confirm the lexicon's actual encoding and pass it explicitly.
with open("MasterDictionary//negative-words.txt") as neg:
    # Strip the line ending *before* the stop-word check: the raw line still
    # carries '\n', so comparing it directly can never match a stop word.
    # Entries are lower-cased because they are matched against lower-cased
    # tokens; blank lines are dropped.
    negative_words = [
        word
        for word in (line.strip().lower() for line in neg)
        if word and word not in stopWords
    ]

In [13]:
def analyze_sentiment(tokens, positive_words, negative_words):
    """Count how many tokens appear in the positive and negative lexicons.

    Args:
        tokens: Iterable of word strings to score.
        positive_words: Collection of positive words.
        negative_words: Collection of negative words.

    Returns:
        A ``(positive_score, negative_score)`` tuple of integer counts. A
        token found in both lexicons is counted as positive only.
    """
    # Sets give constant-time membership tests; scanning the lexicon lists
    # for every token is needlessly slow on long articles.
    positive_lookup = set(positive_words)
    negative_lookup = set(negative_words)

    positive_score = 0
    negative_score = 0

    for token in tokens:
        if token in positive_lookup:
            positive_score += 1
        elif token in negative_lookup:
            negative_score += 1

    return positive_score, negative_score

    

In [21]:
def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when ``denominator`` is 0."""
    return numerator / denominator if denominator else 0


def _text_metrics(text):
    """Compute the sentiment and readability figures for one article text.

    Returns:
        A dict keyed by the output column names (everything except the
        ``URL_ID`` and ``URL`` columns).
    """
    tokens = word_tokenize(text)

    # Sentiment and word count use lower-cased alphabetic tokens with the
    # stop words removed.
    filter_data = [token.lower() for token in tokens
                   if token.isalpha() and token.lower() not in stopWords]
    positive_score, negative_score = analyze_sentiment(filter_data, positive_words, negative_words)

    # Readability figures use every alphanumeric token, stop words included.
    words = [token.lower() for token in tokens if token.isalnum()]
    num_sentences = len(sent_tokenize(text))

    # Syllables are approximated as the number of hyphenation points + 1;
    # a word with more than two syllables counts as complex.
    complex_words = [word for word in words if (len(p.positions(word)) + 1) > 2]

    # Every division goes through _ratio so a text without words or sentences
    # yields zeros instead of raising ZeroDivisionError.
    avg_sentence_length = _ratio(len(words), num_sentences)
    percentage_complex_words = _ratio(len(complex_words), len(words)) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    word_count = len(filter_data)
    syllable_count_per_word = _ratio(
        sum((len(p.positions(word)) + 1) for word in filter_data), word_count)

    # Characters and word count must come from the same word list; dividing
    # the characters of all words by the filtered count inflates the average.
    avg_word_length = _ratio(sum(len(word) for word in words), len(words))

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        # Average number of words per sentence (same figure as the average
        # sentence length).
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_length,
        'COMPLEX WORD COUNT': len(complex_words),
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_count_per_word,
        'AVG WORD LENGTH': avg_word_length,
    }


output_data = []

# Articles are stored as "<URL_ID>.txt" in output_dir. Each text is read back
# by its URL_ID rather than by list position, so every output row is labelled
# with the article it was computed from even when some downloads were skipped.
for url_id, url in zip(data['URL_ID'], data['URL']):
    article_path = os.path.join(output_dir, f"{url_id}.txt")
    if not os.path.isfile(article_path):
        # Nothing was saved for this URL, so there is nothing to score.
        continue

    with open(article_path, 'r', encoding='utf-8') as file:
        article_text = file.read()

    output_data.append({'URL_ID': url_id, 'URL': url, **_text_metrics(article_text)})


In [22]:
# Turn the list of per-article result dicts into a table and write it to
# 'output_data.csv' without the DataFrame index column.
df = pd.DataFrame(output_data)
df.to_csv('output_data.csv', index=False)