<a href="https://colab.research.google.com/github/KhushiiChoudhary/BlackCofferInternshipTask/blob/main/BlackCoffer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# Function to extract article text from a URL
def extract_article_text(url):
    # Make a GET request to the URL
    response = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the main article content
    article_content = soup.find('div', class_='td-post-content')
    if article_content:
        # Extract text from paragraphs within the main content
        paragraphs = article_content.find_all('p')
        article_text = '\n'.join([p.get_text(strip=True) for p in paragraphs])
        return article_text.strip()
    else:
        print(f"Error: Couldn't find article content on the page: {url}")
        return None

# Read input Excel file containing URLs
input_df = pd.read_excel('Input.xlsx')

# Create a directory to save extracted text files
if not os.path.exists('Extracted_Text'):
    os.makedirs('Extracted_Text')

# Iterate through each URL
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    # Extract article text
    article_text = extract_article_text(url)
    if article_text:
        # Save extracted text into a text file
        with open(f'Extracted_Text/{url_id}.txt', 'w', encoding='utf-8') as file:
            file.write(article_text)
        print(f"Article text extracted and saved into 'Extracted_Text/{url_id}.txt' file.")


Article text extracted and saved into 'Extracted_Text/blackassign0001.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0002.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0003.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0004.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0005.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0006.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0007.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0008.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0009.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0010.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0011.txt' file.
Article text extracted and saved into 'Extracted_Text/blackassign0012.txt' file.
Article text extracted and s

In [3]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob

# Download NLTK resources
nltk.download('punkt')

# Function to compute variables from article text
def compute_variables(article_text):
    # Tokenize the text into words and sentences
    words = word_tokenize(article_text)
    sentences = sent_tokenize(article_text)

    # Compute variables
    word_count = len(words)
    sentence_count = len(sentences)
    avg_sentence_length = word_count / sentence_count
    # You can compute other variables here

    return {
        'WORD COUNT': word_count,
        'SENTENCE COUNT': sentence_count,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        # Add other variables here
    }

# Read the list of extracted text files
extracted_files = os.listdir('Extracted_Text')

# Create an empty list to store results
results = []

# Iterate through each extracted text file
for file_name in extracted_files:
    url_id = file_name.split('.')[0]  # Extract URL_ID from the file name
    # Read the content of the text file
    with open(f'Extracted_Text/{file_name}', 'r', encoding='utf-8') as file:
        article_text = file.read()
    # Compute variables
    variables = compute_variables(article_text)
    # Store results
    result = {
        'URL_ID': url_id,
        **variables
    }
    results.append(result)

# Create DataFrame from results
output_df = pd.DataFrame(results)

# Save output DataFrame to Excel file
output_df.to_excel('Output.xlsx', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
!pip install syllapy




In [4]:
import nltk
nltk.download('cmudict')

def syllable_count(word):
    try:
        pronunciation = nltk.corpus.cmudict.dict()[word.lower()][0]
        return sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
    except KeyError:
        return 0

# Test the syllable_count function
words = ['hello', 'world', 'programming', 'artificial', 'intelligence']
for word in words:
    print(f'{word}: {syllable_count(word)}')


[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


hello: 2
world: 1
programming: 3
artificial: 4
intelligence: 4


In [5]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cmudict')

def compute_additional_variables(article_text):
    words = word_tokenize(article_text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]

    word_count = len(filtered_words)
    sentence_count = len(sent_tokenize(article_text))
    avg_sentence_length = word_count / sentence_count

    syllables_per_word = sum([syllable_count(word) for word in filtered_words]) / word_count

    percentage_complex_words = len([word for word in filtered_words if syllable_count(word) > 2]) / word_count * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    return {
        'WORD COUNT': word_count,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'SYLLABLE PER WORD': syllables_per_word,
        # Add other variables here
    }

# Test the compute_additional_variables function with a sample article text
article_text = """
    Natural language processing (NLP) is a subfield of linguistics, computer science, information engineering, and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data. Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation.
"""
additional_variables = compute_additional_variables(article_text)
print(additional_variables)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


{'WORD COUNT': 54, 'AVG SENTENCE LENGTH': 27.0, 'PERCENTAGE OF COMPLEX WORDS': 46.2962962962963, 'FOG INDEX': 29.318518518518523, 'SYLLABLE PER WORD': 2.074074074074074}


In [6]:
import os
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Function to compute variables from article text
def compute_variables(article_text):
    # Tokenize the text into words and sentences
    words = word_tokenize(article_text)
    sentences = sent_tokenize(article_text)

    # Compute variables
    word_count = len(words)
    sentence_count = len(sentences)
    avg_sentence_length = word_count / sentence_count

    return {
        'WORD COUNT': word_count,
        'SENTENCE COUNT': sentence_count,
        'AVG SENTENCE LENGTH': avg_sentence_length,
    }

# Function to compute additional variables from article text
def compute_additional_variables(article_text):
    # Tokenize the text into words
    words = word_tokenize(article_text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # Compute additional variables
    word_count = len(filtered_words)
    sentence_count = len(sent_tokenize(article_text))
    avg_sentence_length = word_count / sentence_count

    # Compute POSITIVE SCORE, NEGATIVE SCORE, POLARITY SCORE, and SUBJECTIVITY SCORE using TextBlob
    blob = TextBlob(article_text)
    positive_score = sum(1 for sentence in blob.sentences if sentence.sentiment.polarity > 0)
    negative_score = sum(1 for sentence in blob.sentences if sentence.sentiment.polarity < 0)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    # Compute AVG NUMBER OF WORDS PER SENTENCE
    avg_words_per_sentence = word_count / sentence_count

    # Count personal pronouns
    personal_pronouns = sum(1 for word in filtered_words if word.lower() in ['i', 'me', 'my', 'mine', 'myself', 'you', 'your', 'yours', 'yourself', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'we', 'us', 'our', 'ours', 'ourselves', 'they', 'them', 'their', 'theirs', 'themselves'])

    # Compute AVG WORD LENGTH
    avg_word_length = sum(len(word) for word in filtered_words) / word_count

    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }

# Read the list of extracted text files
extracted_files = os.listdir('Extracted_Text')

# Create an empty list to store results
results = []

# Iterate through each extracted text file
for file_name in extracted_files:
    url_id = file_name.split('.')[0]  # Extract URL_ID from the file name
    # Read the content of the text file
    with open(f'Extracted_Text/{file_name}', 'r', encoding='utf-8') as file:
        article_text = file.read()
    # Compute variables
    variables = compute_variables(article_text)
    # Compute additional variables
    additional_variables = compute_additional_variables(article_text)
    # Merge variables
    result = {
        'URL_ID': url_id,
        **variables,
        **additional_variables
    }
    results.append(result)

# Create DataFrame from results
output_df = pd.DataFrame(results)

# Save output DataFrame to Excel file
output_df.to_excel('Output.xlsx', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
