In [1]:
#importing library
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

import re
import os

In [2]:
# Filter out all warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#importing input file
df=pd.read_excel('/content/drive/MyDrive/Blackcoffer NLP Assignment/Input.xlsx')

In [5]:
df.head()

Unnamed: 0,URL_ID,URL
0,37,https://insights.blackcoffer.com/ai-in-healthc...
1,38,https://insights.blackcoffer.com/what-if-the-c...
2,39,https://insights.blackcoffer.com/what-jobs-wil...
3,40,https://insights.blackcoffer.com/will-machine-...
4,41,https://insights.blackcoffer.com/will-ai-repla...


In [6]:
df.shape

(114, 2)

In [7]:
# Function to extract article text from HTML content
def extract_article_text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    article_title = soup.find(class_='entry-title').text.strip()

    # Check if the element with class 'td-post-content' exists
    article_text_element = soup.find(class_='td-post-content')
    if article_text_element is not None:
        article_text = article_text_element.text.strip()
    else:
        article_text = ''

    return article_title, article_text

folder_path = "/content/drive/MyDrive/Blackcoffer NLP Assignment/Text Files Extracted/"

# Loop through the URLs and extract the article text
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    #print(f"Processing URL_ID {url_id}...")

    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception if an HTTP error occurred

        # Extract the article title and text from the response
        article_title, article_text = extract_article_text(response.content)

        # Save the extracted article in a text file with URL_ID as the file name
        file_name = f"{url_id}.txt"
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'w') as file:
            file.write(f"Article Title: {article_title}\n")
            file.write(f"Article Text: {article_text}\n")

        #print(f"Saved article with URL_ID {url_id} to {file_name}")

    except requests.exceptions.HTTPError as err:
        if response.status_code == 404:
            print(f"{url_id} Ooops... Error 404 was found for that page")
        else:
            print(f"An HTTP error occurred: {err}")

    except Exception as e:
        print(f"An error occurred: {e}")

print("Data extraction completed.")

44 Ooops... Error 404 was found for that page
57 Ooops... Error 404 was found for that page
144 Ooops... Error 404 was found for that page
Data extraction completed.


In [8]:
directory = "/content/drive/MyDrive/Blackcoffer NLP Assignment/Text Files Extracted/"

for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath):
        # Read the contents of the file
        with open(filepath, "r") as file:
            text = file.read()

In [9]:
directory = "/content/drive/MyDrive/Blackcoffer NLP Assignment/Text Files Extracted/"

data = []  # List to store the data

# Iterate over each file in the directory
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath):
        # Read the contents of the file
        with open(filepath, "r") as file:
            text = file.read()
            data.append({'Filename': filename, 'Text': text})

# Create a DataFrame from the data list
df_extrated_data = pd.DataFrame(data)

# Print the DataFrame
print(df_extrated_data.head())

  Filename                                               Text
0   37.txt  Article Title: Ranking customer behaviours for...
1   38.txt  Article Title: Ranking customer behaviours for...
2   39.txt  Article Title: Ranking customer behaviours for...
3   40.txt  Article Title: Ranking customer behaviours for...
4   41.txt  Article Title: Ranking customer behaviours for...


In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Load stop words
stop_words_files = [
    "/content/drive/MyDrive/Blackcoffer NLP Assignment/StopWords/StopWords_Auditor.txt",
    "/content/drive/MyDrive/Blackcoffer NLP Assignment/StopWords/StopWords_Currencies.txt",
    "/content/drive/MyDrive/Blackcoffer NLP Assignment/StopWords/StopWords_DatesandNumbers.txt",
    "/content/drive/MyDrive/Blackcoffer NLP Assignment/StopWords/StopWords_Generic.txt",
    "/content/drive/MyDrive/Blackcoffer NLP Assignment/StopWords/StopWords_GenericLong.txt",
    "/content/drive/MyDrive/Blackcoffer NLP Assignment/StopWords/StopWords_Geographic.txt",
    "/content/drive/MyDrive/Blackcoffer NLP Assignment/StopWords/StopWords_Names.txt"
]

stop_words = set()
for file in stop_words_files:
    with open(file, "r", encoding='latin-1') as f:
        words = f.read().split()
        stop_words.update(words)

# Load positive and negative word lists
positive_words_file = "/content/drive/MyDrive/Blackcoffer NLP Assignment/MasterDictionary/positive-words.txt"
negative_words_file = "/content/drive/MyDrive/Blackcoffer NLP Assignment/MasterDictionary/negative-words.txt"

positive_words = set()
negative_words = set()

with open(positive_words_file, "r",encoding='latin-1') as f:
    positive_words = set(word for word in f.read().split() if word not in stop_words)

with open(negative_words_file, "r", encoding='latin-1') as f:
    negative_words = set(word for word in f.read().split() if word not in stop_words)

In [12]:
def clean_text(text):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    cleaned_words = []
    total_words = 0

    for sentence in sentences:
        # Tokenize the sentence into words
        words = word_tokenize(sentence)

        for word in words:
            # Remove punctuation and convert to lowercase
            cleaned_word = re.sub(r"[^\w\s]", "", word.lower())

            # Remove stop words
            if cleaned_word not in stop_words and cleaned_word != "":
                cleaned_words.append(cleaned_word)
                total_words += 1

    return cleaned_words, total_words

In [13]:
def syllable_count(word):
    vowels = 'aeiouy'
    count = 0
    if len(word) == 0:
        return count
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if count == 0:
        count += 1
    return count

In [14]:
def sentimental_analysis(text, cleaned_words, positive_words, negative_words):
    positive_score = int(sum(1 for word in cleaned_words if word in positive_words))
    negative_score = int(sum(1 for word in cleaned_words if word in negative_words))

    polarity_score = (positive_score - negative_score) / (positive_score + negative_score + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    return positive_score, negative_score, polarity_score, subjectivity_score

In [15]:
def analyze_readability(text):
    cleaned_words, total_words = clean_text(text)
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)
    complex_word_count = sum(1 for word in cleaned_words if syllable_count(word) > 2)

    average_sentence_length = total_words / total_sentences
    percentage_complex_words = complex_word_count / total_words
    fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

    return average_sentence_length, fog_index

In [16]:
def calculate_average_words_per_sentence(text):
    sentences = sent_tokenize(text)
    total_sentences = len(sentences)
    total_words = len(word_tokenize(text))
    average_words_per_sentence = total_words / total_sentences
    return average_words_per_sentence

In [17]:
def calculate_syllables_per_word(text):
    words = word_tokenize(text)
    total_syllables = sum(syllable_count(word) for word in words)
    total_words = len(words)
    syllables_per_word = total_syllables / total_words
    return syllables_per_word

In [18]:
def calculate_average_word_length(text):
    words = word_tokenize(text)
    total_words = len(words)
    total_characters = sum(len(word) for word in words)
    average_word_length = total_characters / total_words
    return average_word_length

In [19]:
def count_personal_pronouns(text):
    pronouns = ["I", "me", "my", "mine", "we", "us", "our", "ours", "you", "your", "yours", "he", "him", "his",
                "she", "her", "hers", "it", "its", "they", "them", "their", "theirs"]
    pronoun_count = 0
    words = word_tokenize(text)
    for word in words:
        if word.lower() in pronouns:
            pronoun_count += 1
    return pronoun_count

In [20]:
# Example Usage 1
text1 = "This is a positive sentence."
cleaned_words, total_words = clean_text(text1)
positive_score, negative_score, polarity_score, subjectivity_score = sentimental_analysis(text1, cleaned_words, positive_words, negative_words)
print("Positive Score:", positive_score)
print("Negative Score:", negative_score)
print("Polarity Score:", polarity_score)
print("Subjectivity Score:", subjectivity_score)


Positive Score: 1
Negative Score: 0
Polarity Score: 0.9999990000010001
Subjectivity Score: 0.499999750000125


In [21]:
# Example Usage 2
text2 = "This is a sample text for readability analysis."
average_sentence_length, fog_index = analyze_readability(text2)
print("Average Sentence Length:", average_sentence_length)
print("FOG Index:", fog_index)


Average Sentence Length: 4.0
FOG Index: 1.8


In [22]:
# Example Usage 3
text3 = "I love to travel and explore new places. You should join us on our next adventure!"
avg_word_length = calculate_average_word_length(text3)
pronoun_count = count_personal_pronouns(text3)

print("Average word length:", avg_word_length)
print("Personal pronoun count:", pronoun_count)

Average word length: 3.7222222222222223
Personal pronoun count: 3


In [23]:
# Example Usage 4
sample_text = "This is a sample text. It contains both positive and negative words. This text will be used to demonstrate the calculation of variables."

# Perform calculations for the sample text
cleaned_words, total_words = clean_text(sample_text)
positive_score, negative_score, polarity_score, subjectivity_score = sentimental_analysis(sample_text, cleaned_words, positive_words, negative_words)
average_sentence_length, fog_index = analyze_readability(sample_text)
average_words_per_sentence = calculate_average_words_per_sentence(sample_text)
complex_word_count = sum(1 for word in cleaned_words if syllable_count(word) > 2)
syllables_per_word = calculate_syllables_per_word(sample_text)
personal_pronouns = count_personal_pronouns(sample_text)
average_word_length = calculate_average_word_length(sample_text)

# Print the calculated values
print("Cleaned Text:", cleaned_words)
print("Total Words:", total_words)
print("Positive Score:", positive_score)
print("Negative Score:", negative_score)
print("Polarity Score:", polarity_score)
print("Subjectivity Score:", subjectivity_score)
print("Average Sentence Length:", average_sentence_length)
print("Fog Index:", fog_index)
print("Average Words per Sentence:", average_words_per_sentence)
print("Complex Word Count:", complex_word_count)
print("Syllables per Word:", syllables_per_word)
print("Personal Pronouns:", personal_pronouns)
print("Average Word Length:", average_word_length)


Cleaned Text: ['sample', 'text', 'positive', 'negative', 'words', 'text', 'demonstrate', 'calculation', 'variables']
Total Words: 9
Positive Score: 1
Negative Score: 1
Polarity Score: 0.0
Subjectivity Score: 0.22222219753086697
Average Sentence Length: 3.0
Fog Index: 1.4222222222222223
Average Words per Sentence: 8.666666666666666
Complex Word Count: 5
Syllables per Word: 1.5
Personal Pronouns: 1
Average Word Length: 4.384615384615385


In [24]:
# Convert positive_words set to a DataFrame
positive_words_df = pd.DataFrame(positive_words, columns=['Positive Words'])

# Save the DataFrame to a CSV file
positive_words_df.to_csv('/content/drive/MyDrive/Blackcoffer NLP Assignment/positive_words.csv', index=False)

In [25]:
# Convert positive_words set to a DataFrame
negative_words_df = pd.DataFrame(negative_words, columns=['Negative Words'])

# Save the DataFrame to a CSV file
negative_words_df.to_csv('/content/drive/MyDrive/Blackcoffer NLP Assignment/negative_words.csv', index=False)

In [26]:
# Read the output file
output_file = '/content/drive/MyDrive/Blackcoffer NLP Assignment/Output Data Structure.xlsx'
df_output = pd.read_excel(output_file)

# Iterate over the URLs and calculate the variables for each URL
for index, row in df_output.iterrows():
    url_id = row['URL_ID']
    file_name = f"{url_id}.txt"
    folder_path="/content/drive/MyDrive/Blackcoffer NLP Assignment/Text Files Extracted/"
    file_path = os.path.join(folder_path, file_name)  # Include the folder path

    # Check if the file exists
    if os.path.isfile(file_path):  # Use the file_path variable
        # Read the contents of the file
        with open(file_path, "r") as file:  # Use the file_path variable
            text = file.read()

        # Perform calculations for each URL
        cleaned_words, total_words = clean_text(text)
        positive_score, negative_score, polarity_score, subjectivity_score = sentimental_analysis(text,cleaned_words, positive_words, negative_words)
        average_sentence_length, fog_index = analyze_readability(text)
        average_words_per_sentence = calculate_average_words_per_sentence(text)
        complex_word_count = sum(1 for word in cleaned_words if syllable_count(word) > 2)
        syllables_per_word = calculate_syllables_per_word(text)
        personal_pronouns = count_personal_pronouns(text)
        average_word_length = calculate_average_word_length(text)

        # Append the calculated values to the output dataframe
        df_output.at[index, 'POSITIVE SCORE'] = positive_score
        df_output.at[index, 'NEGATIVE SCORE'] = negative_score
        df_output.at[index, 'POLARITY SCORE'] = polarity_score
        df_output.at[index, 'SUBJECTIVITY SCORE'] = subjectivity_score
        df_output.at[index, 'AVG SENTENCE LENGTH'] = average_sentence_length
        df_output.at[index, 'PERCENTAGE OF COMPLEX WORDS'] = complex_word_count / total_words
        df_output.at[index, 'FOG INDEX'] = fog_index
        df_output.at[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = average_words_per_sentence
        df_output.at[index, 'COMPLEX WORD COUNT'] = complex_word_count
        df_output.at[index, 'WORD COUNT'] = total_words
        df_output.at[index, 'SYLLABLE PER WORD'] = syllables_per_word
        df_output.at[index, 'PERSONAL PRONOUNS'] = personal_pronouns
        df_output.at[index, 'AVG WORD LENGTH'] = average_word_length

    else:
        # Fill empty variables with "NA"
        df_output.at[index, 'POSITIVE SCORE'] = "NA"
        df_output.at[index, 'NEGATIVE SCORE'] = "NA"
        df_output.at[index, 'POLARITY SCORE'] = "NA"
        df_output.at[index, 'SUBJECTIVITY SCORE'] = "NA"
        df_output.at[index, 'AVG SENTENCE LENGTH'] = "NA"
        df_output.at[index, 'PERCENTAGE OF COMPLEX WORDS'] = "NA"
        df_output.at[index, 'FOG INDEX'] = "NA"
        df_output.at[index, 'AVG NUMBER OF WORDS PER SENTENCE'] = "NA"
        df_output.at[index, 'COMPLEX WORD COUNT'] = "NA"
        df_output.at[index, 'WORD COUNT'] = "NA"
        df_output.at[index, 'SYLLABLE PER WORD'] = "NA"
        df_output.at[index, 'PERSONAL PRONOUNS'] = "NA"
        df_output.at[index, 'AVG WORD LENGTH'] = "NA"

# Save the updated output dataframe to the file
df_output.to_excel(output_file, index=False)