<a href="https://colab.research.google.com/github/HusnaRiyaz/Text_Analysis_using_NLP/blob/main/Black_coffer_NLP_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from google.colab import drive
from nltk.corpus import stopwords

# Download the NLTK resources this notebook relies on.
# `punkt_tab` is fetched in addition to `punkt`: NLTK >= 3.9 loads the Punkt
# sentence tokenizer from `punkt_tab`, so on a fresh runtime `word_tokenize`
# and `sent_tokenize` raise LookupError when only `punkt` is present. Older
# NLTK releases keep using `punkt`, so both are requested.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Mount Google Drive so the input workbook, word lists and outputs under
# /content/drive are reachable from this runtime.
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Function to extract title and text from a URL
def extract_title_and_text(url, timeout=30):
    """Download an article page and return its title and body text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the stripped text of the
        page's ``<title>`` element; ``text`` is the stripped text of every
        ``<p>`` and ``<li>`` found inside the ``div`` with class
        ``td-post-content``, joined with newlines.
        ``(None, None)`` is returned when anything goes wrong; the reason is
        printed so the caller can skip the URL and carry on.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an explicit HTTP error instead of parsing
        # the error page and failing later with an unrelated AttributeError.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'lxml')

        title_tag = soup.find('title')
        if title_tag is None:
            raise ValueError("page has no <title> element")
        title = title_tag.get_text(strip=True)

        main_content = soup.find('div', class_='td-post-content')
        if main_content is None:
            raise ValueError("page has no 'td-post-content' container")

        article_content = main_content.find_all(['p', 'li'])
        content = [element.get_text(separator=' ', strip=True) for element in article_content]
        text = '\n'.join(content)
        return title, text
    except Exception as e:
        # Deliberately broad: extraction is best-effort per URL, and one bad
        # page must not abort processing of the remaining ones.
        print(f"Failed to extract data from {url}: {e}")
        return None, None

# Function to read URLs from Excel file
def read_urls_from_excel(file_path):
    """Load the workbook at ``file_path`` with ``pd.read_excel`` and return the DataFrame."""
    return pd.read_excel(file_path)

# Function to save text to a file
def save_text_to_file(file_name, title, text):
    """Write ``title``, a blank line and then ``text`` to ``file_name``.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced.
    """
    with open(file_name, mode='w', encoding='utf-8') as out_handle:
        document = "\n\n".join((title, text))
        out_handle.write(document)

# Function to read stopwords from files
def read_stopwords(folder):
    """Collect the stopwords listed in every ``.txt`` file of ``folder``.

    Each line of each file is stripped and lower-cased and contributes one
    entry; blank lines are ignored.

    Args:
        folder: Directory containing the stopword list files.

    Returns:
        A set of lower-cased stopword strings.
    """
    stopwords_set = set()
    for filename in os.listdir(folder):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(folder, filename)
        # Decode strictly as UTF-8 first ('utf-8-sig' also drops a leading
        # BOM so it does not stick to the first word). The whole file is read
        # before anything is added, so a decode failure cannot leave a
        # half-processed file behind, and the Latin-1 fallback is actually
        # reachable (with errors='ignore' it never was, and undecodable bytes
        # were silently dropped from the words).
        try:
            with open(filepath, 'r', encoding='utf-8-sig') as file:
                lines = file.readlines()
        except UnicodeDecodeError:
            with open(filepath, 'r', encoding='latin1') as file:
                lines = file.readlines()
        for line in lines:
            word = line.strip().lower()
            if word:  # a blank line is not a stopword
                stopwords_set.add(word)
    return stopwords_set

# Function to create a dictionary from the masterdict folder
def create_word_dict(masterdict_folder, stopwords):
    """Build the positive/negative word sets from the master dictionary files.

    Every ``.txt`` file in ``masterdict_folder`` is read line by line. A file
    whose name contains ``positive`` (case-insensitive) feeds the positive
    set; every other ``.txt`` file feeds the negative set. Words are stripped
    and lower-cased; blank lines and words present in ``stopwords`` are
    left out.

    Args:
        masterdict_folder: Directory containing the dictionary files.
        stopwords: Collection of lower-cased words to exclude.

    Returns:
        ``{'positive': set_of_words, 'negative': set_of_words}``.
    """
    word_dict = {'positive': set(), 'negative': set()}
    for filename in os.listdir(masterdict_folder):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(masterdict_folder, filename)
        category = 'positive' if 'positive' in filename.lower() else 'negative'
        # Strict UTF-8 first ('utf-8-sig' also removes a leading BOM), read in
        # full before any word is added, then fall back to Latin-1. With
        # errors='ignore' the fallback could never run and undecodable bytes
        # were silently dropped from the words.
        try:
            with open(filepath, 'r', encoding='utf-8-sig') as file:
                lines = file.readlines()
        except UnicodeDecodeError:
            with open(filepath, 'r', encoding='latin1') as file:
                lines = file.readlines()
        for line in lines:
            word = line.strip().lower()
            if word and word not in stopwords:
                word_dict[category].add(word)
    return word_dict

# Function to calculate text analysis scores
def calculate_scores(text, positive_words, negative_words):
    """Compute sentiment scores for ``text``.

    The text is lower-cased and split with ``word_tokenize``; each token found
    in ``positive_words`` adds one to the positive score and each token found
    in ``negative_words`` adds one to the negative score.

    Returns:
        ``(positive_score, negative_score, polarity_score, subjectivity_score)``
        where polarity is ``(pos - neg) / (pos + neg + 0.000001)`` and
        subjectivity is ``(pos + neg) / (token_count + 0.000001)``.
    """
    lowered_tokens = word_tokenize(text.lower())

    positive_score = 0
    negative_score = 0
    for token in lowered_tokens:
        if token in positive_words:
            positive_score += 1
        if token in negative_words:
            negative_score += 1

    # NOTE(review): the denominator below is the raw token count returned by
    # word_tokenize, not a cleaned word count - confirm that is intended.
    token_count = len(lowered_tokens)
    sentiment_hits = positive_score + negative_score

    # The small constant keeps both divisions defined when a count is zero.
    polarity_score = (positive_score - negative_score) / (sentiment_hits + 0.000001)
    subjectivity_score = sentiment_hits / (token_count + 0.000001)
    return positive_score, negative_score, polarity_score, subjectivity_score

# Function to calculate readability metrics
def calculate_readability_metrics(text, stopwords_set):
    """Compute readability statistics for ``text``.

    Sentences come from ``sent_tokenize`` and words from ``word_tokenize``.
    Word-based figures use only "clean" words: alphanumeric tokens whose
    lower-cased form is not in ``stopwords_set``.

    Args:
        text: The article text to analyse.
        stopwords_set: Lower-cased words to exclude from the word counts.

    Returns:
        A 9-tuple, in this order:
        ``avg_sentence_length`` (clean words per sentence),
        ``percentage_complex_words`` (complex / clean words, as a fraction in
        [0, 1], not multiplied by 100),
        ``fog_index`` (0.4 * (avg_sentence_length + percentage_complex_words)),
        ``avg_words_per_sentence`` (same value as ``avg_sentence_length``),
        ``num_complex_words`` (clean words with more than two syllables),
        ``num_words`` (clean word count),
        ``syllable_count_per_word``,
        ``personal_pronouns``,
        ``avg_word_length`` (characters per clean word).
        Every ratio is 0 when its denominator is 0.
    """
    sentences = sent_tokenize(text)
    words = word_tokenize(text)

    # Remove stopwords and punctuation for word count
    clean_words = [word for word in words if word.isalnum() and word.lower() not in stopwords_set]

    num_sentences = len(sentences)
    num_words = len(clean_words)

    def count_syllables(word):
        """Estimate syllables as the number of vowel groups in ``word``.

        One is subtracted for a trailing "es"/"ed", and the result is never
        less than one. ``word`` must be non-empty (clean words always are,
        since ``str.isalnum`` is False for the empty string).
        """
        word = word.lower()
        vowels = "aeiouy"
        count = 0
        if word[0] in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith("es") or word.endswith("ed"):
            count -= 1
        if count == 0:
            count += 1
        return count

    # Count syllables once per word and reuse the result for both the
    # complex-word count and the syllable total.
    syllable_counts = [count_syllables(word) for word in clean_words]
    num_complex_words = sum(1 for syllables in syllable_counts if syllables > 2)
    total_syllables = sum(syllable_counts)

    # Average sentence length and words per sentence are the same quantity.
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    avg_words_per_sentence = avg_sentence_length

    # Share of complex words (a fraction, despite the name)
    percentage_complex_words = num_complex_words / num_words if num_words > 0 else 0

    # Fog Index
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)

    # Average word length in characters
    avg_word_length = sum(len(word) for word in clean_words) / num_words if num_words > 0 else 0

    # Syllables per word
    syllable_count_per_word = total_syllables / num_words if num_words > 0 else 0

    # Count personal pronouns. The search is case-insensitive so "We"/"My" are
    # found, but the all-caps token "US" is skipped: it is the country
    # abbreviation far more often than the pronoun "us".
    pronoun_matches = re.findall(r'\b(?:I|we|my|ours|us)\b', text, re.I)
    personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')

    return avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, num_complex_words, num_words, syllable_count_per_word, personal_pronouns, avg_word_length

# ---------------------------------------------------------------------------
# Driver: read the input workbook, scrape each URL, score the text and write
# the results back to Google Drive. Requires Drive to be mounted at
# /content/drive.
# ---------------------------------------------------------------------------

# Define the path to the URLs Excel file in Google Drive
excel_file_path = '/content/drive/My Drive/Black_coffer/Input.xlsx'

# Define the path to the stopwords folder in Google Drive
stopwords_folder = '/content/drive/My Drive/Black_coffer/Stop_words'

# Define the path to the masterdict folder in Google Drive
masterdict_folder = '/content/drive/My Drive/Black_coffer/MasterDictionary'

# Define the output folder in Google Drive (created if it does not exist yet)
output_folder = '/content/drive/My Drive/Black_coffer/extracted_texts'
os.makedirs(output_folder, exist_ok=True)

# Read stopwords
stopwords_set = read_stopwords(stopwords_folder)

# Create the word dictionary (positive/negative sets with stopwords removed)
word_dict = create_word_dict(masterdict_folder, stopwords_set)

# Read URLs from Excel file; the loop below reads the 'URL_ID' and 'URL' columns
df = read_urls_from_excel(excel_file_path)

# Accumulates one dict per successfully processed URL; turned into a
# DataFrame in one go after the loop.
output_data = []

# Process each URL
for index, row in df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    title, text = extract_title_and_text(url)
    # Skip the URL when extraction failed (None, None) or produced an empty
    # title/text: such rows get no entry in the output and no text file.
    if title and text:
        positive_score, negative_score, polarity_score, subjectivity_score = calculate_scores(
            text, word_dict['positive'], word_dict['negative'])
        avg_sentence_length, percentage_complex_words, fog_index, avg_words_per_sentence, num_complex_words, num_words, syllable_count_per_word, personal_pronouns, avg_word_length = calculate_readability_metrics(text, stopwords_set)

        # Keys are the column headers of the output workbook.
        output_data.append({
            'URL_ID': url_id,
            'URL': url,
            'POSITIVE SCORE': positive_score,
            'NEGATIVE SCORE': negative_score,
            'POLARITY SCORE': polarity_score,
            'SUBJECTIVITY SCORE': subjectivity_score,
            'AVG SENTENCE LENGTH': avg_sentence_length,
            'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
            'FOG INDEX': fog_index,
            'AVG WORDS PER SENTENCE': avg_words_per_sentence,
            'COMPLEX WORD COUNT': num_complex_words,
            'WORD COUNT': num_words,
            'SYLLABLE PER WORD': syllable_count_per_word,
            'PERSONAL PRONOUNS': personal_pronouns,
            'AVG WORD LENGTH': avg_word_length
        })

        # Keep the scraped article next to the results as <URL_ID>.txt
        file_name = os.path.join(output_folder, f"{url_id}.txt")
        save_text_to_file(file_name, title, text)

# Create DataFrame and save to Excel (index column omitted)
output_df = pd.DataFrame(output_data)
output_df.to_excel('/content/drive/My Drive/Black_coffer/Output_text_analysis.xlsx', index=False)


Failed to extract data from https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/: 'NoneType' object has no attribute 'find_all'
Failed to extract data from https://insights.blackcoffer.com/covid-19-environmental-impact-for-the-future/: 'NoneType' object has no attribute 'find_all'


# **URLs 36 and 49 return a 404 error, so their data could not be extracted**