<a href="https://colab.research.google.com/github/ManasaGit99/Assignment/blob/main/TextAnalysisB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import re
from textblob import TextBlob
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import cmudict
import nltk
import requests
from bs4 import BeautifulSoup

In [2]:
# Download nltk data
nltk.download('punkt')
nltk.download('cmudict')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [9]:
# Read the input file
input_file = 'Input.xlsx'
df_input = pd.read_excel(input_file)

# Directory to save the articles
articles_dir = 'articles'
os.makedirs(articles_dir, exist_ok=True)

for index, row in df_input.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title (example: assuming <h1> contains the title)
    title = soup.find('h1').get_text()

    # Extract the article text (example: assuming <p> tags contain the paragraphs)
    paragraphs = soup.find_all('p')
    article_text = '\n'.join([p.get_text() for p in paragraphs])

    # Save the article to a text file
    file_path = os.path.join(articles_dir, f'{url_id}.txt')
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(title + '\n')
        file.write(article_text)

    print(f"Saved article {url_id} from {url}")

print("All articles have been saved.")

Saved article bctech2011 from https://insights.blackcoffer.com/ml-and-ai-based-insurance-premium-model-to-predict-premium-to-be-charged-by-the-insurance-company/
Saved article bctech2012 from https://insights.blackcoffer.com/streamlined-integration-interactive-brokers-api-with-python-for-desktop-trading-application/
Saved article bctech2013 from https://insights.blackcoffer.com/efficient-data-integration-and-user-friendly-interface-development-navigating-challenges-in-web-application-deployment/
Saved article bctech2014 from https://insights.blackcoffer.com/effective-management-of-social-media-data-extraction-strategies-for-authentication-security-and-reliability/
Saved article bctech2015 from https://insights.blackcoffer.com/streamlined-trading-operations-interface-for-metatrader-4-empowering-efficient-management-and-monitoring/
Saved article bctech2016 from https://insights.blackcoffer.com/efficient-aws-infrastructure-setup-and-management-addressing-security-scalability-and-complianc

In [10]:
# Function to count syllables in a word
d = cmudict.dict()
def syllable_count(word):
    return [len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]][0] if word.lower() in d else 0

# Function to count personal pronouns
def count_personal_pronouns(text):
    pronouns = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    return len(pronouns)
# Load positive and negative words
def load_word_list(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        words = file.read().splitlines()
    return set(words)

In [11]:
positive_words = load_word_list('MasterDictionary/positive-words.txt')
negative_words = load_word_list('MasterDictionary/negative-words.txt')

In [12]:
# Load stop words from multiple files
def load_stop_words(directory):
    stop_words = set()
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            stop_words.update(load_word_list(os.path.join(directory, filename)))
    return stop_words

stop_words = load_stop_words('StopWords')

In [13]:
# Remove stop words from positive and negative words
positive_words = positive_words - stop_words
negative_words = negative_words - stop_words

# Function to clean text by removing stop words
def clean_text(text):
    tokens = word_tokenize(text)
    cleaned_tokens = [word for word in tokens if word.lower() not in stop_words and word.isalpha()]
    return ' '.join(cleaned_tokens)


In [14]:
# Read the input and output structure files
input_file = 'Input.xlsx'
output_file = 'Output Data Structure.xlsx'
df_input = pd.read_excel(input_file)
df_output_structure = pd.read_excel(output_file)

# Check if input DataFrame is read correctly
print(f"Input DataFrame:\n{df_input.head()}")


Input DataFrame:
       URL_ID                                                URL
0  bctech2011  https://insights.blackcoffer.com/ml-and-ai-bas...
1  bctech2012  https://insights.blackcoffer.com/streamlined-i...
2  bctech2013  https://insights.blackcoffer.com/efficient-dat...
3  bctech2014  https://insights.blackcoffer.com/effective-man...
4  bctech2015  https://insights.blackcoffer.com/streamlined-t...


In [16]:
# Create a dataframe for output
df_output = pd.DataFrame(columns=df_output_structure.columns)

# Verify the 'articles' directory
articles_dir = 'articles'
if not os.path.isdir(articles_dir):
    print(f"The directory '{articles_dir}' does not exist. Please check the path.")
else:
    print(f"The directory '{articles_dir}' exists.")

The directory 'articles' exists.


In [17]:
for index, row in df_input.iterrows():
    url_id = row['URL_ID']
    file_path = os.path.join(articles_dir, f'{url_id}.txt')

    # Check if the article file exists
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Separate the title and article text
    lines = content.split('\n')
    title = lines[0]
    text = ' '.join(lines[1:])

    # Clean the text by removing stop words
    cleaned_text = clean_text(text)

    # Perform text analysis on the cleaned text
    word_tokens = word_tokenize(cleaned_text)
    sent_tokens = sent_tokenize(cleaned_text)

    positive_score = sum(1 for word in word_tokens if word in positive_words)
    negative_score = sum(1 for word in word_tokens if word in negative_words)
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (len(word_tokens) + 0.000001)
    avg_sentence_length = len(word_tokens) / len(sent_tokens)
    complex_word_count = sum(1 for word in word_tokens if syllable_count(word) >= 3)
    percentage_complex_words = complex_word_count / len(word_tokens)
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_number_words_per_sentence = len(word_tokens) / len(sent_tokens)
    word_count = len(word_tokens)
    syllable_per_word = sum(syllable_count(word) for word in word_tokens) / len(word_tokens)
    personal_pronouns = count_personal_pronouns(cleaned_text)
    avg_word_length = sum(len(word) for word in word_tokens) / len(word_tokens)
    # Add the computed values to the output dataframe
    new_row = {
        'URL_ID': url_id,
        'URL': row['URL'],
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_number_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }
    df_output = pd.concat([df_output, pd.DataFrame([new_row])], ignore_index=True)

  df_output = pd.concat([df_output, pd.DataFrame([new_row])], ignore_index=True)


In [18]:
# Save the output to an Excel file
output_path = 'formatted_output.xlsx'
df_output.to_excel(output_path, index=False)


In [20]:
from openpyxl import load_workbook

In [21]:
# Adjust column widths
wb = load_workbook(output_path)
ws = wb.active

for col in ws.columns:
    max_length = 0
    column = col[0].column_letter # Get the column name
    for cell in col:
        try:
            if len(str(cell.value)) > max_length:
                max_length = len(cell.value)
        except:
            pass
    adjusted_width = (max_length + 2)
    ws.column_dimensions[column].width = adjusted_width

wb.save(output_path)
print(f"Output saved to {output_path} with adjusted column widths.")

Output saved to formatted_output.xlsx with adjusted column widths.


In [25]:
!git config --global user.name "ManasaGit99"
!git config --global user.email "manasapotnuru094@gmail.com"

In [26]:
!git clone https://github.com/ManasaGit99/TextAnalysis.git

Cloning into 'TextAnalysis'...
