**Data Extraction and NLP(Natural Language Procesing)**
The goal of this Project is to extract text from the provided URLs and perform text analysis to compute the variables detailed below.

Note: Due to security reasons, dataset files will not be shared publicly. If you need them, please contact me directly on LinkedIn: Gathala Dilip Kumar.

In [7]:
""" This mounts your Google Drive to access files directly from there"""
# Mount Google Drive to access files stored in your Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Install required Python libraries
!pip install pandas requests beautifulsoup4 nltk textblob numpy openpyxl



In [9]:
""" Import all the libraries you'll be using in the project"""

# Import necessary libraries for web scraping, data processing, and analysis
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
import nltk
import re
import numpy as np

# Download necessary NLTK data files (stopwords, tokenizers)
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
"""Load Input Data
   Load the input data (URLs) from an Excel file.
"""

# Load input data (URLs) from an Excel file
input_df = pd.read_excel('/content/drive/MyDrive/Input.xlsx')


In [11]:
#Prepare Stopwords and Dictionary Files

"""Load stopwords and positive/negative word dictionaries."""
# Load English stopwords from NLTK
stop_words = set(stopwords.words('english'))

# Initialize sets for positive and negative words
positive_words = set()
negative_words = set()

# Function to read files with different encodings
def read_file(file_path, encoding='utf-8'):
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return file.readlines()
    except UnicodeDecodeError:
        return read_file(file_path, encoding='latin1')

# Read and store positive words
for line in read_file('/content/drive/MyDrive/positive-words.txt'):
    positive_words.add(line.strip().lower())

# Read and store negative words
for line in read_file('/content/drive/MyDrive/negative-words.txt'):
    negative_words.add(line.strip().lower())


In [13]:
#Here we can see the list of Negative and positive words
print(positive_words)
print(negative_words)

{'complimentary', 'whooa', 'fastest', 'thrills', 'competitive', 'world-famous', 'zenith', 'dependably', 'rightness', 'eventful', 'assurances', 'wholeheartedly', 'acclaimed', 'delightfully', 'risk-free', 'dazzling', 'sensible', 'incredible', 'well-rounded', 'refreshing', 'smoothly', 'acclaim', 'envious', 'exuberant', 'magical', 'resilient', 'pain-free', 'faithfully', 'exceed', 'well-intentioned', 'rockstars', 'cherished', 'civilize', 'ergonomical', 'impassioned', 'fervently', 'comfortable', 'euphoric', 'smartest', 'enviably', 'affordably', 'glad', 'affirm', 'plusses', 'indulgent', 'progressive', 'upliftment', 'luckiest', 'accomplishments', 'useful', 'auspicious', 'simplifying', 'flourishing', 'toll-free', 'upliftingly', 'pardon', 'fantastic', 'awards', 'shiny', 'industrious', 'reclaim', 'cool', 'fair', 'productive', 'coolest', 'extoll', 'hardier', 'enlighten', 'autonomous', 'magnificent', 'picturesque', 'regal', 'sweetness', 'savvy', 'blithe', 'undisputed', 'roomy', 'affably', 'attentiv

In [14]:
#Web Scraping: Extract Text from URLs

"""Define a function to extract article text from a given URL"""
# Function to extract article text and title from a URL
def extract_article_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the article's title
    title = soup.find('h1').get_text()

    # Extract all paragraphs in the article
    paragraphs = soup.find_all('p')
    article_text = ' '.join([para.get_text() for para in paragraphs])

    return title, article_text


In [15]:
#Save Extracted Articles
"""Save each extracted article into a text file named by its URL_ID"""

# Create a directory to save extracted articles
os.makedirs('articles', exist_ok=True)

# Extract text for each URL and save it as a text file
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    url = row['URL']
    title, article_text = extract_article_text(url)

    # Save the article text to a file named with URL_ID
    with open(f'articles/{url_id}.txt', 'w', encoding='utf-8') as file:
        file.write(title + '\n' + article_text)


In [16]:
"""Text Analysis: Define Analysis Function"""
#Define a function to compute various text analysis metrics

# Function to analyze text and calculate various metrics
def analyze_text(text):
    # Tokenize text into words and sentences
    words = word_tokenize(text.lower())
    sentences = sent_tokenize(text)

    # Remove stopwords and keep only alphabetic words
    filtered_words = [word for word in words if word not in stop_words and word.isalpha()]

    # Calculate positive and negative word scores
    positive_score = sum(1 for word in filtered_words if word in positive_words)
    negative_score = sum(1 for word in filtered_words if word in negative_words)

    # Calculate polarity and subjectivity using TextBlob
    blob = TextBlob(text)
    polarity_score = blob.sentiment.polarity
    subjectivity_score = blob.sentiment.subjectivity

    # Calculate additional text metrics
    avg_sentence_length = len(filtered_words) / len(sentences)
    complex_words = [word for word in filtered_words if len(re.findall(r'[aeiou]', word)) > 2]
    percentage_complex_words = len(complex_words) / len(filtered_words) * 100
    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    avg_words_per_sentence = len(filtered_words) / len(sentences)
    complex_word_count = len(complex_words)
    word_count = len(filtered_words)
    syllable_per_word = sum(len(re.findall(r'[aeiou]', word)) for word in filtered_words) / len(filtered_words)
    personal_pronouns = sum(1 for word in filtered_words if word.lower() in ['i', 'we', 'my', 'ours', 'us'])
    avg_word_length = np.mean([len(word) for word in filtered_words])

    # Return a dictionary of the calculated metrics
    return {
        'POSITIVE SCORE': positive_score,
        'NEGATIVE SCORE': negative_score,
        'POLARITY SCORE': polarity_score,
        'SUBJECTIVITY SCORE': subjectivity_score,
        'AVG SENTENCE LENGTH': avg_sentence_length,
        'PERCENTAGE OF COMPLEX WORDS': percentage_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_words_per_sentence,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': syllable_per_word,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_length
    }


In [17]:
"""Analyze Extracted Articles"""
#Load each article, analyze the text, and save the results.
# Analyze each article and collect the results
results = []
for index, row in input_df.iterrows():
    url_id = row['URL_ID']
    try:
        # Read the saved article text
        with open(f'articles/{url_id}.txt', 'r', encoding='utf-8') as file:
            text = file.read()
        # Perform text analysis
        analysis_result = analyze_text(text)
        analysis_result['URL_ID'] = url_id
        results.append(analysis_result)
    except UnicodeDecodeError:
        print(f"Skipping URL_ID {url_id} due to encoding issues.")


In [18]:
"""Save the Results"""
#Save the analysis results to an Excel file.

# Convert the results into a DataFrame and save to an Excel file
output_df = pd.DataFrame(results)
output_df.to_excel('/content/drive/MyDrive/Output.xlsx', index=False)


In [22]:
output_df.head(20)

Unnamed: 0,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVG NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUNS,AVG WORD LENGTH,URL_ID
0,41,9,0.069339,0.386899,25.666667,58.766234,33.77316,25.666667,362,616,2.883117,1,7.501623,bctech2011
1,9,1,0.059315,0.331447,28.0,50.0,31.2,28.0,98,196,2.734694,1,7.239796,bctech2012
2,9,1,0.059315,0.331447,27.714286,52.061856,31.910457,27.714286,101,194,2.757732,1,7.268041,bctech2013
3,9,1,0.091484,0.344183,28.0,52.55102,32.220408,28.0,103,196,2.77551,1,7.265306,bctech2014
4,10,1,0.059315,0.331447,28.0,52.55102,32.220408,28.0,103,196,2.77551,1,7.316327,bctech2015
5,9,1,0.059315,0.331447,27.571429,52.331606,31.961214,27.571429,101,193,2.756477,1,7.295337,bctech2016
6,9,1,0.084142,0.325898,28.142857,51.77665,31.967803,28.142857,102,197,2.741117,1,7.187817,bctech2017
7,8,1,0.055361,0.309351,28.0,51.530612,31.812245,28.0,101,196,2.765306,1,7.229592,bctech2018
8,8,1,0.059315,0.331447,29.285714,52.682927,32.787456,29.285714,108,205,2.780488,1,7.229268,bctech2019
9,9,1,0.059315,0.331447,28.0,52.040816,32.016327,28.0,102,196,2.770408,1,7.285714,bctech2020
