# **DATA EXTRACTION AND NLP**

### **OBJECTIVE** 
<p>The objective of this task is to extract the text of articles from the given URLs and perform textual<br>
analysis to compute a set of explained variables, while ensuring that the program extracts only the article<br>
title and the article text. It should not extract the website header, footer, or anything<br>
other than the article text.</p>

**INSTALL AND IMPORT THE NECESSARY LIBRARIES**

In [1]:
!pip install requests



In [2]:
!pip install beautifulsoup4



In [3]:
!pip install openpyxl



In [4]:
import pandas as pd

In [5]:
# Load the input workbook from the notebook's working directory.
df = pd.read_excel(io="Input.xlsx")

In [6]:
# Keep only the identifier and address columns; later cells work from this frame.
urls = df.loc[:, ["URL_ID", "URL"]]
print(urls)

         URL_ID                                                URL
0    bctech2011  https://insights.blackcoffer.com/ml-and-ai-bas...
1    bctech2012  https://insights.blackcoffer.com/streamlined-i...
2    bctech2013  https://insights.blackcoffer.com/efficient-dat...
3    bctech2014  https://insights.blackcoffer.com/effective-man...
4    bctech2015  https://insights.blackcoffer.com/streamlined-t...
..          ...                                                ...
142  bctech2153  https://insights.blackcoffer.com/population-an...
143  bctech2154  https://insights.blackcoffer.com/google-lsa-ap...
144  bctech2155  https://insights.blackcoffer.com/healthcare-da...
145  bctech2156  https://insights.blackcoffer.com/budget-sales-...
146  bctech2157  https://insights.blackcoffer.com/amazon-buy-bo...

[147 rows x 2 columns]


* Use the `requests` library to fetch each web page and BeautifulSoup to parse it and extract<br>
the article text

In [7]:
import requests
from bs4 import BeautifulSoup

In [8]:
def extract_article(url, timeout=30):
    """Download a web page and return its title and article text.

    The page is parsed with BeautifulSoup's ``html.parser``. The article text
    is the text of every ``<p>`` element found inside the first ``<article>``
    element or, when the page has none, inside the first
    ``<div class="post-content">``, joined with single spaces.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, article_text)`` tuple of strings. ``title`` is ``""`` when
        the page has no ``<title>`` element and ``article_text`` is ``""`` when
        no article container is found. ``(None, None)`` is returned when the
        request fails or the response status is not 200.
    """
    try:
        # Without a timeout a single unresponsive server would stall the run.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, malformed URLs, ...: report and move on
        # so one bad URL does not abort the whole batch.
        print(f"Failed to retrieve the article from {url}. Error: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to retrieve the article from {url}. Status code: {response.status_code}")
        return None, None

    soup = BeautifulSoup(response.content, "html.parser")

    # find() returns None when the element is absent; guard before get_text().
    title_tag = soup.find("title")
    title = title_tag.get_text() if title_tag is not None else ""

    article_body = soup.find("article")
    if article_body is None:
        article_body = soup.find("div", {"class": "post-content"})

    if article_body is None:
        return title, ""

    paragraphs = [p.get_text() for p in article_body.find_all("p")]
    return title, " ".join(paragraphs)
            
            

* Save the extracted content to text files

In [9]:
import os

In [10]:
# Fetch every URL and store "<title>\n\n<article text>" as
# extracted_articles/<URL_ID>.txt; URLs without a title or text are skipped.
os.makedirs("extracted_articles", exist_ok=True)

for url_id, url in zip(urls["URL_ID"], urls["URL"]):
    title, article_text = extract_article(url)

    if not (title and article_text):
        print(f"Skipping article {url_id}.")
        continue

    with open(f"extracted_articles/{url_id}.txt", "w", encoding="utf-8") as f:
        f.write(f"{title}\n\n{article_text}")
    print(f"Article {url_id} saved successfully.")

Article bctech2011 saved successfully.
Article bctech2012 saved successfully.
Article bctech2013 saved successfully.
Article bctech2014 saved successfully.
Article bctech2015 saved successfully.
Article bctech2016 saved successfully.
Article bctech2017 saved successfully.
Article bctech2018 saved successfully.
Article bctech2019 saved successfully.
Article bctech2020 saved successfully.
Article bctech2021 saved successfully.
Article bctech2022 saved successfully.
Article bctech2023 saved successfully.
Article bctech2024 saved successfully.
Article bctech2025 saved successfully.
Article bctech2026 saved successfully.
Article bctech2027 saved successfully.
Article bctech2028 saved successfully.
Article bctech2029 saved successfully.
Article bctech2030 saved successfully.
Article bctech2031 saved successfully.
Article bctech2032 saved successfully.
Article bctech2033 saved successfully.
Article bctech2034 saved successfully.
Article bctech2035 saved successfully.
Article bctech2036 saved 

In [11]:
!pip install nltk



In [12]:
!pip install textstat



In [None]:
* Unzip and Load Resources 
* Extract Text and Compute Variables 
* Save Computed Variables 

In [13]:
import zipfile
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import cmudict
import textstat

In [14]:
# NLTK resources used by this notebook. Recent NLTK releases load the
# "punkt_tab" data for word_tokenize / sent_tokenize while older releases load
# "punkt", so both are requested to keep the tokenizers working on either.
for resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "stopwords",
    "cmudict",
):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\P\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\P\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\P\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\P\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [15]:
# Base folder for the zip archives and the extracted articles. The working
# directory is used instead of a machine-specific absolute path: it is where
# "Input.xlsx" is read from and where "extracted_articles/" is written, so the
# notebook stays consistent and runs unchanged on another machine.
path = os.getcwd()

In [16]:
# Unpack both resource archives next to where they are stored.
for archive_name in ("StopWords.zip", "MasterDictionary.zip"):
    with zipfile.ZipFile(os.path.join(path, archive_name), "r") as zip_ref:
        zip_ref.extractall(path)

In [17]:
# Folders (all directly under `path`) read by the following cells.
stopwords_path, master_dict_path, extracted_path = (
    os.path.join(path, folder_name)
    for folder_name in ("StopWords", "MasterDictionary", "extracted_articles")
)

In [18]:
# Stop-word lists expected inside `stopwords_path`.
stopwords_files = [
    "StopWords_Auditor.txt",
    "StopWords_Currencies.txt",
    "StopWords_DatesandNumbers.txt",
    "StopWords_Generic.txt",
    "StopWords_GenericLong.txt",
    "StopWords_Geographic.txt",
    "StopWords_Names.txt"
]

# Merge every list into one lower-cased set. Tokens are lower-cased before
# they are compared against this collection, so an entry stored in any other
# case could never match. The result stays a set (not a list) so each
# membership test is constant-time.
all_stopwords = set()
for stopwords_file in stopwords_files:
    file_path = os.path.join(stopwords_path, stopwords_file)
    with open(file_path, "r", encoding="ISO-8859-1") as file:
        for line in file:
            # NOTE(review): some lists presumably annotate entries as
            # "TERM | note" - confirm against the files. Only the part before
            # the first "|" is kept; lines without a "|" are unaffected.
            term = line.split("|")[0].strip().lower()
            if term:
                all_stopwords.add(term)
    

In [19]:
# Locations of the two sentiment word lists inside the master dictionary folder.
positive_words_file = os.path.join(master_dict_path, "positive-words.txt")
negative_words_file = os.path.join(master_dict_path, "negative-words.txt")

# Each file holds one word per line; load the lines into a set for fast lookup.
with open(positive_words_file, "r", encoding="ISO-8859-1") as handle:
    positive_words = set(handle.read().splitlines())

with open(negative_words_file, "r", encoding="ISO-8859-1") as handle:
    negative_words = set(handle.read().splitlines())

In [23]:
# CMU Pronouncing Dictionary lookup: lower-case word -> list of pronunciations,
# each pronunciation being a list of phoneme strings.
d = cmudict.dict()


def count_syllables(word):
    """Return the syllable count of `word` from its first CMUdict pronunciation.

    A syllable is counted for every phoneme whose last character is a digit
    (CMUdict attaches a stress digit to vowel sounds). Words that are not in
    the dictionary are treated as having one syllable.
    """
    key = word.lower()
    if key not in d:
        return 1
    first_pronunciation = d[key][0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())

def is_complex_word(word):
    """Return True when `word` has more than two syllables.

    This is the "complex word" notion used by the Gunning Fog index that is
    computed from this flag; two-syllable words are not complex.
    """
    return count_syllables(word) > 2
def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is case-insensitive and works on whole words, so a pronoun next
    to punctuation ("us.", "ours!") is counted while words that merely contain
    the letters ("bus", "museum") are not. The all-caps token "US" is skipped
    because it denotes the country rather than the pronoun.

    Args:
        text: Raw article text.

    Returns:
        The number of pronoun occurrences; 0 for empty text.
    """
    import re  # local import keeps this cell self-contained

    if not text:
        return 0
    matches = re.findall(r"\b(?:I|we|my|ours|us)\b", text, flags=re.IGNORECASE)
    return sum(1 for match in matches if match != "US")


# One list per output column. The loop below appends exactly one entry per row
# of `urls`, in row order, so every list lines up with urls["URL_ID"] and
# urls["URL"] when the columns are combined into a table.
positive_scores = []
negative_scores = []
polarity_scores = []
subjectivity_scores = []
average_sentence_lengths = []
percentage_complex_words_list = []
fog_indices = []
average_words_per_sentence_list = []
complex_word_counts = []
word_counts = []
syllables_per_word_list = []
personal_pronouns = []
average_word_lengths = []

# Built once outside the loop: membership tests against a set are
# constant-time however `all_stopwords` itself is stored.
stopword_lookup = set(all_stopwords)

# Iterate over the URL ids (not os.listdir) so the results keep the order of
# `urls`, stray files in the folder are ignored, and an article that was
# skipped during extraction still gets a (blank) row instead of shifting
# every later row.
for url_id in urls["URL_ID"]:
    article_file = os.path.join(extracted_path, f"{url_id}.txt")

    if os.path.isfile(article_file):
        # The articles are written as UTF-8, so they are read back as UTF-8;
        # decoding them as ISO-8859-1 would garble every non-ASCII character.
        with open(article_file, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        words = word_tokenize(text)
        sentences = sent_tokenize(text)

        # Lower-case the tokens, drop stop words, and drop tokens without any
        # letter or digit: word_tokenize also emits punctuation marks, which
        # must not be counted as words.
        filtered_text = [
            word.lower()
            for word in words
            if any(ch.isalnum() for ch in word) and word.lower() not in stopword_lookup
        ]
        word_count = len(filtered_text)

        positive_score = sum(1 for word in filtered_text if word in positive_words)
        negative_score = sum(1 for word in filtered_text if word in negative_words)
        # The small constant keeps the divisions defined when a count is zero.
        polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
        subjectivity_score = (positive_score + negative_score) / (word_count + 0.000001)

        average_sentence_length = word_count / len(sentences) if len(sentences) > 0 else 0
        average_words_per_sentence = average_sentence_length

        complex_words = [word for word in filtered_text if is_complex_word(word)]
        complex_word_count = len(complex_words)
        percentage_complex_words = (complex_word_count / word_count) * 100 if word_count > 0 else 0
        fog_index = 0.4 * (average_sentence_length + percentage_complex_words)

        average_word_length = sum(len(word) for word in filtered_text) / word_count if word_count > 0 else 0
        syllable_count = sum(count_syllables(word) for word in filtered_text)
        syllables_per_word = syllable_count / word_count if word_count > 0 else 0
        personal_pronouns_count = count_personal_pronouns(text)
    else:
        # No text was saved for this URL: keep the row, leave its metrics blank.
        print(f"No extracted article found for {url_id}; its metrics are left empty.")
        missing = float("nan")
        positive_score = negative_score = polarity_score = subjectivity_score = missing
        average_sentence_length = average_words_per_sentence = missing
        complex_word_count = percentage_complex_words = fog_index = missing
        word_count = syllables_per_word = personal_pronouns_count = missing
        average_word_length = missing

    positive_scores.append(positive_score)
    negative_scores.append(negative_score)
    # Append the scalar score for this article (not the list itself).
    polarity_scores.append(polarity_score)
    subjectivity_scores.append(subjectivity_score)
    average_sentence_lengths.append(average_sentence_length)
    percentage_complex_words_list.append(percentage_complex_words)
    fog_indices.append(fog_index)
    average_words_per_sentence_list.append(average_words_per_sentence)
    complex_word_counts.append(complex_word_count)
    word_counts.append(word_count)
    syllables_per_word_list.append(syllables_per_word)
    personal_pronouns.append(personal_pronouns_count)
    average_word_lengths.append(average_word_length)

In [28]:
# Assemble the output table: the two identifying columns from `urls` followed
# by one column per computed metric list.
data = {
    "URL_ID": urls["URL_ID"],
    "URL": urls["URL"],
    "POSITIVE SCORE": positive_scores,
    "NEGATIVE SCORE": negative_scores,
    "POLARITY SCORE": polarity_scores,
    "SUBJECTIVITY SCORE": subjectivity_scores,
    "AVERAGE SENTENCE LENGTH": average_sentence_lengths,
    "PERCENTAGE OF COMPLEX WORDS": percentage_complex_words_list,
    "FOG INDEX": fog_indices,
    "AVERAGE NUMBER OF WORDS PER SENTENCE": average_words_per_sentence_list,
    "COMPLEX WORD COUNT": complex_word_counts,
    "WORD COUNT": word_counts,
    "SYLLABLE PER WORD": syllables_per_word_list,
    "PERSONAL PRONOUNS": personal_pronouns,
    # Use the per-article list; the scalar `average_word_length` only holds the
    # value of the last article and would be repeated on every row.
    "AVERAGE WORD LENGTH": average_word_lengths
}
results_df = pd.DataFrame(data)
    

In [29]:
# Write the results table to an Excel workbook, without the row index.
results_df.to_excel(excel_writer="Output.xlsx", index=False)

In [30]:
# Write the same results table as CSV, without the row index.
results_df.to_csv(path_or_buf="Output.csv", index=False)