## BlackCoffer Assignment

In [6]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Read the DataFrame outside the loop
df = pd.read_excel("input.xlsx")

# Create a folder to store text files
folder_name = "Articles_Txt_Files"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# Loop
for i in range(len(df["URL"])):
    try:
        url = df["URL"][i]
        url_id = df["URL_ID"][i]

        response = requests.get(url)

        soup = BeautifulSoup(response.content, "html.parser")

        print(f"URL_ID of URL {i+1} is:", df["URL_ID"][i])

        Article_Text = ""
        h1 = soup.find("body").find("h1")
        title = h1.text.strip()
        if h1:
            Article_Text += f"Title: {title}\n\n"

        paragraphs = soup.find("article").find_all("p")
    #     print("Text of the URL :")

        for paragraph in paragraphs:
            cleaned_text = paragraph.text.strip()
            Article_Text += cleaned_text + "\n"
    #         print(cleaned_text)

        # Save the article text in file
        file_path = os.path.join(folder_name, f"{url_id}.txt")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(Article_Text)

        print(f"\nArticle Text of the URL saved in file: {file_path}\n")
        print("-----------------------------------------------------------------------------------------------------------------------------\n")

    except Exception as e:
        print("URL_ID:", url_id, ", Error:", e)
        print("Sorry, but the page you are looking for doesn't exist.")
        print(f"Error occurred while processing URL {url}\n")
        print("-------------------------------------------------------------------------------------------------------------------------------")

URL_ID of URL 1 is: blackassign0001

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0001.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 2 is: blackassign0002

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0002.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 3 is: blackassign0003

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0003.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 4 is: blackassign0004

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0004.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 5 is: blackass

URL_ID of URL 35 is: blackassign0035

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0035.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 36 is: blackassign0036
URL_ID: blackassign0036 , Error: 'NoneType' object has no attribute 'text'
Sorry, but the page you are looking for doesn't exist.
Error occurred while processing URL https://insights.blackcoffer.com/how-neural-networks-can-be-applied-in-various-areas-in-the-future/

-------------------------------------------------------------------------------------------------------------------------------
URL_ID of URL 37 is: blackassign0037

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0037.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 38 is: blackassign0038

Article Text of the URL saved in file: Art

URL_ID of URL 68 is: blackassign0068

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0068.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 69 is: blackassign0069

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0069.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 70 is: blackassign0070

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0070.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 71 is: blackassign0071

Article Text of the URL saved in file: Articles_Txt_Files\blackassign0071.txt

-----------------------------------------------------------------------------------------------------------------------------

URL_ID of URL 72 is: bla

In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

text_dir = "Articles_Txt_Files"
stopwords_dir = "StopWords"
sentiment_dir = "MasterDictionary"

# Load all stop words from the stopwords directory and store in the set variable
stop_words = set()
for file_name in os.listdir(stopwords_dir):
    with open(os.path.join(stopwords_dir, file_name), 'r', encoding='ISO-8859-1') as f:
        stop_words.update(set(f.read().splitlines()))

# Load all text files from the directory and store in a list (docs)
docs = []
for file_name in os.listdir(text_dir):
    file_path = os.path.join(text_dir, file_name)
    if os.path.isdir(file_path):
        continue  # Skip directories
    with open(file_path, 'r', encoding='utf-8') as f:  # Use UTF-8 encoding
        text = f.read()
        # Tokenize the given text file
        words = word_tokenize(text)
        # Remove the stop words from the tokens
        filtered_text = [word for word in words if word.lower() not in stop_words]
        # Add each filtered tokens of each file into a list
        docs.append(filtered_text)

# Store positive and negative words from the directory
pos = set()
neg = set()

for file_name in os.listdir(sentiment_dir):
    with open(os.path.join(sentiment_dir, file_name), 'r', encoding='ISO-8859-1') as f:
        if file_name == 'positive-words.txt':
            pos.update(f.read().splitlines())
        else:
            neg.update(f.read().splitlines())

# Now collect the positive and negative words from each file
# Calculate the scores from the positive and negative words 
positive_words = []
negative_words = []
positive_score = []
negative_score = []
polarity_score = []
subjectivity_score = []

# Iterate through the list of docs
for doc in docs:
    positive_words.append([word for word in doc if word.lower() in pos])
    negative_words.append([word for word in doc if word.lower() in neg])
    positive_score.append(len(positive_words[-1]))
    negative_score.append(len(negative_words[-1]))
    polarity_score.append((positive_score[-1] - negative_score[-1]) / ((positive_score[-1] + negative_score[-1]) + 0.000001))
    subjectivity_score.append((positive_score[-1] + negative_score[-1]) / (len(doc) + 0.000001))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import re
# Download the stopwords corpus if not already downloaded
nltk.download('stopwords')

avg_sentence_length = []
Percentage_of_Complex_words = []
Fog_Index = []
complex_word_count = []
avg_syllable_word_count = []

# Avoid variable name conflict by renaming
stop_words_set = set(stopwords.words('english'))

def measure(file):
    if file.endswith('.ipynb_checkpoints'):
        return None, None, None, None, None  # Skip the file
    with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
        text = f.read()
    # remove punctuations 
    text = re.sub(r'[^\w\s.]', '', text)
    # split the given text file into sentences
    sentences = text.split('.')
    # total number of sentences in a file
    num_sentences = len(sentences)
    # total words in the file
    words = [word for word in text.split() if word.lower() not in stop_words_set]
    num_words = len(words)

    # complex words having syllable count is greater than 2
    # Complex words are words in the text that contain more than two syllables.
    complex_words = [word for word in words if count_syllables(word) > 2]

    # Syllable Count Per Word
    total_syllables = sum(count_syllables(word) for word in words)

    avg_sentence_len = num_words / num_sentences
    avg_syllable_word_count = total_syllables / len(words)
    Percent_Complex_words = len(complex_words) / num_words
    Fog_Index = 0.4 * (avg_sentence_len + Percent_Complex_words)

    return avg_sentence_len, Percent_Complex_words, Fog_Index, len(complex_words), avg_syllable_word_count

# Function to count syllables in a word
def count_syllables(word):
    if word.endswith('es'):
        word = word[:-2]
    elif word.endswith('ed'):
        word = word[:-2]
    vowels = 'aeiou'
    return sum(1 for letter in word if letter.lower() in vowels)

# iterate through each file or doc
for file in os.listdir(text_dir):
    x, y, z, a, b = measure(file)
    if x is not None:
        avg_sentence_length.append(x)
        Percentage_of_Complex_words.append(y)
        Fog_Index.append(z)
        complex_word_count.append(a)
        avg_syllable_word_count.append(b)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import os
import re
import nltk
from nltk.corpus import stopwords

# Download the stopwords corpus if not already downloaded
nltk.download('stopwords')

word_count = []
average_word_length = []
pp_count = []

# Avoid variable name conflict by renaming
stopwords_set = set(stopwords.words('english'))

def cleaned_words(file):
    with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
        text = f.read()
        text = re.sub(r'[^\w\s]', '', text)
        words = [word for word in text.split() if word.lower() not in stopwords_set]
        length = sum(len(word) for word in words)
        average_word_length = length / len(words)
    return len(words), average_word_length

def count_personal_pronouns(file):
    with open(os.path.join(text_dir, file), 'r', encoding='utf-8') as f:
        text = f.read()
        personal_pronouns = ["I", "we", "my", "ours", "us"]
        count = 0
        for pronoun in personal_pronouns:
            count += len(re.findall(r"\b" + pronoun + r"\b", text)) # \b is used to match word boundaries
    return count

for file in os.listdir(text_dir):
    if file.endswith('.ipynb_checkpoints'):
        continue  # Skip directories
    x, y = cleaned_words(file)
    word_count.append(x)
    average_word_length.append(y)
    pp_count.append(count_personal_pronouns(file))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
output_df = pd.read_excel('Output Data Structure.xlsx')

# URLs with IDs 36 and 49 do not exist (404 error), so we are going to drop these rows from the table
output_df.drop([36, 49], axis=0, inplace=True)

# These are the required parameters 
variables = [positive_score,
            negative_score,
            polarity_score,
            subjectivity_score,
            avg_sentence_length,
            Percentage_of_Complex_words,
            Fog_Index,
            avg_sentence_length,
            complex_word_count,
            word_count,
            avg_syllable_word_count,
            pp_count,
            average_word_length]

# Write the values to the dataframe
for i, var in enumerate(variables):
    output_df.iloc[:, i+2] = var

# Now save the dataframe to disk
output_df.to_csv('Output_Data_BlackCoffer.csv')

### Summary

In [None]:
# 1. In this Assignment while scrapping BlackCoffer URL from input.xlsx file, 2 URL of URL_ID = blackassign0036, blackassign0049
#    is not in the service as giving 404 error due to which the data is extracted from this two particualar file

# 2. While remaining all the URL's data (Title and text of the page) has been extracted Sucessfully in Folder named as :  
#    'Articles_Txt_Files'.

# 3. All the required parameters : positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
#    Percentage_of_Complex_words, Fog_Index, avg_sentence_length, complex_word_count, word_count, avg_syllable_word_count,
#    pp_count, average_word_length as been founded Sucessfully and has been stored in new file named as : 
#    'Output_Data_BlackCoffer.csv'.