## BLACKCOFFER DATA ENGINEER ASSIGNMENT

In [3]:
import pandas as pd

In [5]:
# Workbook listing the articles to process; later cells read its
# 'URL_ID' and 'URL' columns row by row.
input_file = 'Input.xlsx'
df = pd.read_excel(input_file)
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,URL_ID,URL
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...
3,bctech2014,https://insights.blackcoffer.com/effective-man...
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...
...,...,...
142,bctech2153,https://insights.blackcoffer.com/population-an...
143,bctech2154,https://insights.blackcoffer.com/google-lsa-ap...
144,bctech2155,https://insights.blackcoffer.com/healthcare-da...
145,bctech2156,https://insights.blackcoffer.com/budget-sales-...


In [9]:
import requests
from bs4 import BeautifulSoup

In [10]:
def extract_article_text(url, timeout=30):
    """Download a page and return its headline and paragraph text.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30.

    Returns
    -------
    tuple of (str, str)
        The text of the first ``<h1>`` element (an empty string when the
        page has none) and the text of every ``<p>`` element joined by
        single spaces.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    # Without a timeout a stalled server would block the notebook forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping the error page as if it
    # were the article.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # find() returns None when the page has no <h1>; fall back to an empty
    # title rather than raising AttributeError.
    heading = soup.find('h1')
    title = heading.get_text() if heading is not None else ''

    article = ' '.join(p.get_text() for p in soup.find_all('p'))
    return title, article

In [8]:
# Download every article listed in the input sheet and store it as
# '<URL_ID>.txt' (title on the first line, body text after it).
failed_url_ids = []
for _, row in df.iterrows():
    url = row['URL']
    url_id = row['URL_ID']
    try:
        title, article = extract_article_text(url)
    except requests.RequestException as exc:
        # One unreachable page must not abort the remaining downloads;
        # remember it so the failures can be reviewed afterwards.
        print(f'Skipping {url_id} ({url}): {exc}')
        failed_url_ids.append(url_id)
        continue
    with open(f'{url_id}.txt', 'w', encoding='utf-8') as file:
        file.write(title + '\n' + article)

if failed_url_ids:
    print(f'{len(failed_url_ids)} article(s) could not be downloaded: {failed_url_ids}')

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import os

# Fetch the 'stopwords' and 'punkt' NLTK packages, in that order.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# English stop words as a set for fast membership tests in clean_text.
stop_words = set(stopwords.words('english'))

# Load positive and negative words
# Each lexicon file is split on whitespace; every resulting token becomes
# one entry of the corresponding set.
with open('positive-words.txt', 'r') as file:
    positive_words = {token for token in file.read().split()}
with open('negative-words.txt', 'r') as file:
    negative_words = {token for token in file.read().split()}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
def clean_text(text):
    """Tokenize ``text`` and keep only alphabetic, non-stop-word tokens.

    Tokens are returned in their original order and with their original
    casing; only the stop-word comparison is done on the lowercased token.
    """
    kept = []
    for token in word_tokenize(text):
        # Drop punctuation, numbers and mixed tokens.
        if not token.isalpha():
            continue
        # Drop stop words regardless of how they are capitalised.
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

In [15]:
def syllable_count(word):
    """Estimate the syllables in ``word`` by counting its vowel letters.

    Every occurrence of a, e, i, o, u or y (case-insensitive) counts as one
    syllable, and one is subtracted for words ending in "es" or "ed".
    The result is never lower than 1.
    """
    lowered = word.lower()
    vowels = sum(1 for ch in lowered if ch in 'aeiouy')
    # A trailing "es"/"ed" is treated as not adding a syllable.
    if lowered.endswith(('es', 'ed')):
        vowels -= 1
    # Every word is reported with at least one syllable.
    return max(vowels, 1)

In [16]:
def calculate_scores(text):
    """Compute sentiment and readability metrics for one article.

    Parameters
    ----------
    text : str
        Raw article text (title and body).

    Returns
    -------
    list
        Thirteen values, in this order: positive score, negative score,
        polarity score, subjectivity score, average sentence length,
        fraction of complex words, fog index, average words per sentence,
        complex word count, word count, syllables per word, personal
        pronoun count, average word length.

    Notes
    -----
    Word-based metrics use the tokens returned by ``clean_text``. When the
    text yields no words or no sentences, the affected ratios are reported
    as 0.0 instead of raising ``ZeroDivisionError``.
    """
    cleaned_words = clean_text(text)
    total_words = len(cleaned_words)
    sentence_count = len(sent_tokenize(text))

    # clean_text keeps the original casing, so compare case-insensitively;
    # otherwise capitalised words (e.g. at the start of a sentence) would
    # never match a lexicon entry.
    lowered_words = [word.lower() for word in cleaned_words]
    positive_lexicon = {entry.lower() for entry in positive_words}
    negative_lexicon = {entry.lower() for entry in negative_words}
    positive_score = sum(1 for word in lowered_words if word in positive_lexicon)
    negative_score = sum(1 for word in lowered_words if word in negative_lexicon)

    # The small constant keeps both ratios defined when a count is zero.
    polarity_score = (positive_score - negative_score) / ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score) / (total_words + 0.000001)

    # Words with more than two estimated syllables count as complex.
    complex_words = sum(1 for word in cleaned_words if syllable_count(word) > 2)

    # Guard every plain division: an empty or punctuation-only article has
    # zero words and/or zero sentences.
    avg_sentence_length = total_words / sentence_count if sentence_count else 0.0
    if total_words:
        percentage_complex_words = complex_words / total_words
        syllables_per_word = sum(syllable_count(word) for word in cleaned_words) / total_words
        avg_word_length = sum(len(word) for word in cleaned_words) / total_words
    else:
        percentage_complex_words = 0.0
        syllables_per_word = 0.0
        avg_word_length = 0.0

    fog_index = 0.4 * (avg_sentence_length + percentage_complex_words)
    # Reported as a separate column but computed the same way as the
    # average sentence length.
    avg_words_per_sentence = avg_sentence_length

    # The match is case-insensitive, so the all-caps token "US" would be
    # counted as the pronoun "us"; it is treated as the country
    # abbreviation and excluded.
    pronoun_matches = re.findall(r'\b(I|we|my|ours|us)\b', text, re.I)
    personal_pronouns = sum(1 for match in pronoun_matches if match != 'US')

    return [positive_score, negative_score, polarity_score, subjectivity_score, avg_sentence_length,
            percentage_complex_words, fog_index, avg_words_per_sentence, complex_words, total_words,
            syllables_per_word, personal_pronouns, avg_word_length]

In [17]:
# Metric columns, in the order calculate_scores returns its values.
score_columns = [
    'POSITIVE SCORE', 'NEGATIVE SCORE', 'POLARITY SCORE', 'SUBJECTIVITY SCORE',
    'AVG SENTENCE LENGTH', 'PERCENTAGE OF COMPLEX WORDS', 'FOG INDEX', 'AVG NUMBER OF WORDS PER SENTENCE',
    'COMPLEX WORD COUNT', 'WORD COUNT', 'SYLLABLE PER WORD', 'PERSONAL PRONOUNS', 'AVG WORD LENGTH'
]

output_data = []

for _, row in df.iterrows():
    url_id = row['URL_ID']
    try:
        with open(f'{url_id}.txt', 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        # Keep one output row per input URL even when its article text was
        # never saved; the metrics are left blank instead of aborting the
        # whole export.
        print(f'No saved text for {url_id}; writing blank scores')
        scores = [None] * len(score_columns)
    else:
        scores = calculate_scores(text)
    output_data.append([row['URL_ID'], row['URL']] + scores)

output_df = pd.DataFrame(output_data, columns=['URL_ID', 'URL'] + score_columns)
output_df.to_excel('Output Data Structure.xlsx', index=False)