In [1]:
import pandas as pd
import requests
import os
import re
from bs4 import BeautifulSoup
from textblob import TextBlob
import nltk
from nltk.corpus import cmudict

In [2]:
# Load the input workbook into a DataFrame; the scraping cell further down
# iterates over its rows.
# NOTE(review): hard-coded absolute path — only resolves on the author's machine.
excel_file_path = "C:/Users/Harry/OneDrive/Desktop/interntask/Input.xlsx"
df = pd.read_excel(excel_file_path)

In [3]:
# Column names used to look up each row's article link and its identifier.
url_column = 'URL'
id_column = 'URL_ID'

In [4]:
# Directory that receives one "<URL_ID>.txt" file per scraped article.
# NOTE(review): hard-coded absolute path — only resolves on the author's machine.
output_dir = "C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles"
os.makedirs(output_dir, exist_ok=True)  # exist_ok: re-running the cell is harmless

In [5]:
def fetch_and_extract_article(url, timeout=30):
    """Download an article page and return its title and body as plain text.

    The title is taken from ``<h1 class="entry-title">`` and the body from the
    ``<p>`` tags inside ``<div class="td-post-content">``.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled host.

    Returns:
        The title, a blank line, then the paragraphs joined by newlines
        (``'No Title'`` stands in for a missing title). An empty string is
        returned when the request or parsing fails, or when the page contains
        neither the expected title nor any body text, so callers can rely on
        a simple truthiness check.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('h1', class_='entry-title')
        content_div = soup.find('div', class_='td-post-content')

        article_text = ''
        if content_div:
            article_text = '\n'.join(
                paragraph.get_text(strip=True) for paragraph in content_div.find_all('p')
            )

        # Nothing recognisable on the page: report "no content" instead of
        # returning the placeholder title, which would look like real content.
        if not title_tag and not article_text:
            return ''

        title = title_tag.get_text(strip=True) if title_tag else 'No Title'
        return f"{title}\n\n{article_text}"
    except Exception as e:
        # Deliberately best-effort: one bad URL must not stop a batch run.
        print(f"Error fetching or extracting content from {url}: {e}")
        return ''

In [23]:
# Make sure the CMU Pronouncing Dictionary corpus is available locally, then
# load it into `d`, which syllable_count() indexes by lowercase word to get
# that word's pronunciations.
nltk.download('cmudict')
d = cmudict.dict()

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [24]:
def _estimate_syllables(word):
    """Heuristically count syllables of a lowercase word by its vowel groups."""
    letters = re.sub(r'[^a-z]', '', word)
    if not letters:
        # Numbers, punctuation-only tokens, empty strings: nothing to pronounce.
        return 0
    count = len(re.findall(r'[aeiouy]+', letters))
    # A trailing "e", "es" or "ed" is usually silent ("make", "makes",
    # "jumped"); a trailing "-le" is voiced ("table").
    if count > 1 and letters.endswith(('e', 'es', 'ed')) and not letters.endswith('le'):
        count -= 1
    return max(count, 1)


def syllable_count(word, pronunciations=None):
    """Return the number of syllables in ``word``.

    The word is looked up case-insensitively in a pronunciation dictionary.
    Each pronunciation is a list of phonemes, and phonemes ending in a digit
    are the ones counted as syllables. When several pronunciations exist the
    largest count is used.

    Words missing from the dictionary previously counted as 0 syllables, which
    silently deflated every syllable-based metric. They now fall back to a
    vowel-group estimate.

    Args:
        word: The word to measure.
        pronunciations: Optional mapping of lowercase word to a list of
            pronunciations. Defaults to the module-level dictionary ``d``.

    Returns:
        The syllable count as an int; 0 only for tokens without any letters.
    """
    if pronunciations is None:
        pronunciations = d
    word = word.lower()

    entries = pronunciations.get(word)
    if entries:
        return max(
            sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
            for pronunciation in entries
        )
    return _estimate_syllables(word)

In [16]:
def load_file_with_encoding(file_path, encodings=('utf-8', 'latin-1')):
    """Read a text file, trying each encoding in turn until one decodes it.

    Args:
        file_path: Path of the file to read.
        encodings: Encoding names to try, in order.

    Returns:
        The full decoded contents of the file.

    Raises:
        UnicodeDecodeError: If none of the encodings can decode the file.
        ValueError: If ``encodings`` is empty.
        OSError: If the file cannot be opened (e.g. it does not exist). This
            is no longer swallowed, because retrying with another encoding
            cannot fix a missing or unreadable file.
    """
    last_error = None
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError as exc:
            last_error = exc

    if last_error is None:
        raise ValueError("No encodings were supplied to try.")

    # UnicodeDecodeError takes (encoding, object, start, end, reason); calling
    # it with a single message raises TypeError instead of the intended error.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Failed to read the file with any of the given encodings: {list(encodings)}",
    ) from last_error

In [17]:
def StopWords(dir_path):
    """Collect stop words from every ``.txt`` file in ``dir_path``.

    Each non-blank line of each file becomes one entry, stripped of
    surrounding whitespace and lowercased. Files that cannot be decoded are
    reported and skipped.

    Args:
        dir_path: Directory containing the stop-word lists.

    Returns:
        A set with the entries of all readable files combined.
    """
    collected = set()
    for entry in os.listdir(dir_path):
        if not entry.endswith(".txt"):
            continue
        file_path = os.path.join(dir_path, entry)
        try:
            raw_text = load_file_with_encoding(file_path)
        except UnicodeDecodeError:
            print(f"Encoding error for file {file_path}. Skipping.")
            continue
        for raw_line in raw_text.splitlines():
            token = raw_line.strip()
            if token:
                collected.add(token.lower())
    return collected

In [18]:
def MasterDict(dir_path):
    """Load the positive and negative word lists from ``dir_path``.

    Only files named exactly ``positive-words.txt`` and ``negative-words.txt``
    are read. Every non-blank line becomes one entry, stripped and lowercased.
    A file that is absent or cannot be decoded leaves its set empty.

    Args:
        dir_path: Directory containing the two word lists.

    Returns:
        A ``(pos_words, neg_words)`` tuple of sets.
    """
    positive = set()
    negative = set()
    # File name -> the set its entries are added to.
    buckets = {
        'positive-words.txt': positive,
        'negative-words.txt': negative,
    }
    for entry in os.listdir(dir_path):
        bucket = buckets.get(entry)
        if bucket is None:
            continue
        file_path = os.path.join(dir_path, entry)
        try:
            raw_text = load_file_with_encoding(file_path)
        except UnicodeDecodeError:
            print(f"Encoding error for file {file_path}. Skipping.")
            continue
        bucket.update(
            raw_line.strip().lower()
            for raw_line in raw_text.splitlines()
            if raw_line.strip()
        )
    return positive, negative

In [19]:
def TextFiles(dir_path):
    """Read every ``.txt`` file in ``dir_path`` into memory.

    Args:
        dir_path: Directory containing the article text files.

    Returns:
        A dict mapping each file name without its ``.txt`` extension to the
        file's contents. Files that cannot be decoded are reported and
        skipped.
    """
    suffix = ".txt"
    texts = {}
    for file in os.listdir(dir_path):
        if not file.endswith(suffix):
            continue
        file_path = os.path.join(dir_path, file)
        try:
            content = load_file_with_encoding(file_path)
        except UnicodeDecodeError:
            print(f"Encoding error for file {file_path}. Skipping.")
            continue
        # Strip only the trailing extension; str.replace would also remove
        # ".txt" from the middle of a name and corrupt the key.
        texts[file[:-len(suffix)]] = content
    return texts

In [9]:
def OutputFile(file_path, results):
    """Merge analysis results into the workbook at ``file_path`` and save it.

    The workbook is read, joined with ``results`` on its ``URL_ID`` column and
    written back to the same path, so the file is overwritten.

    Args:
        file_path: Path of the Excel workbook; must contain a ``URL_ID`` column.
        results: Mapping of URL_ID to a dict of metric name -> value.

    Raises:
        KeyError: If the workbook has no ``URL_ID`` column.
    """
    df_existing = pd.read_excel(file_path)
    if 'URL_ID' not in df_existing.columns:
        raise KeyError("Missing 'URL_ID' column in the Excel file.")

    df_results = pd.DataFrame(results).T
    df_results.index.name = 'URL_ID'

    df_base = df_existing.set_index('URL_ID')
    # Because the workbook is overwritten in place, a second run would find the
    # metric columns already present and `join` would fail on the overlapping
    # names. Keep only the columns that are not about to be rewritten.
    kept_columns = [col for col in df_base.columns if col not in df_results.columns]
    df_updated = df_base[kept_columns].join(df_results)
    df_updated.to_excel(file_path)

In [14]:
# Scrape every article listed in the input workbook and save each one as
# "<URL_ID>.txt" (UTF-8) inside output_dir.
for _, row in df.iterrows():
    url_id = row[id_column]
    url = row[url_column]
    try:
        article_content = fetch_and_extract_article(url)
        if article_content:  # Check if content is not empty
            file_name = f"{url_id}.txt"
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(article_content)
            print(f"Article saved to {file_path}")
        else:
            print(f"No content extracted for URL_ID {url_id} at {url}")
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next row.
        print(f"Failed to process URL_ID {url_id} at {url}: {e}")

print("Article extraction completed.")

Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2011.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2012.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2013.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2014.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2015.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2016.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2017.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2018.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2019.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2020.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles\bctech2021.txt
Article saved to C:/Users/Harry/OneDrive/Desktop/interntask/TextF

In [20]:
def TextAnalysis(text, stopwords, pos_words, neg_words):
    """Compute sentiment and readability metrics for one article.

    Args:
        text: The article text.
        stopwords: Lowercase words to exclude from the word-based metrics.
        pos_words: Lowercase words counted towards the positive score.
        neg_words: Lowercase words counted towards the negative score.

    Returns:
        A dict keyed by metric name. Word-based metrics (scores, word count,
        complex words, syllables, word length) use the tokens left after
        stop-word removal. Sentence length uses every token of each sentence.
        Polarity and subjectivity come from TextBlob's sentiment for the whole
        text. Personal pronouns are counted over every token. Ratios are 0
        when their denominator is 0.
    """
    # Lowercase forms counted as personal pronouns.
    pronoun_forms = frozenset(['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'])

    blob = TextBlob(text)
    sentences = blob.sentences
    all_words = blob.words
    words = [w for w in all_words if w.lower() not in stopwords]

    word_count = len(words)
    # Look each word up once; the counts feed two different metrics.
    syllables = [syllable_count(w) for w in words]
    complex_word_count = sum(1 for count in syllables if count >= 3)
    avg_sentence_len = sum(len(s.words) for s in sentences) / len(sentences) if sentences else 0
    avg_word_len = sum(len(w) for w in words) / len(words) if words else 0

    pos_score = sum(1 for w in words if w.lower() in pos_words)
    neg_score = sum(1 for w in words if w.lower() in neg_words)

    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity

    perc_complex_words = (complex_word_count / word_count) * 100 if word_count else 0
    fog_index = 0.4 * (avg_sentence_len + perc_complex_words) if avg_sentence_len else 0

    # Count pronouns on the unfiltered tokens: pronouns are themselves common
    # stop words, so counting after stop-word removal drives the result to 0.
    # All-caps "US" is treated as the country name rather than the pronoun.
    personal_pronouns = sum(
        1 for w in all_words if w != 'US' and w.lower() in pronoun_forms
    )

    return {
        'POSITIVE SCORE': pos_score,
        'NEGATIVE SCORE': neg_score,
        'POLARITY SCORE': polarity,
        'SUBJECTIVITY SCORE': subjectivity,
        'AVG SENTENCE LENGTH': avg_sentence_len,
        'PERCENTAGE OF COMPLEX WORDS': perc_complex_words,
        'FOG INDEX': fog_index,
        'AVG NUMBER OF WORDS PER SENTENCE': avg_sentence_len,
        'COMPLEX WORD COUNT': complex_word_count,
        'WORD COUNT': word_count,
        'SYLLABLE PER WORD': sum(syllables) / word_count if word_count else 0,
        'PERSONAL PRONOUNS': personal_pronouns,
        'AVG WORD LENGTH': avg_word_len
    }

In [21]:
# Locations used by the analysis cell below.
# NOTE(review): hard-coded absolute paths — only resolve on the author's machine.
# text_files_dir and input_excel repeat the literals assigned to output_dir and
# excel_file_path earlier in the notebook.
text_files_dir = "C:/Users/Harry/OneDrive/Desktop/interntask/TextFiles"
stopwords_dir = "C:/Users/Harry/OneDrive/Desktop/interntask/StopWords"
master_dict_dir = "C:/Users/Harry/OneDrive/Desktop/interntask/MasterDictionary"
input_excel = "C:/Users/Harry/OneDrive/Desktop/interntask/Input.xlsx"

In [25]:
# Load the word lists once; they are shared by every article.
stopwords = StopWords(stopwords_dir)
pos_words, neg_words = MasterDict(master_dict_dir)

# Analyse each saved article; `results` maps the file stem (used as URL_ID)
# to that article's metric dict.
texts = TextFiles(text_files_dir)
results = {}
for url_id, text in texts.items():
    print(f"Processing {url_id}")
    results[url_id] = TextAnalysis(text, stopwords, pos_words, neg_words)

# OutputFile joins the metrics onto the workbook and overwrites input_excel.
OutputFile(input_excel, results)
print("Analysis complete")

Processing bctech2011
Processing bctech2012
Processing bctech2013
Processing bctech2014
Processing bctech2015
Processing bctech2016
Processing bctech2017
Processing bctech2018
Processing bctech2019
Processing bctech2020
Processing bctech2021
Processing bctech2022
Processing bctech2023
Processing bctech2024
Processing bctech2025
Processing bctech2026
Processing bctech2027
Processing bctech2028
Processing bctech2029
Processing bctech2030
Processing bctech2031
Processing bctech2032
Processing bctech2033
Processing bctech2034
Processing bctech2035
Processing bctech2036
Processing bctech2037
Processing bctech2038
Processing bctech2039
Processing bctech2040
Processing bctech2041
Processing bctech2042
Processing bctech2043
Processing bctech2044
Processing bctech2045
Processing bctech2046
Processing bctech2047
Processing bctech2048
Processing bctech2049
Processing bctech2050
Processing bctech2051
Processing bctech2052
Processing bctech2053
Processing bctech2054
Processing bctech2055
Processing