In [1]:
import pandas as pd
import os
import re
import nltk
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from nltk.tokenize import word_tokenize, sent_tokenize

In [19]:
# Tokenizer models needed by sent_tokenize / word_tokenize.
# Recent NLTK releases load the 'punkt_tab' resource instead of the legacy
# pickled 'punkt' models, so fetch both to keep the notebook runnable on a
# fresh kernel regardless of the installed NLTK version.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)

True

CONFIGURATION

In [3]:
# Excel workbook the pipeline tries to load its task list from (via pd.read_excel).
INPUT_FILE = 'lead_sources.xlsx'
# Directory that receives one raw-text .txt file per successfully scraped page.
OUTPUT_FOLDER = "extracted_data_logs"
# Destination workbook for the final structured results table.
OUTPUT_EXCEL_PATH = "Structured_Lead_Data.xlsx"
# HTTP headers sent with every scrape request; supplies a desktop-browser User-Agent string.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

HELPER FUNCTIONS

In [4]:
def count_syllables(word):
    """Estimate the number of syllables in a word (used for the Fog Index).

    Heuristic: every run of consecutive vowels (a, e, i, o, u, y) counts as
    one syllable; a trailing 'e' and a trailing 'es'/'ed' each subtract one.
    An empty string yields 0; any non-empty word yields at least 1.
    """
    lowered = word.lower()
    if lowered == "":
        return 0

    vowel_chars = "aeiouy"
    total = 0
    previous_was_vowel = False
    # A syllable is counted each time a vowel run starts.
    for ch in lowered:
        current_is_vowel = ch in vowel_chars
        if current_is_vowel and not previous_was_vowel:
            total += 1
        previous_was_vowel = current_is_vowel

    # Suffix adjustments: final 'e' and the 'es'/'ed' endings.
    if lowered.endswith('e'):
        total -= 1
    if lowered.endswith(('es', 'ed')):
        total -= 1

    return max(1, total)

In [5]:
def calculate_nlp_metrics(text):
    """Compute readability and complexity metrics for a block of text.

    Sentences come from ``sent_tokenize``; words are the lower-cased,
    purely alphabetic tokens from ``word_tokenize``. A word is "complex"
    when ``count_syllables`` reports more than two syllables.

    Args:
        text: The text to analyse.

    Returns:
        dict with keys:
            'Word_Count'          - number of alphabetic words (0 for empty text),
            'Avg_Sentence_Length' - words per sentence, rounded to 2 places,
            'Complex_Word_Count'  - number of complex words,
            'Fog_Index'           - 0.4 * (avg sentence length + % complex words),
                                    rounded to 2 places,
            'Avg_Word_Length'     - mean characters per word, rounded to 2 places.
        All ratios are 0.0 when the text contains no words or sentences.
    """
    sentences = sent_tokenize(text)
    words = [word.lower() for word in word_tokenize(text) if word.isalpha()]

    # Keep the true counts; only the divisions are guarded, so empty input
    # reports zeros instead of a phantom one-word, one-sentence document.
    num_sentences = len(sentences)
    num_words = len(words)

    # Complex words (> 2 syllables)
    num_complex = sum(1 for w in words if count_syllables(w) > 2)

    avg_sent_len = num_words / num_sentences if num_sentences else 0.0
    pct_complex = (num_complex / num_words) * 100 if num_words else 0.0
    fog_index = 0.4 * (avg_sent_len + pct_complex)
    avg_word_len = sum(len(w) for w in words) / num_words if num_words else 0.0

    return {
        'Word_Count': num_words,
        'Avg_Sentence_Length': round(avg_sent_len, 2),
        'Complex_Word_Count': num_complex,
        'Fog_Index': round(fog_index, 2),
        'Avg_Word_Length': round(avg_word_len, 2),
    }

In [6]:
def scrape_url(url):
    """Fetch a page and extract its first <h1> as the title plus all <p> text.

    The request is sent with the module-level HEADERS and ``timeout=15``.

    Args:
        url: Address of the page to download.

    Returns:
        tuple[str, str]: ``(full_text, title)``. ``full_text`` is the title,
        a blank line, then the non-empty paragraph texts joined by newlines.
        ``title`` is "No Title Found" when the page has no <h1>.
        On any failure (network, HTTP status, parsing) nothing is raised;
        the function returns ``("Error: <message>", "")`` instead.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Generic selectors for broad compatibility
        heading = soup.find('h1')
        title = heading.get_text().strip() if heading is not None else "No Title Found"

        # Filter on the *stripped* text so whitespace-only <p> tags do not
        # inject blank lines into the body.
        paragraph_texts = (p.get_text().strip() for p in soup.find_all('p'))
        body_text = "\n".join(chunk for chunk in paragraph_texts if chunk)

        return f"{title}\n\n{body_text}", title
    except Exception as e:
        # Deliberate best-effort: callers detect failure via the empty title
        # and the "Error: " prefix rather than an exception.
        return f"Error: {str(e)}", ""

MAIN EXECUTION PIPELINE

In [12]:
def run_pipeline():
    """Scrape every task URL, log its raw text, and build one result row per URL.

    Tasks are loaded from INPUT_FILE with ``pd.read_excel``; each record must
    provide 'URL' and 'URL_ID'. If the workbook cannot be loaded, two public
    demo URLs are used instead so the pipeline can still run.

    Side effects:
        Creates OUTPUT_FOLDER if needed, writes ``<URL_ID>.txt`` there for
        every successfully scraped page, and prints progress messages.

    Returns:
        list[dict]: One dict per task. Successful rows contain 'Lead_ID',
        'Source_URL', 'Title', 'Status' ('Success') plus every key returned
        by ``calculate_nlp_metrics``. Failed rows contain only 'Lead_ID',
        'Source_URL' and 'Status' ('Failed').
    """
    demo_tasks = [
        {'URL_ID': 'DEMO_01', 'URL': 'https://en.wikipedia.org/wiki/Data_science'},
        {'URL_ID': 'DEMO_02', 'URL': 'https://www.bbc.com/news/business'},
    ]

    # Loading Input (With Fallback for Demo)
    try:
        df_input = pd.read_excel(INPUT_FILE)
        tasks = df_input.to_dict('records')
        print(f"Loaded {len(tasks)} URLs from {INPUT_FILE}")
    except FileNotFoundError:
        print("Input file not found. Running in DEMO MODE with public URLs...")
        tasks = demo_tasks
    except Exception as exc:
        # Still fall back to the demo, but report the real reason instead of
        # claiming the file is missing. KeyboardInterrupt/SystemExit are no
        # longer swallowed because only Exception subclasses are caught.
        print(f"Could not read {INPUT_FILE} ({exc}). Running in DEMO MODE with public URLs...")
        tasks = demo_tasks

    results = []
    output_dir = Path(OUTPUT_FOLDER)
    output_dir.mkdir(parents=True, exist_ok=True)

    for item in tasks:
        url = item['URL']
        uid = item['URL_ID']
        print(f"Processing {uid}...")

        full_text, title = scrape_url(url)

        # scrape_url signals failure with an empty title AND an "Error: "
        # prefixed message. Checking both avoids misclassifying a real page
        # whose heading merely starts with the word "Error".
        scrape_failed = title == "" and full_text.startswith("Error: ")

        if scrape_failed:
            results.append({'Lead_ID': uid, 'Source_URL': url, 'Status': 'Failed'})
            continue

        # Saving Raw Text Log
        (output_dir / f"{uid}.txt").write_text(full_text, encoding="utf-8")

        # Analysis
        metrics = calculate_nlp_metrics(full_text)

        # CRM-Ready Row
        row = {
            'Lead_ID': uid,
            'Source_URL': url,
            'Title': title,
            'Status': 'Success',
        }
        row.update(metrics)
        results.append(row)

    return results


In [15]:
# Run the pipeline and export its result rows to an Excel workbook.
if __name__ == "__main__":
    results = run_pipeline() # list of per-URL result dicts returned by the pipeline
    # One DataFrame row per result dict.
    df_output = pd.DataFrame(results)
    # index=False keeps the DataFrame's row index out of the exported sheet.
    df_output.to_excel(OUTPUT_EXCEL_PATH, index=False)
    print(f"\nSUCCESS! Results saved to {OUTPUT_EXCEL_PATH}")


Input file not found. Running in DEMO MODE with public URLs...
Processing DEMO_01...
Processing DEMO_02...

SUCCESS! Results saved to Structured_Lead_Data.xlsx
