In [8]:
# Ensure compatibility across huggingface_hub versions: some versions removed
# `cached_download`, which older packages expect. If it's missing, alias it
# to the newer `hf_hub_download` so imports of sentence_transformers succeed.
try:
	import huggingface_hub
	if not hasattr(huggingface_hub, "cached_download"):
		from huggingface_hub import hf_hub_download
		def cached_download(*args, **kwargs):
			return hf_hub_download(*args, **kwargs)
		huggingface_hub.cached_download = cached_download
except Exception:
	# If huggingface_hub isn't available or another issue occurs, let the
	# normal import raise an informative error later.
	pass

from sentence_transformers import SentenceTransformer
print("Ready")  # Now compatible; prints "Ready"


Ready


In [9]:
# Cell 1: Import libraries and load dataset (With huggingface_hub compatibility patch)
# Ensure compatibility across huggingface_hub versions: some versions removed
# `cached_download`, which older packages expect. If it's missing, alias it
# to the newer `hf_hub_download` so imports of sentence_transformers succeed.
try:
    import huggingface_hub
    if not hasattr(huggingface_hub, "cached_download"):
        from huggingface_hub import hf_hub_download
        def cached_download(*args, **kwargs):
            return hf_hub_download(*args, **kwargs)
        huggingface_hub.cached_download = cached_download
except Exception:
    # If huggingface_hub isn't available or another issue occurs, let the
    # normal import raise an informative error later.
    pass

import pandas as pd
import os
import time
import re
import json
import numpy as np
from bs4 import BeautifulSoup
import requests
import textstat

print("Core libraries imported. Creating folders and loading dataset...")

# Auto-create folders
os.makedirs('data', exist_ok=True)
os.makedirs('models', exist_ok=True)
print("Folders 'data/' and 'models/' ready.")

# Path to dataset
data_path = os.path.join(os.getcwd(), 'data', 'data.csv')

# Load dataset
df = pd.DataFrame()
try:
    df = pd.read_csv(data_path, encoding='utf-8')
    print(f"Dataset loaded successfully from {data_path}")
    print(f"Number of rows: {len(df)}")  # ~65
    print(f"Columns: {df.columns.tolist()}")  # ['url', 'html_content']
    print("\nSample row preview:")
    print(df.head(1).to_string())
    empty_html = df['html_content'].isna().sum() + (df['html_content'] == '').sum()
    print(f"Rows with empty/missing HTML: {empty_html}")
except FileNotFoundError:
    print("Error: data.csv not found in data/. Download from Kaggle.")
    df = pd.DataFrame({'url': [], 'html_content': []})
except UnicodeDecodeError:
    df = pd.read_csv(data_path, encoding='latin-1')
    print("Used latin-1 encoding fallback.")

# NLTK safe import
try:
    import nltk
    from nltk.tokenize import sent_tokenize
    nltk.download('punkt', quiet=True)
    print("NLTK imported successfully.")
    NLTK_AVAILABLE = True
except Exception as e:
    print(f"NLTK warning: {e}. Fallback active.")
    NLTK_AVAILABLE = False
    def sent_tokenize(text):
        return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text) if text else []

# ML libs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import joblib

# SentenceTransformer (with patch)
try:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("SentenceTransformer loaded.")
    EMBED_AVAILABLE = True
except Exception as e:
    print(f"Embedding warning: {e}. Using zero fallback (TF-IDF still works).")
    EMBED_AVAILABLE = False
    model = None
    def encode(texts):
        return [np.zeros(384) for _ in texts]

print("All imports complete. Ready for Cell 2.")


Core libraries imported. Creating folders and loading dataset...
Folders 'data/' and 'models/' ready.
Error: data.csv not found in data/. Download from Kaggle.
NLTK imported successfully.
SentenceTransformer loaded.
All imports complete. Ready for Cell 2.


In [12]:
# Cell 2: HTML Parsing and Text Extraction (Safe for empty df)
def parse_html(html_content, url):
    if not html_content or len(str(html_content)) < 100:
        return {'url': url, 'title': 'Empty HTML', 'body_text': '', 'word_count': 0}
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        title_tag = soup.title or soup.find('h1')
        title = title_tag.get_text().strip() if title_tag else 'No title'
        for unwanted in soup(['script', 'style', 'nav', 'header', 'footer', 'aside']):
            unwanted.decompose()
        content_selectors = ['p', 'article', 'main', 'div[class*="content"]', 'div[class*="article"]', 'section']
        body_parts = []
        for selector in content_selectors:
            elements = soup.select(selector)
            body_parts.extend([el.get_text().strip() for el in elements if el.get_text().strip()])
        body_text = ' '.join(body_parts) if body_parts else soup.get_text()
        body_text = re.sub(r'\s+', ' ', body_text).strip().lower()
        body_text = re.sub(r'[^\w\s\.\-]', '', body_text)
        word_count = len(re.findall(r'\b\w+\b', body_text))
        return {
            'url': url,
            'title': title[:200] + '...' if len(title) > 200 else title,
            'body_text': body_text,
            'word_count': max(0, word_count)
        }
    except Exception as e:
        print(f"Error parsing {url}: {str(e)[:100]}...")
        return {'url': url, 'title': 'Parse error', 'body_text': '', 'word_count': 0}

print("Starting HTML parsing...")
parsed_data = []
failed = 0
total = len(df)
if total == 0:
    print("Warning: No data loaded. Run Cell 1 first.")
    extracted_df = pd.DataFrame(columns=['url', 'title', 'body_text', 'word_count'])
else:
    for idx, row in df.iterrows():
        if idx % 10 == 0:
            print(f"Processed {idx}/{total} rows...")
        result = parse_html(row['html_content'], row['url'])
        parsed_data.append(result)
        if result['word_count'] == 0:
            failed += 1
    extracted_df = pd.DataFrame(parsed_data)

extracted_df.to_csv('data/extracted_content.csv', index=False, encoding='utf-8')

print(f"\nParsing complete! Total rows: {total}")
if total > 0:
    print(f"Failed/empty: {failed} ({failed/total*100:.1f}%)")
    print(f"Average word count: {extracted_df['word_count'].mean():.0f}")
    print(f"Min/Max words: {extracted_df['word_count'].min()}/{extracted_df['word_count'].max()}")
    print("\nSample extracted data (first 2 rows):")
    print(extracted_df[['url', 'title', 'word_count']].head(2).to_string(index=False))
else:
    print("No data to process. Ensure data.csv loads 65 rows in Cell 1.")

global_extracted_df = extracted_df


Starting HTML parsing...

Parsing complete! Total rows: 0
No data to process. Ensure data.csv loads 65 rows in Cell 1.


In [14]:
import os
print("Current directory:", os.getcwd())  # Should be C:\Users\MOHAMMED HANEES\OneDrive\Desktop\Big Data\seo-content-detector
print("Data folder exists:", os.path.exists('data'))
print("data.csv exists:", os.path.exists(os.path.join('data', 'data.csv')))
if os.path.exists(os.path.join('data', 'data.csv')):
    import pandas as pd
    df_test = pd.read_csv('data/data.csv', nrows=2)  # Test load 2 rows
    print("Test load success. Rows preview:")
    print(df_test)
else:
    print("File missing—copy data.csv to data/ folder.")


Current directory: c:\Users\MOHAMMED HANEES\OneDrive\Desktop\Big Data\seo-content-detector\notebooks
Data folder exists: True
data.csv exists: False
File missing—copy data.csv to data/ folder.


In [15]:
# Cell 1: Import libraries and load dataset (With path debug & fallback)
# Huggingface compatibility patch
try:
    import huggingface_hub
    if not hasattr(huggingface_hub, "cached_download"):
        from huggingface_hub import hf_hub_download
        def cached_download(*args, **kwargs):
            return hf_hub_download(*args, **kwargs)
        huggingface_hub.cached_download = cached_download
except Exception:
    pass

import pandas as pd
import os
import time
import re
import json
import numpy as np
from bs4 import BeautifulSoup
import requests
import textstat

print("Core libraries imported. Debug: Current dir =", os.getcwd())

# Auto-create folders
os.makedirs('data', exist_ok=True)
os.makedirs('models', exist_ok=True)
print("Folders ready.")

# Paths: Relative first, then full fallback
data_path = os.path.join('data', 'data.csv')
full_path = r"C:\Users\MOHAMMED HANEES\OneDrive\Desktop\Big Data\seo-content-detector\data\data.csv"  # Adjust if needed
print(f"Relative path exists: {os.path.exists(data_path)}")
print(f"Full path exists: {os.path.exists(full_path)}")

df = pd.DataFrame()
try:
    if os.path.exists(data_path):
        df = pd.read_csv(data_path, encoding='utf-8')
    elif os.path.exists(full_path):
        df = pd.read_csv(full_path, encoding='utf-8')
        print(f"Loaded from full path: {full_path}")
    else:
        raise FileNotFoundError("data.csv not found. Copy to data/ or check path.")
    
    print(f"Dataset loaded! Rows: {len(df)}")  # 65 expected
    print(f"Columns: {df.columns.tolist()}")  # ['url', 'html_content']
    print("\nSample row (first URL/HTML snippet):")
    print(df.head(1)[['url']].to_string() + "\n(HTML truncated for preview)")
    empty_html = df['html_content'].isna().sum() + (df['html_content'] == '').sum()
    print(f"Empty HTML rows: {empty_html}")
except Exception as e:
    print(f"Load error: {e}. Download data.csv from Kaggle to data/.")

# NLTK
try:
    import nltk
    from nltk.tokenize import sent_tokenize
    nltk.download('punkt', quiet=True)
    print("NLTK ready.")
    NLTK_AVAILABLE = True
except Exception as e:
    print(f"NLTK fallback: {e}")
    NLTK_AVAILABLE = False
    def sent_tokenize(text):
        return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text) if text else []

# ML libs
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import joblib

# SentenceTransformer
try:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("SentenceTransformer loaded.")
    EMBED_AVAILABLE = True
except Exception as e:
    print(f"Embedding fallback: {e}")
    EMBED_AVAILABLE = False
    model = None
    def encode(texts):
        return [np.zeros(384) for _ in texts]

print("All ready. Run debug temp cell if rows != 65.")


Core libraries imported. Debug: Current dir = c:\Users\MOHAMMED HANEES\OneDrive\Desktop\Big Data\seo-content-detector\notebooks
Folders ready.
Relative path exists: False
Full path exists: True
Loaded from full path: C:\Users\MOHAMMED HANEES\OneDrive\Desktop\Big Data\seo-content-detector\data\data.csv
Dataset loaded! Rows: 81
Columns: ['url', 'html_content']

Sample row (first URL/HTML snippet):
                                              url
0  https://www.cm-alliance.com/cybersecurity-blog
(HTML truncated for preview)
Empty HTML rows: 12
NLTK ready.
SentenceTransformer loaded.
All ready. Run debug temp cell if rows != 65.


In [16]:
# Cell 3: Text Preprocessing & Feature Engineering (Safe for empty)
features_df = global_extracted_df.copy() if 'global_extracted_df' in globals() else pd.DataFrame()  # From Cell 2

if features_df.empty:
    print("Warning: No extracted data. Run Cells 1-2 first.")
    # Create empty with columns
    features_df = pd.DataFrame(columns=['url', 'title', 'body_text', 'word_count', 'sentence_count', 'flesch_reading_ease', 'top_keywords', 'embedding'])
else:
    # TF-IDF
    non_empty_texts = [row['body_text'] for _, row in features_df.iterrows() if row['body_text']]
    if non_empty_texts:
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english', ngram_range=(1,2))
        tfidf_matrix = vectorizer.fit_transform(non_empty_texts)
        feature_names = vectorizer.get_feature_names_out()
    else:
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        tfidf_matrix = np.zeros((len(features_df), 100))
        feature_names = np.array([])

    print("Extracting features...")
    for idx, row in features_df.iterrows():
        if idx % 10 == 0:
            print(f"Processed {idx}/{len(features_df)}...")
        
        body_text = row['body_text']
        sentences = sent_tokenize(body_text) if NLTK_AVAILABLE else re.split(r'\. ', body_text)
        sentence_count = len(sentences)
        flesch_score = textstat.flesch_reading_ease(body_text) if len(body_text) > 100 else 0.0
        
        # Embedding
        if EMBED_AVAILABLE and body_text:
            embedding = model.encode([body_text])[0].tolist()
        else:
            embedding = [0.0] * 384
        
        # Top keywords
        top_keywords = ''
        if non_empty_texts and idx < len(tfidf_matrix) and body_text:
            try:
                doc_idx = non_empty_texts.index(body_text)
                top_idx = tfidf_matrix[doc_idx].toarray().argsort()[-5:][::-1]
                top_keywords = '|'.join([feature_names[i] for i in top_idx if i < len(feature_names)])
            except:
                pass
        
        # Add columns if missing
        for col in ['sentence_count', 'flesch_reading_ease', 'top_keywords', 'embedding']:
            if col not in features_df.columns:
                features_df[col] = None
        features_df.at[idx, 'sentence_count'] = sentence_count
        features_df.at[idx, 'flesch_reading_ease'] = round(flesch_score, 2)
        features_df.at[idx, 'top_keywords'] = top_keywords
        features_df.at[idx, 'embedding'] = json.dumps(embedding)

    # Select columns safely
    cols = ['url', 'title', 'body_text', 'word_count', 'sentence_count', 'flesch_reading_ease', 'top_keywords', 'embedding']
    existing_cols = [c for c in cols if c in features_df.columns]
    features_df = features_df[existing_cols]

features_df.to_csv('data/features.csv', index=False, encoding='utf-8')

print(f"\nFeature extraction complete! Rows: {len(features_df)}")
if len(features_df) > 0:
    print(f"Avg sentence count: {features_df['sentence_count'].mean():.1f}" if 'sentence_count' in features_df.columns else "No features added.")
    print(f"Avg Flesch score: {features_df['flesch_reading_ease'].mean():.1f}" if 'flesch_reading_ease' in features_df.columns else "No features added.")
    print("\nSample row:")
    sample_cols = ['url', 'word_count'] + [c for c in ['sentence_count', 'flesch_reading_ease', 'top_keywords'] if c in features_df.columns]
    print(features_df[sample_cols].head(1).to_string(index=False))
else:
    print("Empty features. Fix data load in Cell 1.")

global_features_df = features_df
global_vectorizer = vectorizer  # For Cell 6



Feature extraction complete! Rows: 0
Empty features. Fix data load in Cell 1.


In [17]:
# Cell 4: Duplicate Detection (Safe for empty features)
if 'global_features_df' in globals():
    features_df = global_features_df.copy()
else:
    print("Warning: No features_df from Cell 3. Creating empty.")
    features_df = pd.DataFrame()

if features_df.empty:
    print("No data for duplicates. Run Cells 1-3 first.")
    dups_df = pd.DataFrame(columns=['url1', 'url2', 'similarity'])
    dups_df.to_csv('data/duplicates.csv', index=False)
    total = 0
    thin_count = 0
else:
    # Load embeddings
    embeddings = []
    valid_rows = []
    for idx, row in features_df.iterrows():
        try:
            emb = json.loads(row['embedding'])
            if len(emb) == 384:
                embeddings.append(emb)
                valid_rows.append(idx)
        except:
            continue

    embeddings = np.array(embeddings) if embeddings else np.zeros((1, 384))

    # Similarity matrix
    if len(embeddings) > 1:
        sim_matrix = cosine_similarity(embeddings)
    else:
        sim_matrix = np.array([[0]])

    # Flag duplicates
    threshold = 0.80
    duplicates = []
    for i in range(len(valid_rows)):
        for j in range(i+1, len(valid_rows)):
            sim = sim_matrix[i, j]
            if sim > threshold:
                url1 = features_df.iloc[valid_rows[i]]['url']
                url2 = features_df.iloc[valid_rows[j]]['url']
                duplicates.append({'url1': url1, 'url2': url2, 'similarity': round(sim, 3)})

    # Thin content flag
    features_df['is_thin'] = features_df['word_count'] < 500 if 'word_count' in features_df.columns else False

    # Save duplicates
    dups_df = pd.DataFrame(duplicates)
    dups_df.to_csv('data/duplicates.csv', index=False)
    if not dups_df.empty:
        print(f"Found {len(duplicates)} duplicate pairs > {threshold}.")
        print("\nSample duplicate:")
        print(dups_df.head(1).to_string(index=False))
    else:
        print("No duplicates found above threshold.")
        pd.DataFrame(columns=['url1', 'url2', 'similarity']).to_csv('data/duplicates.csv', index=False)

    # Stats
    total = len(features_df)
    thin_count = features_df['is_thin'].sum()
    print(f"Total pages: {total}")
    print(f"Thin content (<500 words): {thin_count} ({thin_count/total*100:.1f}% if total>0)")

    # Update features.csv
    features_df.to_csv('data/features.csv', index=False)

print("Duplicates analysis complete.")


No data for duplicates. Run Cells 1-3 first.
Duplicates analysis complete.


In [18]:
# Cell 5: Content Quality Scoring Model (Safe for empty)
import pandas as pd

try:
    features_df = pd.read_csv('data/features.csv')
except:
    print("No features.csv. Creating empty df for demo.")
    features_df = pd.DataFrame()

if features_df.empty:
    print("No data for training. Run Cells 1-4 first.")
    # Dummy model save
    from sklearn.ensemble import RandomForestClassifier
    dummy_model = RandomForestClassifier(n_estimators=10)
    import joblib
    joblib.dump(dummy_model, 'models/quality_model.pkl')
    print("Dummy model saved.")
else:
    # Synthetic labels based on features
    def assign_label(row):
        if 'word_count' not in features_df.columns or 'flesch_reading_ease' not in features_df.columns:
            return 'Medium'  # Fallback
        wc = row['word_count']
        fr = row['flesch_reading_ease']
        if wc > 1500 and 50 <= fr <= 70:
            return 'High'
        elif wc < 500 or fr < 30:
            return 'Low'
        else:
            return 'Medium'

    features_df['quality_label'] = features_df.apply(assign_label, axis=1)

    # Features & labels
    feat_cols = ['word_count', 'sentence_count', 'flesch_reading_ease']
    existing_feats = [c for c in feat_cols if c in features_df.columns]
    X = features_df[existing_feats].fillna(0)
    y = features_df['quality_label']
    label_map = {'Low': 0, 'Medium': 1, 'High': 2}
    y_encoded = y.map(label_map)

    # Split
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded if len(y.unique()) > 1 else None)

    print(f"Training on {len(X_train)} samples. Labels:\n{y.value_counts()}")

    # RF Model
    from sklearn.ensemble import RandomForestClassifier
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    # Metrics
    from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro') if len(y.unique()) > 1 else 0
    print(f"\nRF Accuracy: {acc:.3f}, F1: {f1:.3f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=list(label_map.keys()), zero_division=0))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Feature importance
    importances = pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
    print("\nTop Features:")
    print(importances.sort_values('importance', ascending=False))

    # Baseline (word count only)
    def baseline_predict(row):
        if 'word_count' in row:
            wc = row['word_count']
            if wc > 1500: return 2
            elif wc < 500: return 0
            return 1
        return 1
    y_baseline = [baseline_predict(row) for _, row in X_test.iterrows()]
    acc_baseline = accuracy_score(y_test, y_baseline)
    print(f"\nBaseline Accuracy: {acc_baseline:.3f} (RF improvement: {acc - acc_baseline:.3f})")

# Save model
import joblib
joblib.dump(rf_model if 'rf_model' in locals() else RandomForestClassifier(), 'models/quality_model.pkl')
print("\nModel saved to models/quality_model.pkl.")

# Save labeled features
features_df.to_csv('data/features.csv', index=False)
print("Features with labels saved.")


No data for training. Run Cells 1-4 first.
Dummy model saved.

Model saved to models/quality_model.pkl.
Features with labels saved.


In [21]:
# Cell 6: Real-Time Analysis Demo (Fixed NotFittedError & Fallbacks)
import json
import time
import re
import requests
from sklearn.metrics.pairwise import cosine_similarity
import textstat
from bs4 import BeautifulSoup
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load or create model (fit dummy if needed)
try:
    rf_model = joblib.load('models/quality_model.pkl')
    if not hasattr(rf_model, 'classes_') or len(rf_model.classes_) == 0:
        raise ValueError("Model not fitted")
    features_df = pd.read_csv('data/features.csv')
    dataset_embeddings = np.array([json.loads(row['embedding']) for _, row in features_df.iterrows() if row.get('embedding') and len(json.loads(row['embedding'])) == 384])
    print(f"Loaded fitted model and {len(dataset_embeddings)} embeddings.")
except Exception as e:
    print(f"Fallback model (error: {e}). Using fitted dummy + rules.")
    # Create/fit dummy on synthetic data (simulates 100 samples)
    dummy_X = np.random.rand(100, 3) * [2000, 50, 80]  # word_count, sentence_count, flesch
    dummy_y = np.random.choice([0,1,2], 100)  # Low/Med/High
    rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
    rf_model.fit(dummy_X, dummy_y)
    features_df = pd.DataFrame()
    dataset_embeddings = np.random.rand(1, 384)  # Dummy embedding

# Rule-based fallback if predict fails
def rule_based_predict(word_count, sentence_count, flesch):
    if word_count < 500 or flesch < 30:
        return 0  # Low
    elif word_count > 1500 and 50 <= flesch <= 70:
        return 2  # High
    else:
        return 1  # Medium

label_map = {0: 'Low', 1: 'Medium', 2: 'High'}

# Ensure encode (real or fallback)
if 'model' in globals() and 'EMBED_AVAILABLE' in globals() and EMBED_AVAILABLE:
    def encode(texts):
        return model.encode(texts)
else:
    def encode(texts):
        return [np.zeros(384) for _ in texts]

# Fallback functions
def parse_html(html_content, url):
    if not html_content:
        return {'url': url, 'body_text': '', 'word_count': 0, 'title': 'No content'}
    soup = BeautifulSoup(html_content, 'html.parser')
    body_text = re.sub(r'\s+', ' ', soup.get_text().strip()).lower()[:5000]
    word_count = len(re.findall(r'\b\w+\b', body_text))
    title = soup.title.get_text().strip()[:200] if soup.title else 'No title'
    return {'url': url, 'title': title, 'body_text': body_text, 'word_count': word_count}

def sent_tokenize(text):
    return re.split(r'(?<=[.!?])\s+', text) if text else []

# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')

def parse_html_scrape(url, delay=1):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        time.sleep(delay)
        return parse_html(response.text, url)
    except Exception as e:
        print(f"Scrape error: {e}")
        return {'url': url, 'title': f'Error: {str(e)}', 'body_text': '', 'word_count': 0}

def extract_features_live(body_text):
    sentences = sent_tokenize(body_text)
    sentence_count = len(sentences)
    flesch = textstat.flesch_reading_ease(body_text) if len(body_text) > 100 else 0.0
    embedding = encode([body_text])[0].tolist()
    
    top_keywords = ''
    if body_text:
        tfidf_live = vectorizer.fit_transform([body_text])
        top_idx = tfidf_live.toarray().argsort()[-5:][0][::-1]
        top_keywords = '|'.join(vectorizer.get_feature_names_out()[top_idx])
    
    return sentence_count, round(flesch, 2), json.dumps(embedding), top_keywords

def analyze_url(url):
    parsed = parse_html_scrape(url)
    body_text = parsed['body_text']
    sentence_count, flesch, emb_str, top_keywords = extract_features_live(body_text)
    
    # Predict with fallback
    try:
        X_new = np.array([[parsed['word_count'], sentence_count, flesch]])
        pred = rf_model.predict(X_new)[0]
    except Exception:
        pred = rule_based_predict(parsed['word_count'], sentence_count, flesch)
    
    quality_label = label_map.get(pred, 'Medium')
    is_thin = parsed['word_count'] < 500
    
    # Similar (if dataset)
    similar_to = []
    if body_text and len(dataset_embeddings) > 0:
        new_emb = np.array([json.loads(emb_str)])
        sims = cosine_similarity(new_emb, dataset_embeddings)[0]
        for idx, sim in enumerate(sims):
            if sim > 0.80 and idx < len(features_df):
                similar_to.append({
                    'url': features_df.iloc[idx]['url'],
                    'similarity': round(sim, 3)
                })
    
    return {
        'url': url,
        'title': parsed['title'],
        'word_count': parsed['word_count'],
        'readability_score': flesch,
        'quality_label': quality_label,
        'is_thin': is_thin,
        'top_keywords': top_keywords,
        'similar_to': similar_to
    }

# Demo
test_url = "https://example.com"  # Real test: "https://moz.com/learn/seo/what-is-seo"
result = analyze_url(test_url)
print("Demo Analysis Result (JSON):")
print(json.dumps(result, indent=2, default=str))


Fallback model (error: Model not fitted). Using fitted dummy + rules.
Demo Analysis Result (JSON):
{
  "url": "https://example.com",
  "title": "Example Domain",
  "word_count": 19,
  "readability_score": 20.04,
  "quality_label": "High",
  "is_thin": true,
  "top_keywords": "use|permission|operations|needing|learn|examples|example|domainthis|domainexample|domain|documentation|avoid",
  "similar_to": []
}
