# SEO Content Quality & Duplicate Detector

This notebook implements a machine learning pipeline for:
- HTML content parsing and text extraction
- Feature engineering (readability, keywords, embeddings)
- Duplicate detection using cosine similarity
- Content quality classification
- Real-time URL analysis

## 1. Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import re
import warnings
import json
import joblib
from pathlib import Path
import time

from bs4 import BeautifulSoup
import requests

import nltk
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

import matplotlib.pyplot as plt
import seaborn as sns

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Quietly download the 'punkt' and 'stopwords' NLTK resources.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

print("All libraries imported successfully!")


All libraries imported successfully!
All libraries imported successfully!


## 2. Data Collection & HTML Parsing

In [2]:
# Location of the input CSV, relative to the notebook's working directory.
data_path = Path('../data/data.csv')

# Fail immediately when the file is missing: the parsing cell iterates over
# `df`, so carrying on would only resurface later as a NameError.
if not data_path.exists():
    raise FileNotFoundError(f"Dataset not found at {data_path.resolve()}")

df = pd.read_csv(data_path)
print(f"Dataset loaded: {len(df)} rows")
print(f"\nColumns: {df.columns.tolist()}")
display(df.head())

Dataset loaded: 81 rows

Columns: ['url', 'html_content']


Unnamed: 0,url,html_content
0,https://www.cm-alliance.com/cybersecurity-blog,"<!doctype html><!--[if lt IE 7]> <html class=""..."
1,https://www.varonis.com/blog/cybersecurity-tips,"<!doctype html><html lang=""en""><head>\n <me..."
2,https://www.cisecurity.org/insights/blog/11-cy...,<!DOCTYPE html><html data-unhead-vue-server-re...
3,https://www.cisa.gov/topics/cybersecurity-best...,"\n\n<!DOCTYPE html>\n<html lang=""en"" dir=""ltr""..."
4,https://www.qnbtrust.bank/Resources/Learning-C...,


In [3]:
def parse_html_content(html_content):
    """Extract the title, visible body text and word count from an HTML document.

    Args:
        html_content: Raw HTML markup. ``None`` and float values (e.g. the NaN
            pandas yields for an empty CSV cell) are treated as missing content.

    Returns:
        dict: ``{'title': str, 'body_text': str, 'word_count': int}``. When the
        input is missing or parsing fails, the title is ``'Error'``, the body
        text is empty and the word count is ``0``.
    """
    # Report missing HTML explicitly instead of relying on the parser to raise
    # for a non-string argument.
    if html_content is None or isinstance(html_content, float):
        return {'title': 'Error', 'body_text': '', 'word_count': 0}

    try:
        from bs4 import FeatureNotFound

        try:
            soup = BeautifulSoup(html_content, 'lxml')
        except FeatureNotFound:
            # lxml is not installed: fall back to the parser bundled with
            # Python rather than reporting every page as a failure.
            soup = BeautifulSoup(html_content, 'html.parser')

        title_tag = soup.find('title')
        title = title_tag.get_text().strip() if title_tag else 'No Title'

        # Remove elements whose content is not page text.
        for hidden in soup(["script", "style", "meta", "link", "noscript"]):
            hidden.decompose()

        # Prefer the most specific content container that is present.
        main_content = soup.find('main') or soup.find('article') or soup.find('body')

        if main_content:
            text_elements = main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'])
            body_text = ' '.join(elem.get_text().strip() for elem in text_elements)
        else:
            body_text = soup.get_text()

        # Collapse whitespace, then drop every character that is not a word
        # character, whitespace or basic punctuation.
        body_text = re.sub(r'\s+', ' ', body_text).strip()
        body_text = re.sub(r'[^\w\s.,!?;:\'-]', '', body_text)

        return {
            'title': title,
            'body_text': body_text,
            'word_count': len(body_text.split())
        }

    except Exception:
        # Best effort: a page that cannot be parsed is reported as empty so the
        # caller can filter it out by word count.
        return {
            'title': 'Error',
            'body_text': '',
            'word_count': 0
        }

print("HTML parsing function defined")

HTML parsing function defined


In [4]:
print("Parsing HTML content...")

# The column check gives the same answer for every row, so evaluate it once.
has_html_column = 'html_content' in df.columns
request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

parsed_data = []

# Count rows by position so the progress message does not depend on the
# DataFrame's index labels.
for position, (_, row) in enumerate(df.iterrows(), start=1):
    url = row['url']

    if has_html_column:
        parsed = parse_html_content(row['html_content'])
    else:
        print(f"No html_content column. Scraping {url}...")
        try:
            response = requests.get(url, headers=request_headers, timeout=10)
            # Treat HTTP error responses as failures instead of parsing the
            # error page as if it were the article.
            response.raise_for_status()
            parsed = parse_html_content(response.text)
            time.sleep(1)  # pause between live requests
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            parsed = {'title': 'Error', 'body_text': '', 'word_count': 0}

    parsed_data.append({
        'url': url,
        'title': parsed['title'],
        'body_text': parsed['body_text'],
        'word_count': parsed['word_count']
    })

    if position % 10 == 0:
        print(f"Processed {position}/{len(df)} pages")

# Naming the columns keeps the filter below valid even when no rows were parsed.
extracted_df = pd.DataFrame(parsed_data, columns=['url', 'title', 'body_text', 'word_count'])
# Keep pages that yielded text; .copy() makes the result an independent frame
# because later cells add columns to it.
extracted_df = extracted_df[extracted_df['word_count'] > 0].copy()
extracted_df.to_csv('../data/extracted_content.csv', index=False)

print(f"\nExtracted content from {len(extracted_df)} pages")
display(extracted_df.head())

Parsing HTML content...
Processed 10/81 pages
Processed 10/81 pages
Processed 20/81 pages
Processed 20/81 pages
Processed 30/81 pages
Processed 30/81 pages
Processed 40/81 pages
Processed 40/81 pages
Processed 50/81 pages
Processed 50/81 pages
Processed 60/81 pages
Processed 60/81 pages
Processed 70/81 pages
Processed 70/81 pages
Processed 80/81 pages
Processed 80/81 pages

Extracted content from 68 pages

Extracted content from 68 pages


Unnamed: 0,url,title,body_text,word_count
0,https://www.cm-alliance.com/cybersecurity-blog,Cyber Security Blog,Back Training NCSC Assured Cyber Incident Plan...,1358
1,https://www.varonis.com/blog/cybersecurity-tips,Top 10 Cybersecurity Awareness Tips: How to St...,Top 10 Cybersecurity Awareness Tips: How to St...,1664
2,https://www.cisecurity.org/insights/blog/11-cy...,11 Cyber Defense Tips to Stay Secure at Work a...,11 Cyber Defense Tips to Stay Secure at Work a...,1034
3,https://www.cisa.gov/topics/cybersecurity-best...,Cybersecurity Best Practices | Cybersecurity a...,Cybersecurity Best Practices Cybersecurity Bes...,630
5,https://nordlayer.com/learn/network-security/b...,Network Security 101: Understanding the Basics,Home Learning center Network security Network ...,2130


In [7]:
print("Extracting features...")

# NOTE(review): the helper functions used below are defined in a cell that
# appears after this one in the notebook; on a fresh kernel that cell has to
# run first.
body_text = extracted_df['body_text']
text_features = {
    'clean_text': clean_text,
    'sentence_count': count_sentences,
    'flesch_reading_ease': get_readability_score,
    'top_keywords': extract_top_keywords,
}
# One derived column per helper, each computed from the raw body text.
for column_name, extractor in text_features.items():
    extracted_df[column_name] = body_text.apply(extractor)

print("Basic features extracted")
print("\nLoading sentence transformer model...")

model = SentenceTransformer('all-MiniLM-L6-v2')

print("Generating embeddings...")
clean_texts = extracted_df['clean_text'].tolist()
embeddings = model.encode(clean_texts, show_progress_bar=True)
# Store each embedding as a plain list so it fits in a DataFrame cell.
extracted_df['embedding'] = [vector.tolist() for vector in embeddings]

print("\nAll features extracted")
preview_columns = ['url', 'word_count', 'sentence_count', 'flesch_reading_ease', 'top_keywords']
display(extracted_df[preview_columns].head())

Extracting features...
Basic features extracted

Loading sentence transformer model...
Basic features extracted

Loading sentence transformer model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]


All features extracted


Unnamed: 0,url,word_count,sentence_count,flesch_reading_ease,top_keywords
0,https://www.cm-alliance.com/cybersecurity-blog,1358,17,-61.200375,cyber|cybersecurity|events|training|tabletop
1,https://www.varonis.com/blog/cybersecurity-tips,1664,91,41.140144,data|access|security|information|users
2,https://www.cisecurity.org/insights/blog/11-cy...,1034,72,53.688621,use|password|data|authentication|email
3,https://www.cisa.gov/topics/cybersecurity-best...,630,25,-0.238602,cybersecurity|cyber|cisa|practices|best
5,https://nordlayer.com/learn/network-security/b...,2130,181,26.044653,network|security|access|data|devices


In [6]:
def clean_text(text):
    """Return ``text`` lower-cased, with whitespace runs collapsed to one space and the ends trimmed."""
    lowered = text.lower()
    return re.sub(r'\s+', ' ', lowered).strip()

def count_sentences(text):
    """Count the non-blank fragments left after splitting ``text`` on runs of ``.``, ``!`` or ``?``."""
    fragments = re.split(r'[.!?]+', text)
    return sum(1 for fragment in fragments if fragment.strip())

def get_readability_score(text):
    """Return the Flesch reading-ease score of ``text`` as computed by textstat.

    Args:
        text: The text to score.

    Returns:
        The value of ``textstat.flesch_reading_ease(text)``, or ``0`` when
        ``text`` is not a string, is blank, or scoring raises an error.
    """
    # Nothing to score: use the same fallback value as the failure path.
    if not isinstance(text, str) or not text.strip():
        return 0
    try:
        return textstat.flesch_reading_ease(text)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        return 0

def extract_top_keywords(text, n=5):
    """Return up to ``n`` keywords of ``text`` joined with ``|``.

    A ``TfidfVectorizer`` with English stop words and ``max_features=n`` is
    fitted on this single text, and its terms are ordered by descending score.

    Args:
        text: The text to extract keywords from.
        n: Maximum number of keywords to return.

    Returns:
        str: The keywords separated by ``|``, or ``''`` when ``text`` is not a
        string, is blank, or the vectorizer raises an error.
    """
    # Nothing to vectorize: use the same fallback value as the failure path.
    if not isinstance(text, str) or not text.strip():
        return ''
    try:
        vectorizer = TfidfVectorizer(max_features=n, stop_words='english')
        scores = vectorizer.fit_transform([text]).toarray()[0]
        terms = vectorizer.get_feature_names_out()
        # sorted() is stable, so equally scored terms keep the vectorizer's order.
        ranked = sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)
        return '|'.join(term for term, _ in ranked[:n])
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        return ''

print("Feature extraction functions defined")

Feature extraction functions defined


## 3. Text Preprocessing & Feature Engineering

In [8]:
# Columns written to the features file.
export_columns = ['url', 'word_count', 'sentence_count', 'flesch_reading_ease', 'top_keywords', 'embedding']
features_df = extracted_df[export_columns].copy()
features_df.to_csv('../data/features.csv', index=False)

print("Features saved to: data/features.csv")
print("\nFeature Statistics:")
# Summary statistics for the numeric features only.
numeric_columns = ['word_count', 'sentence_count', 'flesch_reading_ease']
print(features_df[numeric_columns].describe())

Features saved to: data/features.csv

Feature Statistics:
         word_count  sentence_count  flesch_reading_ease
count     68.000000       68.000000            68.000000
mean    3235.941176      238.529412            30.177379
std     5101.721488      525.458619            36.080936
min        8.000000        1.000000          -175.110311
25%      566.750000       27.000000            19.080185
50%     1636.500000      102.500000            34.732160
75%     3425.000000      212.000000            49.819167
max    31380.000000     3766.000000           103.540000


In [None]:
def baseline_classifier(word_count):
    """Label content by word count alone.

    Returns ``'High'`` above 1500 words, ``'Low'`` below 500 words and
    ``'Medium'`` for everything in between (both bounds included).
    """
    if word_count > 1500:
        return 'High'
    if word_count < 500:
        return 'Low'
    return 'Medium'

# NOTE(review): X_test and y_test come from the train/test split cell, which
# appears below this one in the notebook; on a fresh kernel it has to run first.
baseline_pred = X_test['word_count'].apply(baseline_classifier)
baseline_accuracy = accuracy_score(y_test, baseline_pred)

# Header and score, one per line.
print(
    "Baseline Model (Rule-based):",
    f"Accuracy: {baseline_accuracy:.4f}",
    sep="\n",
)

In [None]:
# NOTE(review): reads extracted_df['quality_label'], which is assigned in a
# cell that appears below this one; on a fresh kernel it has to run first.
feature_columns = ['word_count', 'sentence_count', 'flesch_reading_ease']
X, y = extracted_df[feature_columns], extracted_df['quality_label']

# 70/30 split, stratified on the label, with a fixed seed.
split_options = {'test_size': 0.3, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print("\nTraining label distribution:")
print(y_train.value_counts())

In [None]:
def assign_quality_label(row):
    """Derive a quality label from a row's word count and Flesch reading ease.

    ``'High'``: more than 1500 words and a reading ease between 50 and 70
    (inclusive). ``'Low'``: fewer than 500 words or a reading ease below 30.
    ``'Medium'``: everything else.

    NOTE(review): both inputs are also columns of the classifier's feature
    set, so a model trained on these labels is learning this rule.
    """
    words = row['word_count']
    ease = row['flesch_reading_ease']

    if words > 1500 and 50 <= ease <= 70:
        return 'High'
    if words < 500 or ease < 30:
        return 'Low'
    return 'Medium'

# Label every page, row by row.
extracted_df['quality_label'] = extracted_df.apply(assign_quality_label, axis=1)

label_counts = extracted_df['quality_label'].value_counts()
label_shares = extracted_df['quality_label'].value_counts(normalize=True) * 100

print("Quality Label Distribution:")
print(label_counts)
print("\nPercentages:")
print(label_shares)

## 5. Content Quality Scoring

In [None]:
# Pages with fewer words than this are flagged as thin content.
THIN_CONTENT_THRESHOLD = 500

extracted_df['is_thin'] = extracted_df['word_count'] < THIN_CONTENT_THRESHOLD
thin_content_count = extracted_df['is_thin'].sum()
total_pages = len(extracted_df)
thin_content_percentage = (thin_content_count / total_pages) * 100

# NOTE(review): duplicate_pairs is built by the cosine-similarity cell, which
# appears below this one; on a fresh kernel it has to run first.
mean_word_count = extracted_df['word_count'].mean()
mean_readability = extracted_df['flesch_reading_ease'].mean()

print("\nContent Analysis Summary:")
print(f"Total pages analyzed: {total_pages}")
print(f"Duplicate pairs found: {len(duplicate_pairs)}")
print(f"Thin content pages: {thin_content_count} ({thin_content_percentage:.1f}%)")
print(f"Average word count: {mean_word_count:.0f}")
print(f"Average readability score: {mean_readability:.1f}")

In [None]:
print("Computing cosine similarity matrix...")

embeddings_array = np.array(extracted_df['embedding'].tolist())
similarity_matrix = cosine_similarity(embeddings_array)

print(f"Similarity matrix shape: {similarity_matrix.shape}")

# Pairs scoring above this cosine similarity are reported as duplicates.
SIMILARITY_THRESHOLD = 0.80

# Read the URLs once instead of doing a row lookup for every matching pair.
page_urls = extracted_df['url'].to_numpy()

# Indices of the strict upper triangle: each unordered pair (i < j) exactly
# once, in the same row-by-row order a nested loop would visit them.
first_idx, second_idx = np.triu_indices_from(similarity_matrix, k=1)
pair_scores = similarity_matrix[first_idx, second_idx]
above_threshold = pair_scores > SIMILARITY_THRESHOLD

duplicate_pairs = [
    {
        'url1': page_urls[i],
        'url2': page_urls[j],
        'similarity': round(float(score), 4)
    }
    for i, j, score in zip(
        first_idx[above_threshold],
        second_idx[above_threshold],
        pair_scores[above_threshold],
    )
]

# Naming the columns keeps the frame (and the CSV written from it) well-formed
# even when no pair exceeds the threshold.
duplicates_df = pd.DataFrame(duplicate_pairs, columns=['url1', 'url2', 'similarity'])

print(f"\nFound {len(duplicate_pairs)} duplicate pairs (similarity > {SIMILARITY_THRESHOLD})")

if len(duplicate_pairs) > 0:
    display(duplicates_df.head(10))

## 4. Duplicate Detection

In [19]:
# Write the duplicate pairs to disk without the index column.
duplicates_path = '../data/duplicates.csv'
duplicates_df.to_csv(duplicates_path, index=False)
print("Duplicates saved to: data/duplicates.csv")

NameError: name 'duplicates_df' is not defined

In [18]:
print("Training Random Forest Classifier...")

# Classifier settings; the fixed seed keeps runs repeatable.
rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
    'class_weight': 'balanced',
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

# Score the held-out split and compare against the rule-based baseline.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
improvement = accuracy - baseline_accuracy

print("\nModel trained successfully!")
print(f"\nAccuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Baseline Accuracy: {baseline_accuracy:.4f}")
print(f"Improvement: {improvement:.4f}")

Training Random Forest Classifier...


NameError: name 'X_train' is not defined

In [14]:
def analyze_url(url, existing_embeddings=None, existing_urls=None):
    """Download a page and report its content features, quality label and similar pages.

    Relies on notebook-level names defined in other cells: ``parse_html_content``,
    ``clean_text``, ``count_sentences``, ``get_readability_score``,
    ``extract_top_keywords``, ``model``, ``rf_model``, ``THIN_CONTENT_THRESHOLD``
    and ``SIMILARITY_THRESHOLD``.

    Args:
        url: Address of the page to download and analyse.
        existing_embeddings: Optional embeddings to compare the page against.
        existing_urls: Optional URLs, indexed in step with ``existing_embeddings``.
            Similar pages are only searched for when both are given.

    Returns:
        dict: On success, the keys ``url``, ``title``, ``word_count``,
        ``sentence_count``, ``readability``, ``top_keywords``,
        ``quality_label``, ``is_thin`` and ``similar_to`` (at most five entries,
        most similar first). On any failure, ``{'url': url, 'error': <message>}``.
    """
    try:
        print(f"Fetching {url}...")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        # An HTTP error status becomes an error result instead of the error
        # page being analysed as if it were the requested content.
        response.raise_for_status()

        parsed = parse_html_content(response.text)

        if parsed['word_count'] == 0:
            return {'url': url, 'error': 'Failed to extract content'}

        clean_text_content = clean_text(parsed['body_text'])
        sentence_count = count_sentences(parsed['body_text'])
        readability = get_readability_score(parsed['body_text'])
        keywords = extract_top_keywords(parsed['body_text'])

        embedding = model.encode([clean_text_content])[0]

        # Single-row frame with the same feature names the classifier was fitted on.
        features = pd.DataFrame([{
            'word_count': parsed['word_count'],
            'sentence_count': sentence_count,
            'flesch_reading_ease': readability
        }])

        quality_label = rf_model.predict(features)[0]

        is_thin = parsed['word_count'] < THIN_CONTENT_THRESHOLD

        similar_content = []
        if existing_embeddings is not None and existing_urls is not None:
            similarities = cosine_similarity([embedding], existing_embeddings)[0]
            similar_content = [
                {'url': existing_urls[i], 'similarity': round(float(sim), 4)}
                for i, sim in enumerate(similarities)
                if sim > SIMILARITY_THRESHOLD
            ]
            similar_content.sort(key=lambda match: match['similarity'], reverse=True)

        return {
            'url': url,
            'title': parsed['title'],
            'word_count': parsed['word_count'],
            'sentence_count': sentence_count,
            'readability': round(readability, 2),
            'top_keywords': keywords,
            'quality_label': quality_label,
            'is_thin': bool(is_thin),
            'similar_to': similar_content[:5]
        }

    except Exception as e:
        # Best effort by design: callers receive an error entry rather than an exception.
        return {'url': url, 'error': str(e)}

print("Real-time analysis function defined")

Real-time analysis function defined


In [17]:
# NOTE(review): this summary reads values produced by cells that appear further
# down in the notebook (for example feature_importance); on a fresh kernel
# those cells have to run first.
print("="*60)
print("SEO CONTENT QUALITY & DUPLICATE DETECTOR - SUMMARY")
print("="*60)

print("\nDataset Statistics:")
print(f"Total pages: {len(extracted_df)}")
print(f"Avg word count: {extracted_df['word_count'].mean():.0f}")
print(f"Avg readability: {extracted_df['flesch_reading_ease'].mean():.1f}")

print("\nDuplicate Detection:")
print(f"Similarity threshold: {SIMILARITY_THRESHOLD}")
print(f"Duplicate pairs: {len(duplicate_pairs)}")
print(f"Thin content (<{THIN_CONTENT_THRESHOLD} words): {thin_content_count} ({thin_content_percentage:.1f}%)")

print("\nQuality Classification:")
print(f"Model: Random Forest")
print(f"Features: {', '.join(feature_columns)}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"Improvement over baseline: {(accuracy - baseline_accuracy):.4f}")

print("\nTop Features:")
# Number the rows by their position in the sorted frame; the index labels
# still refer to the order before sorting, so they are not a valid rank.
for rank, row in enumerate(feature_importance.itertuples(index=False), start=1):
    print(f"  {rank}. {row.feature}: {row.importance:.4f}")

print("\nOutput Files:")
print("  - data/extracted_content.csv")
print("  - data/features.csv")
print("  - data/duplicates.csv")
print("  - models/quality_model.pkl")

print("\nAnalysis complete!")
print("="*60)

SEO CONTENT QUALITY & DUPLICATE DETECTOR - SUMMARY

Dataset Statistics:
Total pages: 68
Avg word count: 3236
Avg readability: 30.2

Duplicate Detection:


NameError: name 'SIMILARITY_THRESHOLD' is not defined

## 8. Summary and Final Results

In [None]:
# Pie chart of how many pages received each quality label.
quality_counts = extracted_df['quality_label'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    quality_counts.values,
    labels=quality_counts.index,
    autopct='%1.1f%%',
    colors=['#ff9999', '#66b3ff', '#99ff99'],
    startangle=90,
)
ax.set_title('Content Quality Distribution')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
# Heatmap of pairwise similarity, limited to the first pages of the matrix.
n_samples = min(20, len(similarity_matrix))
subset_similarity = similarity_matrix[:n_samples, :n_samples]

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    subset_similarity,
    cmap='YlOrRd',
    square=True,
    xticklabels=False,
    yticklabels=False,
    cbar_kws={'label': 'Cosine Similarity'},
    ax=ax,
)
ax.set_title(f'Content Similarity Heatmap (First {n_samples} Pages)')
fig.tight_layout()
plt.show()

In [None]:
# Side-by-side histograms of word count and readability, split by quality label.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (column, x-axis label, panel title) for each of the two panels.
panels = [
    ('word_count', 'Word Count', 'Word Count Distribution by Quality'),
    ('flesch_reading_ease', 'Flesch Reading Ease Score', 'Readability Distribution by Quality'),
]

for ax, (column, axis_label, panel_title) in zip(axes, panels):
    for label in ['Low', 'Medium', 'High']:
        data = extracted_df.loc[extracted_df['quality_label'] == label, column]
        ax.hist(data, alpha=0.6, label=label, bins=20)
    ax.set_xlabel(axis_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    ax.legend()

fig.tight_layout()
plt.show()

## 7. Visualizations (Bonus)

In [None]:
print("Testing real-time URL analysis...\n")

# Compare against every page already processed in this notebook.
existing_embeddings = embeddings_array
existing_urls = extracted_df['url'].tolist()

# Use the first processed page as the example input.
test_url = extracted_df.iloc[0]['url']

result = analyze_url(
    test_url,
    existing_embeddings=existing_embeddings,
    existing_urls=existing_urls,
)

print(json.dumps(result, indent=2))

## 6. Real-Time URL Analysis Function

In [None]:
model_path = Path('../models/quality_model.pkl')
# Make sure the target directory exists before anything is written into it.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(rf_model, model_path)

# Settings needed to reuse the saved classifier the same way as in this notebook.
model_info = {
    'rf_model': 'quality_model.pkl',
    'sentence_transformer': 'all-MiniLM-L6-v2',
    'feature_columns': feature_columns,
    'similarity_threshold': SIMILARITY_THRESHOLD,
    'thin_content_threshold': THIN_CONTENT_THRESHOLD
}

with open('../models/model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)

print(f"Model saved to: {model_path}")

In [None]:
# Pair each feature with the importance reported by the fitted forest,
# most important first.
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop Features:")
for feature_name, score in zip(feature_importance['feature'], feature_importance['importance']):
    print(f"{feature_name}: {score:.4f}")

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['feature'], feature_importance['importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance - Quality Classification')
fig.tight_layout()
plt.show()

In [None]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Fix the class order explicitly so the matrix rows/columns always line up
# with the tick labels, even when a class is absent from y_test and y_pred.
class_labels = ['High', 'Low', 'Medium']

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print(cm)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix - Quality Classification')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()