<a href="https://colab.research.google.com/github/JestyJohnson21/SEO-Sensei-Intelligent-Web-Content-Quality-Duplicate-Detector/blob/main/seo_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the dataset archive into the runtime. This cell depends on
# google.colab, so it only runs on Google Colab. In the recorded run the
# file was saved as "archive (30) (1).zip", which is the name the unzip
# step below expects.
from google.colab import files
uploaded = files.upload()


Saving archive (30).zip to archive (30) (1).zip


In [None]:
!unzip "archive (30) (1).zip" -d data/



Archive:  archive (30) (1).zip
  inflating: data/data.csv           


In [None]:
!unzip "archive (30) (1).zip" -d data/


Archive:  archive (30) (1).zip
replace data/data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
# Load the page table; later cells read its 'html_content' and 'url' columns.
data = pd.read_csv("data/data.csv")  # CSV extracted from the uploaded archive
data.head()

In [None]:
from bs4 import BeautifulSoup

def extract_text(html):
    """Extract page title, body text, and word count from HTML content.

    Parameters
    ----------
    html : str or bytes
        Raw HTML markup of one page. Any other value (for example the NaN
        pandas uses for a missing cell) and blank input are treated as
        "no content".

    Returns
    -------
    pandas.Series
        Three positional values ``[title, body, word_count]``:
        the ``<title>`` text (``""`` when absent), the text of the page's
        ``<p>``, ``<article>`` and ``<main>`` elements joined with single
        spaces, and the number of whitespace-separated tokens in that body.

    Notes
    -----
    Extraction is deliberately best-effort: a page that cannot be parsed
    yields ``["", "", 0]`` instead of raising, so one bad row does not abort
    a whole ``DataFrame.apply`` run.
    """
    if not isinstance(html, (str, bytes)) or not html.strip():
        return pd.Series(["", "", 0])

    content_tags = ['p', 'article', 'main']
    try:
        soup = BeautifulSoup(html, "html.parser")
        # get_text() rather than .string: .string is None whenever <title>
        # contains more than one child node, which would leak None into the
        # title column.
        title = soup.title.get_text(strip=True) if soup.title else ""
        # Keep only the outermost matches. A <p> inside an <article> inside a
        # <main> would otherwise contribute its text once per ancestor, which
        # inflates word_count and skews the TF-IDF vectors.
        blocks = [
            el for el in soup.find_all(content_tags)
            if el.find_parent(content_tags) is None
        ]
        body = ' '.join(el.get_text() for el in blocks)
        word_count = len(body.split())
        return pd.Series([title, body, word_count])
    except Exception:
        # Best-effort fallback for markup the parser cannot handle.
        return pd.Series(["", "", 0])

# Parse every page's raw HTML once, then attach the three extracted values
# as new columns and persist the enriched table.
_parsed = data['html_content'].apply(extract_text)
data[['title','body_text','word_count']] = _parsed
data.to_csv("data/extracted_content.csv", index=False)
data.head(3)


In [1]:
# NOTE: the recorded run of this cell failed with
# "ModuleNotFoundError: No module named 'textstat'". The package must be
# installed in the runtime (e.g. `%pip install textstat` in a setup cell)
# before this cell can run.
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer

# Basic textual features
# sentence_count is a rough proxy: it counts '.' characters, so decimals and
# abbreviations are counted while '!' and '?' terminators are not.
data['sentence_count'] = data['body_text'].apply(lambda x: x.count('.'))
data['flesch_reading_ease'] = data['body_text'].apply(textstat.flesch_reading_ease)

# TF-IDF feature extraction (vocabulary capped at 200 terms, English stop
# words removed). `vectorizer` and `X` are reused by later cells.
vectorizer = TfidfVectorizer(max_features=200, stop_words='english')
X = vectorizer.fit_transform(data['body_text'])
keywords = vectorizer.get_feature_names_out()


def _top_terms(row, k=5):
    """Return the k highest-weighted TF-IDF terms of one sparse row, '|'-joined."""
    # row.indices is in storage (column) order, not weight order, so rank the
    # stored weights explicitly. A stable sort keeps ties deterministic.
    order = (-row.data).argsort(kind='stable')[:k]
    return '|'.join(keywords[i] for i in row.indices[order])


# Top 5 keywords per document, ranked by TF-IDF weight
data['top_keywords'] = [_top_terms(row) for row in X]

# Save processed features
data.to_csv("data/features.csv", index=False)
data[['url','word_count','sentence_count','flesch_reading_ease','top_keywords']].head()



ModuleNotFoundError: No module named 'textstat'

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pairwise cosine similarity between the TF-IDF vectors of all pages.
sim_matrix = cosine_similarity(X)
threshold = 0.8

# Keep each unordered pair once: the strict upper triangle (i < j) of the
# "similarity above threshold" mask. np.nonzero yields the hits in row-major
# order, i.e. the same order a nested i/j loop would produce.
rows, cols = np.nonzero(np.triu(sim_matrix > threshold, k=1))

# Look URLs up by position. Matrix rows are positional, so a label-based
# lookup such as data.url[i] is only correct while the index is 0..n-1.
urls = data['url'].to_numpy()
pairs = list(zip(urls[rows], urls[cols], sim_matrix[rows, cols]))

dup_df = pd.DataFrame(pairs, columns=['url1','url2','similarity'])
dup_df.to_csv("data/duplicates.csv", index=False)

# Pages with fewer than 500 words are flagged as thin content.
data['is_thin'] = data['word_count'] < 500
print("Duplicate pairs:", len(dup_df))
print("Thin pages:", data['is_thin'].sum())
dup_df.head()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

def label_quality(row):
    """Assign a rule-based quality label to one page.

    ``row`` is any mapping-like object (dict or DataFrame row) exposing
    ``'word_count'`` and ``'flesch_reading_ease'``.

    Returns ``"High"`` for pages with more than 1500 words and a reading-ease
    score within 50..70 inclusive, ``"Low"`` for pages with fewer than 500
    words or a score below 30, and ``"Medium"`` for everything else.
    """
    is_long = row['word_count'] > 1500
    if is_long and 50 <= row['flesch_reading_ease'] <= 70:
        return "High"

    is_short = row['word_count'] < 500
    if is_short or row['flesch_reading_ease'] < 30:
        return "Low"

    return "Medium"

import os

# Rule-based target: one of "High" / "Medium" / "Low" per page.
data['label'] = data.apply(label_quality, axis=1)

# NOTE: the labels are a deterministic function of word_count and
# flesch_reading_ease, both of which are also model inputs. The report below
# therefore measures how well the forest recovers those rules, not
# independent predictive quality.
X_feat = data[['word_count','sentence_count','flesch_reading_ease']]
y = data['label']

# 70/30 split; fixed seeds keep the split and the forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_feat, y, test_size=0.3, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

pred = model.predict(X_test)
print(classification_report(y_test, pred))

# joblib.dump does not create parent directories, and nothing earlier in the
# notebook creates models/, so make sure it exists before writing.
os.makedirs('models', exist_ok=True)
joblib.dump(model, 'models/quality_model.pkl')


In [None]:
import requests, time
from sklearn.metrics.pairwise import cosine_similarity

def scrape_html(url):
    """Fetch the HTML source of a page, returning ``None`` on failure.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    str or None
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, invalid URL) or the server answers with
        an HTTP error status.

    Notes
    -----
    Sends a browser-like User-Agent, waits at most 5 seconds for the server,
    and pauses 1 second after each successful fetch to throttle consecutive
    requests.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=5)
        # Treat 4xx/5xx answers as failures so that error pages are not
        # analysed as if they were the page's real content.
        r.raise_for_status()
        time.sleep(1)
        return r.text
    except requests.RequestException:
        return None

def analyze_url(url, similarity_threshold=0.75):
    """Analyze any URL for quality and duplication.

    Downloads the page, extracts its text, scores it with the rule-based
    ``label_quality`` (not the trained classifier), and compares its TF-IDF
    vector against every page of the fitted corpus.

    Relies on the module-level ``vectorizer``, ``X`` and ``data`` produced by
    the earlier cells.

    Parameters
    ----------
    url : str
        Page to analyze.
    similarity_threshold : float, default 0.75
        Corpus pages whose cosine similarity to this page is strictly greater
        than this value are reported in ``similar_to``.

    Returns
    -------
    dict
        ``{"error": "Unable to fetch URL"}`` when nothing could be
        downloaded; otherwise a dict with the keys ``url``, ``word_count``,
        ``readability``, ``quality_label``, ``is_thin`` (fewer than 500
        words) and ``similar_to`` (list of ``{"url", "similarity"}`` records).
    """
    html = scrape_html(url)
    if not html:
        return {"error": "Unable to fetch URL"}

    _title, body, wc = extract_text(html)
    readability = textstat.flesch_reading_ease(body)
    label = label_quality({'word_count': wc, 'flesch_reading_ease': readability})

    vec = vectorizer.transform([body])
    sims = cosine_similarity(vec, X).flatten()
    # Build the mask once so the row selection and the similarity values
    # attached to those rows cannot drift apart.
    is_similar = sims > similarity_threshold
    similar = data.loc[is_similar, ['url']].assign(similarity=sims[is_similar])

    return {
        "url": url,
        "word_count": wc,
        "readability": readability,
        "quality_label": label,
        "is_thin": wc < 500,
        "similar_to": similar.to_dict(orient='records')
    }


# Try the single-URL analysis end to end; this performs a live HTTP request.
# NOTE(review): "https://result.com" looks like a placeholder — confirm the
# intended target URL.
result = analyze_url("https://result.com")
result


In [None]:
# Persist the table with every column added by the earlier cells.
data.to_csv("data/final_output.csv", index=False)
# The leading character was mojibake ("âœ…"): a UTF-8 check mark decoded as
# cp1252. Restored to the intended symbol.
print("✅ All outputs saved to /data and model to /models.")
