In [5]:
import pandas as pd
from bs4 import BeautifulSoup

In [6]:
df = pd.read_csv('../data/data.csv')

In [7]:
def extract_text(html):
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title.string if soup.title else ''
        body = ' '.join([p.get_text() for p in soup.find_all(['p', 'article', 'main'])])
        return title, body, len(body.split())
    except:
        return '', '', 0

df[['title', 'body_text', 'word_count']] = df['html_content'].apply(
    lambda x: pd.Series(extract_text(x))
)
df.to_csv('../data/extracted_content.csv', index=False)


In [8]:
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
df['flesch_reading_ease'] = df['body_text'].apply(lambda x: textstat.flesch_reading_ease(x))
df['sentence_count'] = df['body_text'].apply(lambda x: len(x.split('.')))

In [10]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5)
X_keywords = vectorizer.fit_transform(df['body_text'])
df['top_keywords'] = [list(vectorizer.get_feature_names_out()) for _ in range(len(df))]

In [11]:
import pandas as pd
df = pd.read_csv('../data/extracted_content.csv')
missing = df[df['html_content'].isna() | df['body_text'].isna() | (df['body_text'].str.strip() == '')]
missing[['url']].to_csv('../data/missing_urls.csv', index=False)
print("‚úÖ Saved missing URLs to data/missing_urls.csv")


‚úÖ Saved missing URLs to data/missing_urls.csv


In [12]:
import requests
from bs4 import BeautifulSoup
import time

def scrape_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        title = soup.title.string if soup.title else ''
        body = ' '.join([p.get_text() for p in soup.find_all(['p', 'article', 'main'])])
        return title, body, len(body.split())
    except Exception as e:
        print(f"‚ö†Ô∏è Error scraping {url}: {e}")
        return '', '', 0

for i, row in df.iterrows():
    if pd.isna(row['body_text']) or row['body_text'].strip() == '':
        title, body, wc = scrape_page(row['url'])
        df.at[i, 'title'] = title
        df.at[i, 'body_text'] = body
        df.at[i, 'word_count'] = wc
        time.sleep(1.5)  # polite delay between requests


‚ö†Ô∏è Error scraping https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips: 403 Client Error: Forbidden for url: https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips
‚ö†Ô∏è Error scraping https://www.cloudflare.com/learning/access-management/what-is-ztna/: 403 Client Error: Forbidden for url: https://www.cloudflare.com/learning/access-management/what-is-ztna/
‚ö†Ô∏è Error scraping https://towardsdatascience.com/machine-learning-basics-with-examples-part-1-c2d37247ec3d: 404 Client Error: Not Found for url: https://towardsdatascience.com/machine-learning-basics-with-examples-part-1-c2d37247ec3d
‚ö†Ô∏è Error scraping https://www.analyticsvidhya.com/blog/2021/09/comprehensive-guide-on-machine-learning/: 404 Client Error: Not Found for url: https://www.analyticsvidhya.com/blog/2021/09/comprehensive-guide-on-machine-learning/
‚ö†Ô∏è Error scraping https://www.investopedia.com/terms/s/seo.asp: 404 Client Error: Not Found for url: https://www

In [14]:
import random, requests, time
from bs4 import BeautifulSoup

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    'Mozilla/5.0 (X11; Linux x86_64)',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)',
]

def scrape_page(url):
    for attempt in range(3):  # try up to 3 times
        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')
            title = soup.title.string if soup.title else ''
            body = ' '.join([p.get_text() for p in soup.find_all(['p', 'article', 'main'])])
            return title, body, len(body.split())
        except requests.exceptions.RequestException as e:
            print(f"‚ö†Ô∏è Retry {attempt+1} failed for {url}: {e}")
            time.sleep(2)
    print(f"‚ùå Could not scrape {url} after 3 attempts.")
    return '', '', 0


In [15]:
if title == '' and body == '':
    df.at[i, 'body_text'] = 'Page unavailable or restricted'
    df.at[i, 'title'] = 'Unavailable'


In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


In [16]:

embeddings = model.encode(df['body_text'].tolist(), show_progress_bar=True)
df['embedding'] = embeddings.tolist()
df.to_csv('../data/features.csv', index=False)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
import sys
print(sys.executable)


C:\Users\Harsha\anaconda3\python.exe


In [13]:
import sys
!{sys.executable} -m pip uninstall -y tokenizers transformers sentence-transformers

Found existing installation: tokenizers 0.22.1
Uninstalling tokenizers-0.22.1:
  Successfully uninstalled tokenizers-0.22.1
Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: sentence-transformers 5.1.2
Uninstalling sentence-transformers-5.1.2:
  Successfully uninstalled sentence-transformers-5.1.2


In [None]:
import sys
!{sys.executable} -m pip install "tokenizers>=0.22,<0.24"
!{sys.executable} -m pip install "transformers==4.44.2"
!{sys.executable} -m pip install "sentence-transformers==3.0.1"

Collecting tokenizers<0.24,>=0.22
  Using cached tokenizers-0.22.1-cp39-abi3-win_amd64.whl (2.7 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
Successfully installed tokenizers-0.22.1


In [2]:
import sys
!{sys.executable} -m pip install sentence-transformers


Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Collecting transformers<5.0.0,>=4.41.0
  Using cached transformers-4.57.1-py3-none-any.whl (12.0 MB)
Installing collected packages: transformers, sentence-transformers
Successfully installed sentence-transformers-5.1.2 transformers-4.57.1


In [17]:
import pandas as pd
import requests
import random
import time
from bs4 import BeautifulSoup

# --- Load existing extracted_content file ---
df = pd.read_csv('../data/extracted_content.csv')

# --- Define multiple user-agents for rotation (bypasses 403 blocks) ---
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1'
]

# --- Scraper function with retries, delays, and graceful fallback ---
def scrape_page(url):
    for attempt in range(3):  # try 3 times
        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.title.string.strip() if soup.title else ''
            body = ' '.join([p.get_text(" ", strip=True) for p in soup.find_all(['p', 'article', 'main'])])
            word_count = len(body.split())
            if word_count > 0:
                return title, body, word_count

        except requests.exceptions.RequestException as e:
            print(f"‚ö†Ô∏è Attempt {attempt+1} failed for {url}: {e}")
            time.sleep(2)

    # If still fails after retries:
    print(f"‚ùå Could not fetch {url} after 3 retries ‚Äî marking as Unavailable.")
    return 'Unavailable', 'Page unavailable or restricted', 0


# --- Re-scrape only missing or empty body_text rows ---
for i, row in df.iterrows():
    if pd.isna(row.get('body_text')) or str(row['body_text']).strip() == '' or row['word_count'] == 0:
        print(f"\nüîÑ Scraping missing content for: {row['url']}")
        title, body, wc = scrape_page(row['url'])
        df.at[i, 'title'] = title
        df.at[i, 'body_text'] = body
        df.at[i, 'word_count'] = wc
        time.sleep(1.5)  # polite delay

# --- Final clean-up & save ---
if 'html_content' in df.columns:
    df = df.drop(columns=['html_content'])

df.to_csv('../data/extracted_content.csv', index=False)
print("\n‚úÖ Full dataset repaired and saved successfully! All 81 rows retained.")



üîÑ Scraping missing content for: https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips
‚ö†Ô∏è Attempt 1 failed for https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips: 403 Client Error: Forbidden for url: https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips
‚ö†Ô∏è Attempt 2 failed for https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips: 403 Client Error: Forbidden for url: https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips
‚ö†Ô∏è Attempt 3 failed for https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips: 403 Client Error: Forbidden for url: https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips
‚ùå Could not fetch https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips after 3 retries ‚Äî marking as Unavailable.

üîÑ Scraping missing content for: https://www.connectwise.com/blog/p

In [25]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

# === STEP 1: LOAD DATA ===
df = pd.read_csv('../data/features.csv')
print(f"‚úÖ Loaded {len(df)} rows")

# === STEP 2: HANDLE MISSING DATA ===
df = df.fillna({'flesch_reading_ease': 0, 'word_count': 0, 'sentence_count': 0, 'top_keywords': ''})

# === STEP 3: ADD EXTRA FEATURES ===
df['keyword_density'] = df['top_keywords'].apply(lambda x: len(str(x).split('|')) if isinstance(x, str) else 0)
df['readability_bin'] = pd.cut(df['flesch_reading_ease'], bins=[0, 30, 50, 70, 100],
                               labels=[1, 2, 3, 4]).astype(float).fillna(0)

# === STEP 4: ADD SEMANTIC EMBEDDINGS ===
print("üîÑ Generating semantic embeddings (this may take ~2‚Äì3 minutes)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['body_text'].astype(str).tolist(), show_progress_bar=True)
embeddings = np.array(embeddings)

# Dimensionality reduction to make it faster
pca = PCA(n_components=10, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

# === STEP 5: COMBINE ALL FEATURES ===
extra_features = df[['word_count', 'sentence_count', 'flesch_reading_ease',
                     'keyword_density', 'readability_bin']].to_numpy()
X_full = np.hstack([extra_features, reduced_embeddings])

# === STEP 6: LABEL QUALITY ===
def label_quality(row):
    if row['word_count'] > 1500 and 50 <= row['flesch_reading_ease'] <= 70:
        return 'High'
    elif row['word_count'] < 500 or row['flesch_reading_ease'] < 30:
        return 'Low'
    else:
        return 'Medium'

df['quality_label'] = df.apply(label_quality, axis=1)
y = df['quality_label']

# === STEP 7: SPLIT DATA ===
X_train, X_test, y_train, y_test = train_test_split(X_full, y, test_size=0.3,
                                                    random_state=42, stratify=y)

# === STEP 8: SCALE ===
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# === STEP 9: TRAIN HIGH-ACCURACY RANDOM FOREST ===
params = {
    'n_estimators': [200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True]
}

rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rf, param_grid=params, cv=3, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

best_rf = grid.best_estimator_
print("\nüèÜ Best Parameters:", grid.best_params_)

# === STEP 10: EVALUATE ===
y_pred = best_rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred))
print(f"‚úÖ Accuracy: {acc:.3f}")

# === STEP 11: SAVE MODEL ===
joblib.dump(best_rf, '../models/quality_model_hybrid.pkl')
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(pca, '../models/pca.pkl')
print("üíæ Saved hybrid model (model + scaler + PCA) to ../models/")


‚úÖ Loaded 81 rows
üîÑ Generating semantic embeddings (this may take ~2‚Äì3 minutes)...


KeyError: 'body_text'

In [19]:
df = pd.read_csv('../data/extracted_content.csv')
df.head()

Unnamed: 0,url,title,body_text,word_count
0,https://www.cm-alliance.com/cybersecurity-blog,Cyber Security Blog,Cyber Crisis Tabletop Exercise Cyber Security ...,326
1,https://www.varonis.com/blog/cybersecurity-tips,Top 10 Cybersecurity Awareness Tips: How to St...,The #1 Data Security Platform WHERE TO BUY CAP...,5460
2,https://www.cisecurity.org/insights/blog/11-cy...,11 Cyber Defense Tips to Stay Secure at Work a...,Home Insights Blog Posts 11 Cyber Defense Tips...,2011
3,https://www.cisa.gov/topics/cybersecurity-best...,Cybersecurity Best Practices | Cybersecurity a...,An official website of the United States gover...,1438
4,https://www.qnbtrust.bank/Resources/Learning-C...,Unavailable,Page unavailable or restricted,0


In [20]:
import pandas as pd
import numpy as np
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# === STEP 1: LOAD CLEAN EXTRACTED CONTENT ===
df = pd.read_csv('../data/extracted_content.csv')
print(f"‚úÖ Loaded {len(df)} rows from extracted_content.csv")

# === STEP 2: BASIC CLEANING ===
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = ' '.join(text.split())  # remove extra spaces
    return text.lower()

df['clean_text'] = df['body_text'].apply(clean_text)

# === STEP 3: FEATURE EXTRACTION ===

# Word & sentence counts (already have word_count from earlier)
df['sentence_count'] = df['clean_text'].apply(lambda x: len(x.split('.')))
df['flesch_reading_ease'] = df['clean_text'].apply(lambda x: textstat.flesch_reading_ease(x) if len(x) > 20 else 0)

# === STEP 4: TOP 5 KEYWORDS (TF-IDF) ===
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = vectorizer.fit_transform(df['clean_text'])

# Get top 5 keywords per document
def top_keywords_per_doc(tfidf_vector, feature_names, top_n=5):
    sorted_nzs = np.argsort(tfidf_vector.toarray()).flatten()[::-1]
    top_features = [feature_names[i] for i in sorted_nzs[:top_n]]
    return '|'.join(top_features)

feature_names = np.array(vectorizer.get_feature_names_out())
df['top_keywords'] = [
    '|'.join(feature_names[idx] for idx in np.argsort(row.toarray()).flatten()[-5:])
    for row in X_tfidf
]

# === STEP 5: EMBEDDINGS (for duplicate detection later) ===
print("üîÑ Generating embeddings ‚Äî this may take 2‚Äì3 minutes...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['clean_text'].tolist(), show_progress_bar=True)
df['embedding'] = embeddings.tolist()

# === STEP 6: SAVE FEATURES ===
df_features = df[['url', 'title', 'word_count', 'sentence_count',
                  'flesch_reading_ease', 'top_keywords', 'embedding']]

df_features.to_csv('../data/features.csv', index=False)
print("‚úÖ Saved features.csv successfully!")

# === STEP 7: QUICK SUMMARY ===
print("\nüìä Summary:")
print("Average readability:", round(df['flesch_reading_ease'].mean(), 2))
print("Average sentence count:", round(df['sentence_count'].mean(), 2))
print("Sample keywords for first 3 rows:")
print(df_features[['url', 'top_keywords']].head(3))


‚úÖ Loaded 81 rows from extracted_content.csv
üîÑ Generating embeddings ‚Äî this may take 2‚Äì3 minutes...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

‚úÖ Saved features.csv successfully!

üìä Summary:
Average readability: 34.34
Average sentence count: 307.83
Sample keywords for first 3 rows:
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   

                                       top_keywords  
0  security|management|training|cybersecurity|cyber  
1                sensitive|app|access|security|data  
2             device|use|protect|don|authentication  


In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# === STEP 1: LOAD FEATURES ===
df = pd.read_csv('../data/features.csv')
print(f"‚úÖ Loaded {len(df)} rows from features.csv")

# Convert embedding strings back to numeric arrays
def parse_embedding(x):
    try:
        if isinstance(x, str) and x.startswith('['):
            return np.fromstring(x.strip('[]'), sep=',')
        elif isinstance(x, list) or isinstance(x, np.ndarray):
            return np.array(x)
        else:
            return np.zeros(384)
    except:
        return np.zeros(384)

df['embedding'] = df['embedding'].apply(parse_embedding)

# === STEP 2: COSINE SIMILARITY MATRIX ===
print("üîÑ Computing cosine similarity between pages...")
embeddings_matrix = np.vstack(df['embedding'].to_numpy())
similarity_matrix = cosine_similarity(embeddings_matrix)

# === STEP 3: IDENTIFY DUPLICATES ===
threshold = 0.80
duplicates = []

for i in range(len(similarity_matrix)):
    for j in range(i + 1, len(similarity_matrix)):
        sim = similarity_matrix[i, j]
        if sim > threshold:
            duplicates.append([df.iloc[i]['url'], df.iloc[j]['url'], round(sim, 3)])

dup_df = pd.DataFrame(duplicates, columns=['url1', 'url2', 'similarity'])
dup_df.to_csv('../data/duplicates.csv', index=False)

# === STEP 4: FLAG THIN CONTENT PAGES (<500 words) ===
df['is_thin'] = df['word_count'] < 500

# === STEP 5: PRINT SUMMARY ===
print("\nüìä Duplicate Detection Summary")
print(f"Total pages analyzed: {len(df)}")
print(f"Duplicate pairs found: {len(dup_df)}")
print(f"Thin content pages (<500 words): {df['is_thin'].sum()}")
print("\n‚úÖ Saved duplicate pairs to ../data/duplicates.csv")

dup_df.head(10)


‚úÖ Loaded 81 rows from features.csv
üîÑ Computing cosine similarity between pages...

üìä Duplicate Detection Summary
Total pages analyzed: 81
Duplicate pairs found: 49
Thin content pages (<500 words): 23

‚úÖ Saved duplicate pairs to ../data/duplicates.csv


Unnamed: 0,url1,url2,similarity
0,https://www.qnbtrust.bank/Resources/Learning-C...,https://remotedesktop.google.com/,1.0
1,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.cloudflare.com/learning/access-man...,1.0
2,https://www.qnbtrust.bank/Resources/Learning-C...,https://towardsdatascience.com/machine-learnin...,1.0
3,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.analyticsvidhya.com/blog/2021/09/c...,1.0
4,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.investopedia.com/terms/s/seo.asp,1.0
5,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.reuters.com/technology/artificial-...,1.0
6,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.cnbc.com/artificial-intelligence/,1.0
7,https://www.qnbtrust.bank/Resources/Learning-C...,https://www.bbc.com/news/topics/c404v061z99t,1.0
8,https://www.fortinet.com/resources/cyberglossa...,https://www.fortinet.com/resources/cyberglossa...,1.0
9,https://www.fortinet.com/resources/cyberglossa...,https://www.fortinet.com/solutions/enterprise-...,1.0


‚úÖ Loaded 81 rows
Fitting 3 folds for each of 24 candidates, totalling 72 fits

üèÜ Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

üìä Classification Report:
              precision    recall  f1-score   support

        High       1.00      0.67      0.80         3
         Low       0.87      1.00      0.93        13
      Medium       0.88      0.78      0.82         9

    accuracy                           0.88        25
   macro avg       0.91      0.81      0.85        25
weighted avg       0.89      0.88      0.88        25

‚úÖ Accuracy: 0.880
üíæ Saved hybrid model (model + scaler + PCA) to ../models/


In [27]:
import pandas as pd
df = pd.read_csv('../data/features.csv')
print(df.columns)


Index(['url', 'title', 'word_count', 'sentence_count', 'flesch_reading_ease',
       'top_keywords', 'embedding'],
      dtype='object')


‚úÖ Accuracy: 0.762
              precision    recall  f1-score   support

        High       0.50      0.33      0.40         3
         Low       0.85      1.00      0.92        11
      Medium       0.67      0.57      0.62         7

    accuracy                           0.76        21
   macro avg       0.67      0.63      0.64        21
weighted avg       0.74      0.76      0.74        21



In [31]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.utils import resample

# === STEP 1: LOAD DATA ===
df = pd.read_csv('../data/features.csv')
print(f"‚úÖ Loaded {len(df)} rows")

# === STEP 2: HANDLE MISSING DATA ===
df = df.fillna({
    'flesch_reading_ease': 0, 'word_count': 0, 'sentence_count': 0,
    'top_keywords': '', 'embedding': ''
})

# === STEP 3: EXTRA FEATURES ===
df['keyword_density'] = df['top_keywords'].apply(
    lambda x: len(str(x).split('|')) if isinstance(x, str) else 0
)
df['readability_bin'] = pd.cut(
    df['flesch_reading_ease'], bins=[0, 30, 50, 70, 100],
    labels=[1, 2, 3, 4]
).astype(float).fillna(0)

# === STEP 4: LOAD EMBEDDINGS ===
def parse_embedding(x):
    try:
        return np.array(eval(x))
    except:
        return np.zeros(384)

embeddings = np.vstack(df['embedding'].apply(parse_embedding).values)

# === STEP 5: REDUCE DIMENSIONALITY ===
pca = PCA(n_components=10, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

# === STEP 6: COMBINE FEATURES ===
extra_features = df[['word_count', 'sentence_count', 'flesch_reading_ease',
                     'keyword_density', 'readability_bin']].to_numpy()
X_full = np.hstack([extra_features, reduced_embeddings])

# === STEP 7: LABEL QUALITY ===
def label_quality(row):
    if row['word_count'] > 1500 and 50 <= row['flesch_reading_ease'] <= 70:
        return 'High'
    elif row['word_count'] < 500 or row['flesch_reading_ease'] < 30:
        return 'Low'
    else:
        return 'Medium'

df['quality_label'] = df.apply(label_quality, axis=1)
y = df['quality_label']

# === STEP 8: BALANCE CLASSES ===
low = df[df.quality_label == 'Low']
med = df[df.quality_label == 'Medium']
high = df[df.quality_label == 'High']

low_up = resample(low, replace=True, n_samples=len(med), random_state=42)
high_up = resample(high, replace=True, n_samples=len(med), random_state=42)
df_balanced = pd.concat([low_up, med, high_up])

# Update features and labels after balancing
extra_features = df_balanced[['word_count', 'sentence_count', 'flesch_reading_ease',
                              'keyword_density', 'readability_bin']].to_numpy()
embeddings = np.vstack(df_balanced['embedding'].apply(parse_embedding).values)
reduced_embeddings = pca.fit_transform(embeddings)
X_full = np.hstack([extra_features, reduced_embeddings])
y = df_balanced['quality_label']

print("üîÑ Class distribution after balancing:")
print(y.value_counts())

# === STEP 9: SPLIT + SCALE ===
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.3, random_state=42, stratify=y
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# === STEP 10: RANDOM FOREST + GRIDSEARCH ===
params = {
    'n_estimators': [300, 400],
    'max_depth': [15, 20, None],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True]
}

rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rf, param_grid=params, cv=3, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
best_rf = grid.best_estimator_
print("\nüèÜ Best Parameters:", grid.best_params_)

# === STEP 11: EVALUATE ===
y_pred = best_rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred))
print(f"‚úÖ Accuracy (single split): {acc:.3f}")

# === STEP 12: 5-FOLD CROSS VALIDATION ===
X_scaled = scaler.fit_transform(X_full)
cv_scores = cross_val_score(best_rf, X_scaled, y, cv=5)
print("\nüîÅ Cross-validation accuracies:", np.round(cv_scores, 3))
print("‚úÖ Mean CV accuracy:", round(cv_scores.mean(), 3))

# === STEP 13: SAVE MODEL ===
joblib.dump(best_rf, '../models/quality_model_hybrid.pkl')
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(pca, '../models/pca.pkl')
print("üíæ Saved hybrid model (model + scaler + PCA) to ../models/")


‚úÖ Loaded 81 rows
üîÑ Class distribution after balancing:
Low       29
Medium    29
High      29
Name: quality_label, dtype: int64
Fitting 3 folds for each of 24 candidates, totalling 72 fits

üèÜ Best Parameters: {'bootstrap': True, 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

üìä Classification Report:
              precision    recall  f1-score   support

        High       0.90      1.00      0.95         9
         Low       1.00      1.00      1.00         9
      Medium       1.00      0.89      0.94         9

    accuracy                           0.96        27
   macro avg       0.97      0.96      0.96        27
weighted avg       0.97      0.96      0.96        27

‚úÖ Accuracy (single split): 0.963

üîÅ Cross-validation accuracies: [0.944 0.778 1.    0.765 0.941]
‚úÖ Mean CV accuracy: 0.886
üíæ Saved hybrid model (model + scaler + PCA) to ../models/


In [40]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import requests
from bs4 import BeautifulSoup

# === LOAD TRAINED MODEL ===
model = joblib.load('../models/quality_model_hybrid.pkl')
scaler = joblib.load('../models/scaler.pkl')
pca = joblib.load('../models/pca.pkl')

# === LOAD BASE DATA FOR DUPLICATE CHECK ===
df_base = pd.read_csv('../data/features.csv')

# Parse stored embeddings safely
def parse_embedding(x):
    try:
        return np.array(eval(x))
    except:
        return np.zeros(384)

base_embeddings = np.vstack(df_base['embedding'].apply(parse_embedding).values)
base_reduced = pca.transform(base_embeddings)

# === UTILITIES ===
def clean_text(text):
    return ' '.join(text.split())

def fetch_page_text(url):
    """Fetches and extracts visible text from a URL."""
    try:
        r = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        soup = BeautifulSoup(r.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.extract()
        text = clean_text(soup.get_text(separator=' '))
        return text
    except Exception as e:
        print(f"‚ö†Ô∏è Error fetching {url}: {e}")
        return ""

# === MAIN ANALYSIS FUNCTION ===
def analyze_url(url):
    print(f"\nüîé Analyzing: {url}")
    text = fetch_page_text(url)
    if len(text.strip()) == 0:
        print("‚ö†Ô∏è Empty or unreadable page.")
        return

    # Text stats
    word_count = len(text.split())
    sentence_count = text.count('.') + text.count('!') + text.count('?')

    # Compute readability (simple approximation)
    avg_words_per_sentence = word_count / max(sentence_count, 1)
    flesch_score = 206.835 - (1.015 * avg_words_per_sentence) - (84.6 * (word_count / max(len(text.split()), 1)))

    # Embedding + PCA
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    new_embed = embed_model.encode([text])
    new_reduced = pca.transform(new_embed)

    # Assemble features
    keyword_density = len(set(text.lower().split())) / max(word_count, 1) * 100
    readability_bin = np.digitize(flesch_score, [0, 30, 50, 70, 100])
    features = np.array([[word_count, sentence_count, flesch_score,
                          keyword_density, readability_bin]])
    X_new = np.hstack([features, new_reduced])
    X_scaled = scaler.transform(X_new)

    # Predict
    pred = model.predict(X_scaled)[0]
    proba = model.predict_proba(X_scaled)[0]
    print(f"‚úÖ Predicted Quality: {pred}")
    print(f"üìä Probabilities ‚Üí High: {proba[model.classes_ == 'High'][0]:.2f}, "
          f"Medium: {proba[model.classes_ == 'Medium'][0]:.2f}, "
          f"Low: {proba[model.classes_ == 'Low'][0]:.2f}")

    # Duplicate check
    dup_sim = cosine_similarity(new_reduced, base_reduced)[0]
    top_idx = np.argmax(dup_sim)
    print(f"ü™û Most similar page: {df_base.iloc[top_idx]['url']}")
    print(f"üîÅ Similarity score: {dup_sim[top_idx]:.3f}")

    if dup_sim[top_idx] > 0.85:
        print("‚ö†Ô∏è Duplicate or near-duplicate content detected!")
    elif dup_sim[top_idx] > 0.70:
        print("‚ö†Ô∏è Potential partial overlap.")
    else:
        print("‚úÖ Unique content.")
analyze_url("https://www.ibm.com/topics/cybersecurity")
analyze_url("https://www.varonis.com/blog/cybersecurity-tips")





üîé Analyzing: https://www.ibm.com/topics/cybersecurity
‚úÖ Predicted Quality: High
üìä Probabilities ‚Üí High: 0.51, Medium: 0.45, Low: 0.04
ü™û Most similar page: https://www.varonis.com/blog/cybersecurity-tips
üîÅ Similarity score: 0.868
‚ö†Ô∏è Duplicate or near-duplicate content detected!

üîé Analyzing: https://www.varonis.com/blog/cybersecurity-tips
‚úÖ Predicted Quality: Medium
üìä Probabilities ‚Üí High: 0.40, Medium: 0.55, Low: 0.05
ü™û Most similar page: https://www.varonis.com/blog/cybersecurity-tips
üîÅ Similarity score: 0.980
‚ö†Ô∏è Duplicate or near-duplicate content detected!
