## Importing packages:

In [1]:
import sys
import subprocess
import pkgutil

required = [
    "beautifulsoup4",
    "lxml",
    "requests",
    "pandas",
    "scikit-learn",
    "nltk",
    "textstat",
    "sentence_transformers",
    "tqdm",
]

In [2]:
for pkg in required:
    if not pkgutil.find_loader(pkg):
        try:
            print(f"Installing {pkg}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        except Exception as e:
            print(f"Could not install {pkg}: {e}")

  if not pkgutil.find_loader(pkg):


Installing beautifulsoup4...
Installing scikit-learn...


In [3]:
import os
import time
import json
from urllib.parse import urlparse
from collections import Counter

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

import textstat

try:
    from sentence_transformers import SentenceTransformer
    have_sbert = True
except Exception:
    have_sbert = False
    print("sentence_transformers not available — will fallback to TF-IDF+SVD for embeddings")

[nltk_data] Downloading package punkt to C:\Users\Neelanjan
[nltk_data]     Dutta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


sentence_transformers not available — will fallback to TF-IDF+SVD for embeddings


In [4]:
INPUT_CSV_PATH = "G:/seo-content-detector/data/data.csv"
EXTRACTED_CSV = "G:/seo-content-detector/data/extracted_pages.csv"
FEATURES_CSV = "G:/seo-content-detector/data/features.csv"
DUPLICATES_CSV = "G:/seo-content-detector/data/duplicates.csv"
QUALITY_CSV = "G:/seo-content-detector/data/quality_results.csv"

USER_AGENT = 'Mozilla/5.0 (compatible; SEOQualityBot/1.0; +https://example.com/bot)'
REQUEST_DELAY = 1.2  # seconds between requests when scraping
SIMILARITY_THRESHOLD = 0.80
EMBEDDING_DIM = 384 

In [5]:
session = requests.Session()
session.headers.update({'User-Agent': USER_AGENT})


def safe_get(url, timeout=10):
    try:
        r = session.get(url, timeout=timeout)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(f"Request failed for {url}: {e}")
        return None

## HTML Parsing:

In [6]:
def extract_from_html(html, url=None):
    """Extract title and main body text from HTML content."""
    try:
        soup = BeautifulSoup(html, 'lxml')
    except Exception:
        soup = BeautifulSoup(html, 'html.parser')

    # Title
    title = ''
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    # Try common containers for main content
    candidates = []
    for selector in ['article', 'main', 'div[class*="content"]', 'div[class*="post"]',
                     'div[id*="content"]', 'section']:
        found = soup.select(selector)
        if found:
            for f in found:
                text = f.get_text(separator=' ', strip=True)
                if len(text.split()) > 30:
                    candidates.append(text)
    # fallback to concatenation of <p> tags
    if not candidates:
        p_tags = soup.find_all('p')
        p_texts = [p.get_text(separator=' ', strip=True) for p in p_tags if p.get_text(strip=True)]
        # join paragraphs
        joined = '\n\n'.join(p_texts)
        candidates.append(joined)

    # choose the longest candidate
    body_text = max(candidates, key=lambda t: len(t)) if candidates else ''

    # Clean excessive whitespace
    body_text = ' '.join(body_text.split())

    # Word count
    words = body_text.split()
    word_count = len(words)

    return {
        'title': title,
        'body_text': body_text,
        'word_count': word_count
    }

## Reading input csv and extracting content:

In [7]:
def build_extracted_csv(input_csv=INPUT_CSV_PATH, output_csv=EXTRACTED_CSV, force_scrape=False):
    df = pd.read_csv(input_csv)
    # Expect columns: url, html_content (optional)

    results = []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc='Processing rows'):
        url = row.get('url') if 'url' in row.index else None
        html = None
        if 'html_content' in row.index and not pd.isna(row['html_content']) and not force_scrape:
            html = row['html_content']
        else:
            if not url:
                print(f"Row {idx} has no url and no html_content. Skipping")
                continue
            html = safe_get(url)
            time.sleep(REQUEST_DELAY)

        if not html:
            # write empty placeholders
            results.append({'url': url, 'title': '', 'body_text': '', 'word_count': 0})
            continue

        try:
            extracted = extract_from_html(html, url=url)
            results.append({'url': url, 'title': extracted['title'], 'body_text': extracted['body_text'], 'word_count': extracted['word_count']})
        except Exception as e:
            print(f"Extraction failed for {url}: {e}")
            results.append({'url': url, 'title': '', 'body_text': '', 'word_count': 0})

    outdf = pd.DataFrame(results)
    outdf.to_csv(output_csv, index=False)
    print(f"Saved extracted content to {output_csv}")
    return outdf

## Feature engineering:

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer


def compute_readability(text):
    try:
        # textstat returns a float. If failure, return np.nan
        return float(textstat.flesch_reading_ease(text))
    except Exception:
        # fallback: compute approximate Flesch score
        try:
            sentences = max(1, len(sent_tokenize(text)))
            words = word_tokenize(text)
            word_count = len(words) if words else 0
            # estimate syllables roughly using textstat if available else naive rule
            syllables = textstat.syllable_count(text) if hasattr(textstat, 'syllable_count') else max(1, int(0.5 * word_count))
            flesch = 206.835 - 1.015 * (word_count / sentences) - 84.6 * (syllables / max(1, word_count))
            return float(flesch)
        except Exception:
            return np.nan


def top_keywords_tfidf(texts, top_n=5, ngram_range=(1,2), max_features=5000):
    # texts: list of documents. We'll compute TF-IDF and return top keywords per document
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features, ngram_range=ngram_range)
    X = vectorizer.fit_transform(texts)
    feature_names = np.array(vectorizer.get_feature_names_out())
    top_keywords = []
    for i in range(X.shape[0]):
        row = X[i].toarray().ravel()
        if row.sum() == 0:
            top_keywords.append([])
            continue
        topn_ids = row.argsort()[-top_n:][::-1]
        top_keywords.append(feature_names[topn_ids].tolist())
    return top_keywords, vectorizer, X


# Embeddings: try SBERT, else TF-IDF + TruncatedSVD

def get_embeddings(documents, model_name='all-MiniLM-L6-v2', svd_dim=128):
    documents = [d if isinstance(d, str) else '' for d in documents]
    if have_sbert:
        try:
            model = SentenceTransformer(model_name)
            embs = model.encode(documents, show_progress_bar=True, batch_size=32)
            return np.array(embs)
        except Exception as e:
            print(f"SBERT encoding failed: {e} — falling back to TF-IDF+SVD")

    # fallback
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
    X = tfidf.fit_transform(documents)
    # reduce to dense vector via SVD
    svd = TruncatedSVD(n_components=min(svd_dim, X.shape[1]-1 if X.shape[1]>1 else 1), random_state=42)
    X_reduced = svd.fit_transform(X)
    # normalize rows
    norms = np.linalg.norm(X_reduced, axis=1, keepdims=True)
    norms[norms==0] = 1.0
    X_reduced = X_reduced / norms
    return X_reduced

## Build features data frame:

In [9]:
def build_features(extracted_df, features_csv=FEATURES_CSV):
    df = extracted_df.copy()
    df['text_clean'] = df['body_text'].fillna('').str.lower().str.replace('\\s+', ' ', regex=True).str.strip()
    df['sentence_count'] = df['text_clean'].apply(lambda t: len(sent_tokenize(t)) if t.strip() else 0)
    df['flesch_reading_ease'] = df['text_clean'].apply(lambda t: compute_readability(t) if t.strip() else np.nan)

    texts = df['text_clean'].tolist()
    top_keywords, tfidf_vectorizer, tfidf_matrix = top_keywords_tfidf(texts, top_n=5)
    df['top_keywords'] = top_keywords

    # === NEW EMBEDDING LOGIC ===
    # We fit the models here, assuming SBERT fallback
    print("Fitting TF-IDF and SVD models...")
    tfidf_model = TfidfVectorizer(stop_words='english', max_features=5000)
    X_tfidf = tfidf_model.fit_transform(texts)

    # Determine SVD components, using 128 as the target (from old get_embeddings)
    n_comps = min(128, X_tfidf.shape[1] - 1 if X_tfidf.shape[1] > 1 else 1)
    if n_comps <= 0: n_comps = 1 # Failsafe for empty/tiny corpus

    svd_model = TruncatedSVD(n_components=n_comps, random_state=42)
    embeddings = svd_model.fit_transform(X_tfidf)

    # Normalize
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms==0] = 1.0
    embeddings = embeddings / norms
    
    print(f"Created embeddings with shape {embeddings.shape}")
    
    df['embedding'] = embeddings.tolist()
    df['is_thin'] = df['word_count'] < 500
    df.to_csv(features_csv, index=False)
    print(f"Saved features to {features_csv}")
    
    # === RETURN THE FITTED MODELS ===
    return df, embeddings, tfidf_model, svd_model

## Duplicate detection:

In [10]:
def detect_duplicates(df, embeddings, threshold=SIMILARITY_THRESHOLD, output_csv=DUPLICATES_CSV):
    # Compute pairwise cosine similarity
    if embeddings is None or len(embeddings) == 0:
        print("No embeddings provided")
        return pd.DataFrame(columns=['url1','url2','similarity'])

    # If embeddings are not normalized, cosine_similarity still works
    sim_matrix = cosine_similarity(embeddings)
    n = sim_matrix.shape[0]
    pairs = []
    for i in range(n):
        for j in range(i+1, n):
            sim = float(sim_matrix[i,j])
            if sim >= threshold:
                pairs.append({'url1': df.iloc[i]['url'], 'url2': df.iloc[j]['url'], 'similarity': sim})
    pairs_df = pd.DataFrame(pairs)
    pairs_df.to_csv(output_csv, index=False)
    print(f"Saved {len(pairs)} duplicate pairs to {output_csv}")
    return pairs_df

## Labelling and classification:

In [11]:
def create_quality_labels(df):
    # Synthetic labels per assignment rules
    labels = []
    for _, row in df.iterrows():
        wc = row['word_count'] if not pd.isna(row['word_count']) else 0
        fr = row['flesch_reading_ease'] if not pd.isna(row['flesch_reading_ease']) else -999
        if wc > 1500 and 50 <= fr <= 70:
            labels.append('High')
        elif wc < 500 or fr < 30:
            labels.append('Low')
        else:
            labels.append('Medium')
    df['quality_label'] = labels
    return df


def train_quality_model(df, features_to_use=['word_count','sentence_count','flesch_reading_ease']):
    # Prepare dataset
    X = df[features_to_use].fillna(0).values
    y = df['quality_label'].values

    # Encode labels
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y_enc, df.index, test_size=0.3, random_state=42, stratify=y_enc)

    # Train RandomForest
    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    report = classification_report(y_test, y_pred, target_names=le.classes_, output_dict=True)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)

    # Baseline: rule-based using only word_count
    baseline_pred = []
    for row in X_test:
        wc = row[0]
        if wc > 1500:
            baseline_pred.append(le.transform(['High'])[0])
        elif wc < 500:
            baseline_pred.append(le.transform(['Low'])[0])
        else:
            baseline_pred.append(le.transform(['Medium'])[0])
    baseline_acc = accuracy_score(y_test, baseline_pred)

    # Feature importances
    importances = clf.feature_importances_
    feat_imp = sorted(list(zip(features_to_use, importances)), key=lambda x: x[1], reverse=True)

    # assemble results dataframe for test set
    results_df = df.loc[idx_test].copy()
    results_df['true_label'] = le.inverse_transform(y_test)
    results_df['pred_label'] = le.inverse_transform(y_pred)

    summary = {
        'accuracy': acc,
        'f1_weighted': f1,
        'baseline_accuracy': baseline_acc,
        'classification_report': report,
        'confusion_matrix': cm.tolist(),
        'feature_importances': feat_imp
    }

    return clf, le, summary, results_df

## Real-time analysis function analyze_url:

In [12]:
def analyze_url(url, existing_features_df, embeddings, tfidf_model, svd_model, similarity_threshold=SIMILARITY_THRESHOLD):
    html = safe_get(url)
    if not html:
        return {'url': url, 'error': 'Failed to fetch URL'}

    extracted = extract_from_html(html, url=url)
    text_clean = extracted['body_text'].lower().strip()
    sentence_count = len(sent_tokenize(text_clean)) if text_clean else 0
    flesch = compute_readability(text_clean) if text_clean else np.nan

    # === NEW EMBEDDING LOGIC ===
    # Use the FITTED models to transform the new text
    try:
        X_tfidf_new = tfidf_model.transform([text_clean])
        emb_vector_reduced = svd_model.transform(X_tfidf_new)
        
        # Normalize
        norm = np.linalg.norm(emb_vector_reduced)
        if norm == 0: norm = 1.0
        emb_vector_normalized = emb_vector_reduced / norm
        
        # This vector is shape (1, n_comps), e.g., (1, 81)
        
    except Exception as e:
        print(f"Error during embedding transform: {e}")
        # Create a zero vector of the correct shape so comparison doesn't fail
        emb_vector_normalized = np.zeros((1, embeddings.shape[1])) 
                
    # === COMPARE TO EXISTING EMBEDDINGS ===
    similar_to = []
    if existing_features_df is not None and embeddings is not None and len(embeddings) > 0:
        # emb_vector_normalized is (1, n_comps). embeddings is (81, n_comps). This works.
        sims = cosine_similarity(emb_vector_normalized, embeddings)[0]
        for i, s in enumerate(sims):
            if s >= similarity_threshold:
                similar_to.append({'url': existing_features_df.iloc[i]['url'], 'similarity': float(s)})

    # quality label rules
    wc = extracted['word_count']
    if wc > 1500 and 50 <= flesch <= 70:
        label = 'High'
    elif wc < 500 or (not np.isnan(flesch) and flesch < 30):
        label = 'Low'
    else:
        label = 'Medium'

    result = {
        'url': url,
        'word_count': wc,
        'readability': flesch,
        'quality_label': label,
        'is_thin': wc < 500,
        'similar_to': similar_to
    }
    return result

## Pipeline:

In [14]:
if __name__ == '__main__':
    if not os.path.exists(INPUT_CSV_PATH):
        print(f"Input CSV not found at {INPUT_CSV_PATH}.")
        print("Please make sure 'data/data.csv' exists.")
    else:
        # --- ORIGINAL CODE ---
        # 1) We are now RUNNING the slow function to create the file.
        print("Running 'build_extracted_csv' to parse all 81 rows...")
        extracted_df = build_extracted_csv(INPUT_CSV_PATH, EXTRACTED_CSV)
        print("Extraction complete.")
        # --- END OF ORIGINAL CODE ---

        if extracted_df.empty or extracted_df['word_count'].sum() == 0:
            print("Extraction resulted in no data. Stopping pipeline.")
        else:
            # 2) Build features and get fitted models
            # These variables are made global so the next cells can use them
            global features_df, embeddings, global_tfidf_model, global_svd_model
            features_df, embeddings, global_tfidf_model, global_svd_model = build_features(extracted_df, FEATURES_CSV)
            embeddings = np.array(embeddings) 
            
            # 3) Duplicate detection
            print("\nRunning duplicate detection...")
            duplicates_df = detect_duplicates(features_df, embeddings, SIMILARITY_THRESHOLD, DUPLICATES_CSV)

            # 4) Create labels and train classifier
            print("\nTraining classification model...")
            features_labeled = create_quality_labels(features_df)
            
            # 'clf' is our trained classifier
            global clf, le
            clf, le, summary, results_df = train_quality_model(features_labeled)

            if clf is not None:
                # Save quality results
                os.makedirs(os.path.dirname(QUALITY_CSV), exist_ok=True)
                results_df.to_csv(QUALITY_CSV, index=False)
                print("\n--- Model Training Summary ---")
                print(json.dumps(summary, indent=2))
            else:
                print("Model training was skipped.")

Running 'build_extracted_csv' to parse all 81 rows...


Processing rows:   2%|█▋                                                                | 2/81 [00:00<00:07,  9.93it/s]

Request failed for https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips: 403 Client Error: Forbidden for url: https://www.qnbtrust.bank/Resources/Learning-Center/Blog/7-cyber-security-tips


Processing rows:  22%|██████████████▍                                                  | 18/81 [00:07<00:19,  3.18it/s]

Request failed for https://www.hpe.com/us/en/what-is/sd-wan.html: HTTPSConnectionPool(host='www.hpe.com', port=443): Read timed out. (read timeout=10)


Processing rows:  28%|██████████████████▍                                              | 23/81 [00:22<01:24,  1.45s/it]

Request failed for https://www.cloudflare.com/learning/access-management/what-is-ztna/: 403 Client Error: Forbidden for url: https://www.cloudflare.com/learning/access-management/what-is-ztna/


Processing rows:  48%|███████████████████████████████▎                                 | 39/81 [00:26<00:05,  7.58it/s]

Request failed for https://towardsdatascience.com/machine-learning-basics-with-examples-part-1-c2d37247ec3d: 404 Client Error: Not Found for url: https://towardsdatascience.com/machine-learning-basics-with-examples-part-1-c2d37247ec3d


Processing rows:  51%|████████████████████████████████▉                                | 41/81 [00:28<00:21,  1.86it/s]

Request failed for https://www.analyticsvidhya.com/blog/2021/09/comprehensive-guide-on-machine-learning/: 404 Client Error: Not Found for url: https://www.analyticsvidhya.com/blog/2021/09/comprehensive-guide-on-machine-learning/


Processing rows:  62%|████████████████████████████████████████                         | 50/81 [00:32<00:08,  3.74it/s]

Request failed for https://www.investopedia.com/terms/s/seo.asp: 404 Client Error: Not Found for url: https://www.investopedia.com/terms/s/seo.asp


Processing rows:  91%|███████████████████████████████████████████████████████████▍     | 74/81 [00:45<00:02,  2.71it/s]

Request failed for https://www.reuters.com/technology/artificial-intelligence/: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/technology/artificial-intelligence/


Processing rows:  94%|████████████████████████████████████████████████████████████▉    | 76/81 [00:47<00:03,  1.34it/s]

Request failed for https://www.cnbc.com/artificial-intelligence/: 404 Client Error: Not Found for url: https://www.cnbc.com/artificial-intelligence/


Processing rows:  95%|█████████████████████████████████████████████████████████████▊   | 77/81 [00:52<00:06,  1.71s/it]

Request failed for https://www.bbc.com/news/topics/c404v061z99t: 404 Client Error: Not Found for url: https://www.bbc.com/news/topics/c404v061z99t


Processing rows: 100%|█████████████████████████████████████████████████████████████████| 81/81 [00:55<00:00,  1.46it/s]


Saved extracted content to G:/seo-content-detector/data/extracted_pages.csv
Extraction complete.
Fitting TF-IDF and SVD models...
Created embeddings with shape (81, 81)
Saved features to G:/seo-content-detector/data/features.csv

Running duplicate detection...
Saved 4 duplicate pairs to G:/seo-content-detector/data/duplicates.csv

Training classification model...

--- Model Training Summary ---
{
  "accuracy": 0.92,
  "f1_weighted": 0.914074074074074,
  "baseline_accuracy": 0.36,
  "classification_report": {
    "High": {
      "precision": 1.0,
      "recall": 0.5,
      "f1-score": 0.6666666666666666,
      "support": 2.0
    },
    "Low": {
      "precision": 0.9285714285714286,
      "recall": 1.0,
      "f1-score": 0.9629629629629629,
      "support": 13.0
    },
    "Medium": {
      "precision": 0.9,
      "recall": 0.9,
      "f1-score": 0.9,
      "support": 10.0
    },
    "accuracy": 0.92,
    "macro avg": {
      "precision": 0.942857142857143,
      "recall": 0.79999999999

## Testing real-time analyzer:

In [15]:
import json
import numpy as np

# --- FIX 1: This must be a dictionary (key: value pairs) ---
test_cases = {
    "1": "https://www.google.com/",
    "2": "https://www.wikipedia.org/",
    "3": "https://en.wikipedia.org/wiki/Machine_learning"
}

print("--- Testing All 3 Real-Time Analyzer Cases ---")

try:
    # This loop now works because test_cases is a dict
    for case_name, test_url in test_cases.items():
        print(f"\n\n==============================================")
        print(f"Running Case: {case_name}")
        print(f"Analyzing URL: {test_url}")
        print("----------------------------------------------")
        
        # --- FIX 2: Cleaned up the function call (removed stray characters and spaces) ---
        result = analyze_url(test_url, 
                             existing_features_df=features_df, 
                             embeddings=embeddings, 
                             tfidf_model=global_tfidf_model,  # Pass the model
                             svd_model=global_svd_model,      # Pass the model
                             similarity_threshold=SIMILARITY_THRESHOLD)
        
        print(json.dumps(result, indent=2))
        print(f"==============================================")

except NameError as e:
    print(f"\n\n[ERROR] A required variable was not found. ({e})")
except Exception as e:
    print(f"\n\n[ERROR] An error occurred while analyzing: {e}")

--- Testing All 3 Real-Time Analyzer Cases ---


Running Case: 1
Analyzing URL: https://www.google.com/
----------------------------------------------
{
  "url": "https://www.google.com/",
  "word_count": 6,
  "readability": 62.79000000000002,
  "quality_label": "Low",
  "is_thin": true,
  "similar_to": []
}


Running Case: 2
Analyzing URL: https://www.wikipedia.org/
----------------------------------------------
{
  "url": "https://www.wikipedia.org/",
  "word_count": 934,
  "readability": 45.875150334075755,
  "quality_label": "Medium",
  "is_thin": false,
  "similar_to": []
}


Running Case: 3
Analyzing URL: https://en.wikipedia.org/wiki/Machine_learning
----------------------------------------------
{
  "url": "https://en.wikipedia.org/wiki/Machine_learning",
  "word_count": 18773,
  "readability": 30.68587377490718,
  "quality_label": "Medium",
  "is_thin": false,
  "similar_to": [
    {
      "url": "https://en.wikipedia.org/wiki/Machine_learning",
      "similarity": 1.000000000

In [16]:
import joblib
import os
import numpy as np

# --- Define all file paths ---
MODEL_PATH = "../models/quality_model.pkl"
TFIDF_PATH = "../models/tfidf_model.pkl"
SVD_PATH = "../models/svd_model.pkl"
EMBEDDINGS_PATH = "../data/embeddings.npy"

print("\n--- Saving all files for Streamlit app ---")

try:
    # Create directories if they don't exist
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    os.makedirs(os.path.dirname(EMBEDDINGS_PATH), exist_ok=True)

    # 1. Save the classification model (clf)
    joblib.dump(clf, MODEL_PATH)
    print(f"Successfully saved model to {MODEL_PATH}")
    
    # 2. Save the TF-IDF model
    joblib.dump(global_tfidf_model, TFIDF_PATH)
    print(f"Successfully saved TF-IDF model to {TFIDF_PATH}")
    
    # 3. Save the SVD model
    joblib.dump(global_svd_model, SVD_PATH)
    print(f"Successfully saved SVD model to {SVD_PATH}")
    
    # 4. Save the embeddings array
    np.save(EMBEDDINGS_PATH, embeddings)
    print(f"Successfully saved embeddings to {EMBEDDINGS_PATH}")
    
    print("\nAll files saved successfully!")

except NameError as e:
    print(f"\n[ERROR] A required variable was not found. ({e})")
    print("Please make sure you have run the main pipeline cell (Cell 5) first.")
except Exception as e:
    print(f"\n[ERROR] An error occurred while saving the files: {e}")


--- Saving all files for Streamlit app ---
Successfully saved model to ../models/quality_model.pkl
Successfully saved TF-IDF model to ../models/tfidf_model.pkl
Successfully saved SVD model to ../models/svd_model.pkl
Successfully saved embeddings to ../data/embeddings.npy

All files saved successfully!
