In [2]:
import os

import mysql.connector
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Connection settings are read from the environment when present so that
# credentials do not have to live in the notebook; the fallbacks are the
# values this notebook has always used for local development.
database = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", ""),
    # Select the schema at connect time instead of issuing a separate USE.
    database="nlp_thesis_similarity",
)
cursor = database.cursor()

In [4]:
# Load the sentence-embedding model by name; the cells below reuse `model`.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [5]:
# # Optional setup (disabled): first, make sure the embedding columns exist
# try:
#     # Check if columns exist first
#     cursor.execute("SHOW COLUMNS FROM dewey_papers LIKE 'title_embeddings_all-MiniLM-L6-v2'")
#     if not cursor.fetchone():
#         cursor.execute("ALTER TABLE dewey_papers ADD COLUMN `title_embeddings_all-MiniLM-L6-v2` JSON")
        
#     cursor.execute("SHOW COLUMNS FROM dewey_papers LIKE 'abstract_embeddings_all-MiniLM-L6-v2'")
#     if not cursor.fetchone():
#         cursor.execute("ALTER TABLE dewey_papers ADD COLUMN `abstract_embeddings_all-MiniLM-L6-v2` JSON")
        
#     database.commit()
#     print("Columns checked and added if needed")
# except Exception as e:
#     print(f"Error managing columns: {e}")
#     database.rollback()  # Rollback in case of error

In [6]:
import json

def get_embedding(text, model):
    """Encode ``text`` with ``model`` and return the vector as a plain list.

    Args:
        text: The string to embed; may be ``None`` or empty.
        model: Object exposing ``encode(text)`` whose result provides
            ``tolist()`` (a ``SentenceTransformer`` in this notebook).

    Returns:
        ``model.encode(text).tolist()``, or ``None`` when ``text`` is ``None``
        or the empty string (the model is not called in that case).
    """
    nothing_to_encode = text is None or text == ""
    return None if nothing_to_encode else model.encode(text).tolist()

def update_embeddings_in_batches(model, batch_size=100):
    """Compute and store title/abstract embeddings for papers that lack them.

    Selects every row of ``dewey_papers`` whose title or abstract embedding
    column is NULL, encodes both texts with ``get_embedding`` and writes the
    JSON-serialized vectors back in chunks of ``batch_size`` rows. A commit is
    issued after each chunk, so chunks already written are kept if a later
    chunk fails.

    Uses the notebook-level ``cursor`` and ``database`` objects.

    Args:
        model: Embedding model handed through to ``get_embedding``.
        batch_size: Number of papers encoded and written per commit; must be
            at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any error raised while querying, encoding or updating is
            printed, the open transaction is rolled back, and the error is
            re-raised.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    try:
        # Fetch every paper that is missing at least one of the two embeddings.
        cursor.execute("""
            SELECT id, title, abstract FROM dewey_papers
            WHERE `title_embeddings_all-MiniLM-L6-v2` IS NULL
            OR `abstract_embeddings_all-MiniLM-L6-v2` IS NULL
        """)
        all_papers = cursor.fetchall()

        print(f"Processing {len(all_papers)} papers that need embeddings...")

        if not all_papers:
            print("No papers need embeddings. Process complete.")
            return

        # Both columns are written on every update, so both embeddings are
        # always recomputed even if only one of them was missing.
        update_query = """
        UPDATE dewey_papers
        SET `title_embeddings_all-MiniLM-L6-v2` = %s, `abstract_embeddings_all-MiniLM-L6-v2` = %s
        WHERE id = %s
        """

        # Ceiling division: the last chunk may be shorter than batch_size.
        total_batches = (len(all_papers) + batch_size - 1) // batch_size

        for start in tqdm(range(0, len(all_papers), batch_size), total=total_batches, desc="Processing batches"):
            batch = all_papers[start:start + batch_size]
            updates = []

            for paper_id, title, abstract in tqdm(batch, leave=False, desc="Papers in batch"):
                title_embedding = get_embedding(title, model)
                abstract_embedding = get_embedding(abstract, model)

                # Nothing to store when both texts are missing or empty; such
                # rows stay NULL and will be selected again on the next run.
                if title_embedding is None and abstract_embedding is None:
                    continue

                # Compare against None explicitly so the serialization guard
                # matches the check above instead of relying on truthiness.
                updates.append((
                    json.dumps(title_embedding) if title_embedding is not None else None,
                    json.dumps(abstract_embedding) if abstract_embedding is not None else None,
                    paper_id,
                ))

            if updates:
                cursor.executemany(update_query, updates)
                database.commit()
                # tqdm.write keeps the message from breaking the active
                # progress bars, unlike a bare print().
                tqdm.write(f"Updated embeddings for {len(updates)} papers in batch")

    except Exception as e:
        print(f"Error during embedding update: {e}")
        database.rollback()
        raise

# Backfill the missing embeddings using the model loaded earlier in the notebook
update_embeddings_in_batches(model, batch_size=100)

print("Embedding process completed")

Processing 7463 papers that need embeddings...


Processing batches:   1%|▏         | 1/75 [00:01<01:21,  1.10s/it]

Updated embeddings for 100 papers in batch


Processing batches:   3%|▎         | 2/75 [00:02<01:16,  1.05s/it]

Updated embeddings for 100 papers in batch


Processing batches:   4%|▍         | 3/75 [00:03<01:18,  1.10s/it]

Updated embeddings for 100 papers in batch


Processing batches:   5%|▌         | 4/75 [00:04<01:17,  1.09s/it]

Updated embeddings for 100 papers in batch


Processing batches:   7%|▋         | 5/75 [00:05<01:15,  1.08s/it]

Updated embeddings for 100 papers in batch


Processing batches:   8%|▊         | 6/75 [00:06<01:14,  1.08s/it]

Updated embeddings for 100 papers in batch


Processing batches:   9%|▉         | 7/75 [00:07<01:15,  1.11s/it]

Updated embeddings for 100 papers in batch


Processing batches:  11%|█         | 8/75 [00:08<01:14,  1.11s/it]

Updated embeddings for 100 papers in batch


Processing batches:  12%|█▏        | 9/75 [00:09<01:11,  1.08s/it]

Updated embeddings for 100 papers in batch


Processing batches:  13%|█▎        | 10/75 [00:10<01:10,  1.08s/it]

Updated embeddings for 100 papers in batch


Processing batches:  15%|█▍        | 11/75 [00:11<01:08,  1.08s/it]

Updated embeddings for 100 papers in batch


Processing batches:  16%|█▌        | 12/75 [00:13<01:07,  1.08s/it]

Updated embeddings for 100 papers in batch


Processing batches:  17%|█▋        | 13/75 [00:14<01:06,  1.06s/it]

Updated embeddings for 100 papers in batch


Processing batches:  19%|█▊        | 14/75 [00:15<01:03,  1.05s/it]

Updated embeddings for 100 papers in batch


Processing batches:  20%|██        | 15/75 [00:16<01:01,  1.02s/it]

Updated embeddings for 100 papers in batch


Processing batches:  21%|██▏       | 16/75 [00:16<00:59,  1.00s/it]

Updated embeddings for 100 papers in batch


Processing batches:  23%|██▎       | 17/75 [00:17<00:57,  1.01it/s]

Updated embeddings for 100 papers in batch


Processing batches:  24%|██▍       | 18/75 [00:18<00:56,  1.00it/s]

Updated embeddings for 100 papers in batch


Processing batches:  25%|██▌       | 19/75 [00:19<00:55,  1.00it/s]

Updated embeddings for 100 papers in batch


Processing batches:  27%|██▋       | 20/75 [00:20<00:54,  1.00it/s]

Updated embeddings for 100 papers in batch


Processing batches:  28%|██▊       | 21/75 [00:21<00:54,  1.01s/it]

Updated embeddings for 100 papers in batch


Processing batches:  29%|██▉       | 22/75 [00:23<00:53,  1.01s/it]

Updated embeddings for 100 papers in batch


Processing batches:  31%|███       | 23/75 [00:24<00:53,  1.04s/it]

Updated embeddings for 100 papers in batch


Processing batches:  32%|███▏      | 24/75 [00:25<00:52,  1.02s/it]

Updated embeddings for 100 papers in batch


Processing batches:  33%|███▎      | 25/75 [00:26<00:50,  1.01s/it]

Updated embeddings for 100 papers in batch


Processing batches:  35%|███▍      | 26/75 [00:27<00:50,  1.04s/it]

Updated embeddings for 100 papers in batch


Processing batches:  36%|███▌      | 27/75 [00:28<00:50,  1.06s/it]

Updated embeddings for 100 papers in batch


Processing batches:  37%|███▋      | 28/75 [00:29<00:49,  1.06s/it]

Updated embeddings for 100 papers in batch


Processing batches:  39%|███▊      | 29/75 [00:30<00:50,  1.09s/it]

Updated embeddings for 100 papers in batch


Processing batches:  40%|████      | 30/75 [00:31<00:48,  1.07s/it]

Updated embeddings for 100 papers in batch


Processing batches:  41%|████▏     | 31/75 [00:32<00:45,  1.04s/it]

Updated embeddings for 100 papers in batch


Processing batches:  43%|████▎     | 32/75 [00:33<00:44,  1.03s/it]

Updated embeddings for 100 papers in batch


Processing batches:  44%|████▍     | 33/75 [00:34<00:42,  1.01s/it]

Updated embeddings for 100 papers in batch


Processing batches:  45%|████▌     | 34/75 [00:36<00:52,  1.27s/it]

Updated embeddings for 100 papers in batch


Processing batches:  47%|████▋     | 35/75 [00:39<01:18,  1.96s/it]

Updated embeddings for 100 papers in batch


Processing batches:  48%|████▊     | 36/75 [00:43<01:37,  2.50s/it]

Updated embeddings for 100 papers in batch


Processing batches:  49%|████▉     | 37/75 [00:47<01:48,  2.86s/it]

Updated embeddings for 100 papers in batch


Processing batches:  51%|█████     | 38/75 [00:51<01:54,  3.09s/it]

Updated embeddings for 100 papers in batch


Processing batches:  52%|█████▏    | 39/75 [00:54<01:55,  3.22s/it]

Updated embeddings for 100 papers in batch


Processing batches:  53%|█████▎    | 40/75 [00:58<01:56,  3.33s/it]

Updated embeddings for 100 papers in batch


Processing batches:  55%|█████▍    | 41/75 [01:01<01:55,  3.40s/it]

Updated embeddings for 100 papers in batch


Processing batches:  56%|█████▌    | 42/75 [01:05<01:53,  3.43s/it]

Updated embeddings for 100 papers in batch


Processing batches:  57%|█████▋    | 43/75 [01:08<01:49,  3.43s/it]

Updated embeddings for 100 papers in batch


Processing batches:  59%|█████▊    | 44/75 [01:12<01:47,  3.47s/it]

Updated embeddings for 100 papers in batch


Processing batches:  60%|██████    | 45/75 [01:15<01:43,  3.45s/it]

Updated embeddings for 100 papers in batch


Processing batches:  61%|██████▏   | 46/75 [01:18<01:37,  3.36s/it]

Updated embeddings for 100 papers in batch


Processing batches:  63%|██████▎   | 47/75 [01:21<01:32,  3.31s/it]

Updated embeddings for 100 papers in batch


Processing batches:  64%|██████▍   | 48/75 [01:25<01:29,  3.32s/it]

Updated embeddings for 100 papers in batch


Processing batches:  65%|██████▌   | 49/75 [01:28<01:28,  3.39s/it]

Updated embeddings for 100 papers in batch


Processing batches:  67%|██████▋   | 50/75 [01:32<01:26,  3.45s/it]

Updated embeddings for 100 papers in batch


Processing batches:  68%|██████▊   | 51/75 [01:36<01:25,  3.55s/it]

Updated embeddings for 100 papers in batch


Processing batches:  69%|██████▉   | 52/75 [01:39<01:20,  3.50s/it]

Updated embeddings for 100 papers in batch


Processing batches:  71%|███████   | 53/75 [01:42<01:16,  3.46s/it]

Updated embeddings for 100 papers in batch


Processing batches:  72%|███████▏  | 54/75 [01:46<01:11,  3.39s/it]

Updated embeddings for 100 papers in batch


Processing batches:  73%|███████▎  | 55/75 [01:49<01:08,  3.40s/it]

Updated embeddings for 100 papers in batch


Processing batches:  75%|███████▍  | 56/75 [01:53<01:04,  3.41s/it]

Updated embeddings for 100 papers in batch


Processing batches:  76%|███████▌  | 57/75 [01:56<01:00,  3.36s/it]

Updated embeddings for 100 papers in batch


Processing batches:  77%|███████▋  | 58/75 [01:59<00:57,  3.36s/it]

Updated embeddings for 100 papers in batch


Processing batches:  79%|███████▊  | 59/75 [02:02<00:53,  3.32s/it]

Updated embeddings for 100 papers in batch


Processing batches:  80%|████████  | 60/75 [02:06<00:50,  3.35s/it]

Updated embeddings for 100 papers in batch


Processing batches:  81%|████████▏ | 61/75 [02:09<00:47,  3.37s/it]

Updated embeddings for 100 papers in batch


Processing batches:  83%|████████▎ | 62/75 [02:13<00:43,  3.36s/it]

Updated embeddings for 100 papers in batch


Processing batches:  84%|████████▍ | 63/75 [02:16<00:40,  3.41s/it]

Updated embeddings for 100 papers in batch


Processing batches:  85%|████████▌ | 64/75 [02:20<00:37,  3.42s/it]

Updated embeddings for 100 papers in batch


Processing batches:  87%|████████▋ | 65/75 [02:23<00:34,  3.47s/it]

Updated embeddings for 100 papers in batch


Processing batches:  88%|████████▊ | 66/75 [02:27<00:31,  3.47s/it]

Updated embeddings for 100 papers in batch


Processing batches:  89%|████████▉ | 67/75 [02:30<00:28,  3.52s/it]

Updated embeddings for 100 papers in batch


Processing batches:  91%|█████████ | 68/75 [02:34<00:24,  3.47s/it]

Updated embeddings for 100 papers in batch


Processing batches:  92%|█████████▏| 69/75 [02:37<00:20,  3.46s/it]

Updated embeddings for 100 papers in batch


Processing batches:  93%|█████████▎| 70/75 [02:41<00:17,  3.49s/it]

Updated embeddings for 100 papers in batch


Processing batches:  95%|█████████▍| 71/75 [02:44<00:14,  3.59s/it]

Updated embeddings for 100 papers in batch


Processing batches:  96%|█████████▌| 72/75 [02:48<00:10,  3.58s/it]

Updated embeddings for 100 papers in batch


Processing batches:  97%|█████████▋| 73/75 [02:51<00:07,  3.53s/it]

Updated embeddings for 100 papers in batch


Processing batches:  99%|█████████▊| 74/75 [02:55<00:03,  3.51s/it]

Updated embeddings for 100 papers in batch


Processing batches: 100%|██████████| 75/75 [02:57<00:00,  2.37s/it]

Updated embeddings for 63 papers in batch
Embedding process completed



