In [None]:
# --- 1. Import the tools ---
from transformers import AutoTokenizer, AutoModel

# Reaching this line means the transformers import above did not raise.
print("Import successful!")

# --- 2. Load the pre-trained BERT model ---
# Tokenizer and encoder are both fetched under the same checkpoint name,
# so the name is written once and reused for the two loads.
bert_checkpoint = "bert-base-uncased"
print("Loading BERT model... (This will download ~420MB the first time)...")

tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = AutoModel.from_pretrained(bert_checkpoint)

# One print call; sep="\n" produces the same three output lines.
print(
    "---",
    "Success! BERT model and tokenizer are loaded into memory.",
    "Your AI development environment is ready!",
    sep="\n",
)

Import successful!
Loading BERT model... (This will download ~420MB the first time)...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

---
Success! BERT model and tokenizer are loaded into memory.
Your AI development environment is ready!


In [2]:
# --- Cell 2: The Embedding Function ---
# We need a function to turn a sentence into a single vector (embedding)
import torch # We need this to handle the model's output

def get_sentence_embedding(sentence):
    """Embed `sentence` as the final-layer vector at BERT's first token position.

    Uses the notebook-level `tokenizer` and `model` objects, which must
    already be loaded before this function is called.

    Args:
        sentence: The text to embed.

    Returns:
        A 1-D torch tensor: `last_hidden_state[0, 0, :]`, i.e. the vector of
        the first token ([CLS]) of the first sequence in the batch.
    """
    # Text -> PyTorch tensors ('pt'), with truncation and padding enabled.
    encoded = tokenizer(sentence, truncation=True, padding=True, return_tensors='pt')

    # Pure inference: no gradients are needed, so do not track them.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # hidden_states is [batch_size, num_tokens, hidden_size]. Index 0 on the
    # batch axis picks the first sequence; index 0 on the token axis picks
    # its first token, which this notebook uses as the sentence summary.
    return hidden_states[0, 0, :]

# Confirmation printed once the cell has run and the helper is defined.
print(
    "Helper function 'get_sentence_embedding' is defined.",
    "This function will now turn any sentence into a 768-dimension vector.",
    sep="\n",
)

Helper function 'get_sentence_embedding' is defined.
This function will now turn any sentence into a 768-dimension vector.


In [3]:
# --- Cell 3: Run the Similarity Test ---
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np # We need numpy to reshape the vectors for the similarity function

# --- Our Test Sentences ---
# Case 1: Paraphrased (Should be HIGH similarity)
text_a = "The AI-Driven Intelligent Exam Integrity System is a project that helps to make exams more fair."
text_b = "A project called the AI-Driven Intelligent Exam Integrity System makes exams more honest."

# Case 2: Different Topics (Should be LOW similarity)
text_c = "The student submitted the final report on Tuesday."
text_d = "The cat slept on the warm keyboard."

# --- Generate Embeddings ---
# Each call returns a 1-D torch tensor (see get_sentence_embedding).
print("Generating embeddings for all sentences...")
emb_a = get_sentence_embedding(text_a)
emb_b = get_sentence_embedding(text_b)
emb_c = get_sentence_embedding(text_c)
emb_d = get_sentence_embedding(text_d)

# --- Calculate Similarity ---
# scikit-learn operates on NumPy arrays and expects 2-D input of shape
# (n_samples, n_features). Convert every torch tensor explicitly rather than
# relying on implicit array conversion inside scikit-learn: detach() drops any
# autograd history, cpu() makes sure the data lives in host memory, numpy()
# exposes it as an ndarray, and reshape(1, -1) turns it into a single row.
print("Calculating similarity...")
row_a, row_b, row_c, row_d = (
    emb.detach().cpu().numpy().reshape(1, -1)
    for emb in (emb_a, emb_b, emb_c, emb_d)
)

# Test Case 1
# cosine_similarity returns a 1x1 matrix here; [0][0] extracts the scalar.
sim_paraphrased = cosine_similarity(row_a, row_b)[0][0]

# Test Case 2
sim_different = cosine_similarity(row_c, row_d)[0][0]


# NOTE(review): in the recorded run both scores came out high
# (0.9370 vs 0.8092), so these raw [CLS] vectors separate the two cases poorly.
print("\n--- Results ---")
print(f"Similarity (Paraphrased): {sim_paraphrased:.4f}")
print(f"Similarity (Different Topics): {sim_different:.4f}")

Generating embeddings for all sentences...
Calculating similarity...

--- Results ---
Similarity (Paraphrased): 0.9370
Similarity (Different Topics): 0.8092


In [4]:
# --- Cell 4: Install the SBERT library ---
%pip install -U sentence-transformers

print("Sentence-Transformers library installed.")

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-12.0.0-cp313-cp313-win_amd64.whl.metadata (9.0 kB)
Downloading sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Downloading pillow-12.0.0-cp313-cp313-win_amd64.whl (7.0 MB)
   ---------------------------------------- 0.0/7.0 MB ? eta -:--:--
   -- ------------------------------------- 0.5/7.0 MB 4.9 MB/s eta 0:00:02
   ---------- ----------------------------- 1.8/7.0 MB 5.0 MB/s eta 0:00:02
   ----------------- ---------------------- 3.1/7.0 MB 5.5 MB/s eta 0:00:01
   ----------------------- ---------------- 4.2/7.0 MB 5.6 MB/s eta 0:00:01
   ------------------------------- -------- 5.5/7.0 MB 5.5 MB/s eta 0:00:01
   -------------------------------------- - 6.8/7.0 MB 5.7 MB/s eta 0:00:01
   ---------------------------------------- 7.0/7.0 MB 5.7 MB/s  0:00:01
Installing collected packages: Pillow, sentence


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# --- Cell 5: Re-run test with a specialized SBERT model ---
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load the specialized SBERT model
#    (the first use downloads the weights, ~80MB)
print("Loading specialized SBERT model (all-MiniLM-L6-v2)...")
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Test sentences (same text as in the plain-BERT experiment) ---
# Pair 1: paraphrases of each other
text_a = "The AI-Driven Intelligent Exam Integrity System is a project that helps to make exams more fair."
text_b = "A project called the AI-Driven Intelligent Exam Integrity System makes exams more honest."

# Pair 2: unrelated topics
text_c = "The student submitted the final report on Tuesday."
text_d = "The cat slept on the warm keyboard."

# --- 2. Embed all four sentences with a single encode() call ---
# Row i of `embeddings` is the vector for the i-th sentence in the batch.
sentence_batch = [text_a, text_b, text_c, text_d]
embeddings = sbert_model.encode(sentence_batch)

# --- 3. Cosine similarity for each pair ---
# Slicing with i:i+1 keeps each operand 2-D (exactly one row), which is the
# input shape cosine_similarity expects; [0, 0] reads the single score.
sim_paraphrased = cosine_similarity(embeddings[0:1], embeddings[1:2])[0, 0]
sim_different = cosine_similarity(embeddings[2:3], embeddings[3:4])[0, 0]

print("\n--- Results from Specialized SBERT Model ---")
print(
    f"Similarity (Paraphrased): {sim_paraphrased:.4f}",
    f"Similarity (Different Topics): {sim_different:.4f}",
    sep="\n",
)

Loading specialized SBERT model (all-MiniLM-L6-v2)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- Results from Specialized SBERT Model ---
Similarity (Paraphrased): 0.9421
Similarity (Different Topics): 0.0760


Testing

In [6]:
# --- Cell 6: Find Best Match from a "Database" ---
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# --- 1. Load the Model (only if this session does not have it yet) ---
# `sbert_model` is otherwise created in Cell 5. Loading it on demand lets this
# cell run on a fresh kernel too, and makes the message below truthful.
if "sbert_model" not in globals():
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model is loaded.")


# --- 2. Define our Data ---

# This is the new submission we're checking
student_answer = "This AI-driven intelligent system will help make exams more fair and honest."

# This is our "database" of known sources
source_database = [
    "The cat slept on the warm keyboard.", # Source 0 (Different)
    "An AI-driven intelligent system is a project that helps to make exams more fair.", # Source 1 (Paraphrased)
    "The student submitted the final report on Tuesday." # Source 2 (Different)
]

# --- 3. Encode ALL texts ---
# Encode the student answer (a single string in, a single vector out)
student_embedding = sbert_model.encode(student_answer)

# Encode all the documents in our database (one vector per source)
source_embeddings = sbert_model.encode(source_database)


# --- 4. Calculate Similarity ---
# Compare the single student embedding against ALL the source embeddings.
# cosine_similarity returns a 2-D array with one row per first-argument sample,
# e.g. [[score_vs_0, score_vs_1, score_vs_2]]; we keep that single row.
similarity_scores = cosine_similarity(
    [student_embedding],  # Wrapped in a list to form a one-row 2-D input
    source_embeddings     # Already 2-D: one row per source document
)[0]

# --- 5. Find the Best Match ---
# int(...) turns NumPy's integer scalar into a plain Python index.
best_match_index = int(np.argmax(similarity_scores))
best_match_score = similarity_scores[best_match_index]
best_match_source = source_database[best_match_index]

print("\n--- Plagiarism Check Results ---")
print(f"Student Answer: '{student_answer}'")
print(f"Similarity Scores (vs Sources 0, 1, 2): {similarity_scores}")
print("---")
print(f"Highest Score: {best_match_score:.4f}")
print(f"Source Document (Index {best_match_index}): '{best_match_source}'")

Model is loaded.

--- Plagiarism Check Results ---
Student Answer: 'This AI-driven intelligent system will help make exams more fair and honest.'
Similarity Scores (vs Sources 0, 1, 2): [0.00295392 0.86295974 0.16966324]
---
Highest Score: 0.8630
Source Document (Index 1): 'An AI-driven intelligent system is a project that helps to make exams more fair.'
