<a href="https://colab.research.google.com/github/Inzamam1234/PlagiAI_A-Multi_Document_Authenticity_Detection_System/blob/main/Build_Semantic_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================================
# CELL 1: Setup and Installation
# ============================================================================
!pip install -q sentence-transformers faiss-cpu pdfminer.six nltk

import torch
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m105.2 MB/s[0m eta [36m0:00:00[0m
[?25hGPU Available: True
GPU Device: Tesla T4


In [None]:
# ============================================================================
# CELL 2: Import Libraries
# ============================================================================
import os
import json
import pickle
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from pdfminer.high_level import extract_text
import nltk
from nltk.tokenize import sent_tokenize
from tqdm.auto import tqdm

# Download NLTK data
# NOTE(review): presumably these are the tokenizer resources that
# sent_tokenize (imported above) loads at call time — confirm against the
# installed NLTK version. quiet=True suppresses the download log.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True) # Download the punkt_tab resource
print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [None]:
# ============================================================================
# CELL 3: Upload Reference Documents
# ============================================================================
"""
Upload your reference corpus:
- Research papers (PDFs)
- Text files
- Create a folder structure or upload directly
"""
import shutil

from google.colab import files

# Folder the uploaded corpus is stored in and that later cells read from.
REFERENCE_DIR = '/content/reference_papers'

# Create upload directory
os.makedirs(REFERENCE_DIR, exist_ok=True)

# Upload files
print("📤 Please upload your reference papers (PDF/TXT files)")
print("   Recommended: 10-100+ papers for good coverage")
print()
uploaded = files.upload()

# Move uploaded files to reference directory.
# Done with shutil instead of a shell `mv` so that file names containing
# quotes, `$`, backticks or braces are treated literally. The destination is a
# full file path so an existing file of the same name is replaced, as `mv` did.
for filename in uploaded.keys():
    shutil.move(filename, os.path.join(REFERENCE_DIR, filename))

print(f"\n✓ Uploaded {len(uploaded)} files")

# Alternative: Mount Google Drive with papers
# Uncomment these lines if papers are in Google Drive. They are placed after
# the default REFERENCE_DIR assignment so the override is not overwritten:
# from google.colab import drive
# drive.mount('/content/drive')
# REFERENCE_DIR = '/content/drive/MyDrive/reference_papers'

# List uploaded files
print(f"\nFiles in {REFERENCE_DIR}:")
for f in os.listdir(REFERENCE_DIR):
    size = os.path.getsize(os.path.join(REFERENCE_DIR, f)) / 1024
    print(f"  - {f} ({size:.1f} KB)")

📤 Please upload your reference papers (PDF/TXT files)
   Recommended: 10-100+ papers for good coverage



Saving EJ1172284.pdf to EJ1172284.pdf

✓ Uploaded 1 files

Files in /content/reference_papers:
  - EJ1172284.pdf (370.4 KB)


In [None]:
# ============================================================================
# CELL 4: Document Extraction Functions
# ============================================================================
def extract_text_from_pdf(pdf_path):
    """Return the stripped text of the PDF at ``pdf_path``.

    Any exception raised while extracting is printed and swallowed; in that
    case an empty string is returned so the caller can skip the file.
    """
    extracted = ""
    try:
        extracted = extract_text(pdf_path).strip()
    except Exception as e:
        print(f"⚠️ Error extracting {pdf_path}: {e}")
    return extracted

def extract_text_from_txt(txt_path):
    """Return the stripped contents of the text file at ``txt_path``.

    The file is decoded as UTF-8 first. If that raises UnicodeDecodeError it
    is read again as Latin-1. Any other failure (in either attempt) is printed
    and an empty string is returned.
    """
    def _read_stripped(encoding):
        # Read the whole file with the given encoding and trim outer whitespace.
        with open(txt_path, 'r', encoding=encoding) as f:
            return f.read().strip()

    try:
        try:
            return _read_stripped('utf-8')
        except UnicodeDecodeError:
            # Try with different encoding
            return _read_stripped('latin-1')
    except Exception as e:
        print(f"⚠️ Error reading {txt_path}: {e}")
        return ""

def clean_text(text):
    """Normalise extracted text.

    Runs of whitespace are collapsed to a single space. Then every run of
    characters that is not a word character, whitespace, or one of
    . , ! ? ; : - ( ) [ ] " ' is deleted. The result is stripped.
    """
    import re

    # Remove excessive whitespace
    collapsed = re.sub(r'\s+', ' ', text)
    # Remove special characters but keep punctuation
    kept = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)\[\]\"\']+', '', collapsed)
    return kept.strip()

def split_into_sentences(text, min_length=20):
    """Tokenize ``text`` into sentences and drop unwanted fragments.

    Each sentence from ``sent_tokenize`` is stripped; it is kept only if it
    has at least ``min_length`` characters and is not entirely upper-case
    (which filters very short or header-like lines).
    """
    stripped = (raw.strip() for raw in sent_tokenize(text))
    return [
        s for s in stripped
        if len(s) >= min_length and not s.isupper()
    ]

# Visible confirmation that the helper definitions in this cell were executed.
print("✓ Extraction functions defined")

✓ Extraction functions defined


In [None]:
# ============================================================================
# CELL 5: Process Reference Corpus
# ============================================================================
print("\n" + "="*60)
print("PROCESSING REFERENCE DOCUMENTS")
print("="*60 + "\n")

# reference_documents: one dict per usable file (doc_id, filename, sentences,
# num_sentences, preview). failed_files: names of files that yielded too little
# text or no usable sentences.
reference_documents = []
document_id = 0
failed_files = []

# os.listdir returns entries in arbitrary order; sorting makes the doc_id
# assignment reproducible from run to run.
for filename in tqdm(sorted(os.listdir(REFERENCE_DIR)), desc="Processing files"):
    filepath = os.path.join(REFERENCE_DIR, filename)

    # Skip directories
    if os.path.isdir(filepath):
        continue

    # Extract text based on file type
    if filename.lower().endswith('.pdf'):
        text = extract_text_from_pdf(filepath)
    elif filename.lower().endswith('.txt'):
        text = extract_text_from_txt(filepath)
    else:
        print(f"⚠️ Skipping unsupported file: {filename}")
        continue

    # Extraction failed (helpers return "") or the file is too short to be useful
    if not text or len(text) < 100:
        failed_files.append(filename)
        continue

    # Clean and split
    text = clean_text(text)
    sentences = split_into_sentences(text)

    # A file whose text produced no usable sentences is reported as failed
    # instead of being dropped silently.
    if not sentences:
        failed_files.append(filename)
        continue

    reference_documents.append({
        'doc_id': document_id,
        'filename': filename,
        'sentences': sentences,
        'num_sentences': len(sentences),
        'preview': text[:200] + '...'
    })
    document_id += 1

print(f"\n{'='*60}")
print("PROCESSING RESULTS")
print(f"{'='*60}")
print(f"✓ Successfully processed: {len(reference_documents)} documents")
total_sentences = sum(doc['num_sentences'] for doc in reference_documents)
print(f"✓ Total sentences in corpus: {total_sentences}")
# Only compute the average when there is at least one document; otherwise the
# division would raise ZeroDivisionError before the explicit check below runs.
if reference_documents:
    print(f"✓ Average sentences per document: {total_sentences/len(reference_documents):.1f}")

if failed_files:
    print(f"\n⚠️ Failed to process {len(failed_files)} files:")
    for f in failed_files[:5]:  # Show first 5
        print(f"   - {f}")
    if len(failed_files) > 5:
        print(f"   ... and {len(failed_files)-5} more")

# Every stored document has at least one sentence, so a zero total means the
# corpus is empty and nothing downstream can work.
if total_sentences == 0:
    raise ValueError("❌ No sentences found! Please check your reference documents.")

# Show sample
print(f"\nSample document:")
print(f"  File: {reference_documents[0]['filename']}")
print(f"  Sentences: {reference_documents[0]['num_sentences']}")
print(f"  Preview: {reference_documents[0]['preview']}")


PROCESSING REFERENCE DOCUMENTS



Processing files:   0%|          | 0/1 [00:00<?, ?it/s]


PROCESSING RESULTS
✓ Successfully processed: 1 documents
✓ Total sentences in corpus: 307
✓ Average sentences per document: 307.0

Sample document:
  File: EJ1172284.pdf
  Sentences: 307
  Preview: The EUROCALL Review, Volume 25, No. 2, September 2017 Research paper A look at advanced learners use of mobile devices for English language study: Insights from interview data Mariusz Kruk University ...


In [None]:
# ============================================================================
# CELL 6: Initialize Sentence-BERT Model
# ============================================================================
# Sentence-BERT checkpoint loaded below and used by later cells for encoding.
# Alternative models:
# 'all-mpnet-base-v2'  # Better quality (768 dims), slower
# 'paraphrase-multilingual-MiniLM-L12-v2'  # For multiple languages
MODEL_NAME = 'all-MiniLM-L6-v2'  # Fast and efficient (384 dims)

print(f"\n{'='*60}")
print(f"LOADING SENTENCE-BERT MODEL")
print(f"{'='*60}\n")
print(f"Model: {MODEL_NAME}")

model = SentenceTransformer(MODEL_NAME)

# Stay on CPU (with a warning) when no CUDA device is visible; otherwise move
# the model to the GPU and report which device is in use.
if not torch.cuda.is_available():
    print("⚠️ Running on CPU (this will be slower)")
else:
    model = model.to('cuda')
    print(f"✓ Model moved to GPU: {torch.cuda.get_device_name(0)}")

print(f"✓ Model loaded successfully")
print(f"   Embedding dimension: {model.get_sentence_embedding_dimension()}")


LOADING SENTENCE-BERT MODEL

Model: all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Model moved to GPU: Tesla T4
✓ Model loaded successfully
   Embedding dimension: 384


In [None]:
# ============================================================================
# CELL 7: Build FAISS Index
# ============================================================================
print(f"\n{'='*60}")
print("BUILDING FAISS INDEX")
print(f"{'='*60}\n")

# Collect all sentences and metadata.
# Flatten the corpus into one record per sentence; position i in
# all_sentences, metadata and (later) the index all refer to the same sentence.
sentence_records = [
    (doc['doc_id'], doc['filename'], sent_idx, sentence)
    for doc in reference_documents
    for sent_idx, sentence in enumerate(doc['sentences'])
]
all_sentences = [record[3] for record in sentence_records]
metadata = [
    {
        'doc_id': doc_id,
        'filename': filename,
        'sentence_idx': sent_idx
    }
    for doc_id, filename, sent_idx, _ in sentence_records
]

print(f"📊 Total sentences to encode: {len(all_sentences)}")
print(f"📦 Starting encoding (this may take a few minutes)...\n")

# Encode in batches for efficiency
batch_size = 64
embeddings_list = []
batch_starts = range(0, len(all_sentences), batch_size)

for start in tqdm(batch_starts, desc="Encoding batches"):
    embeddings_list.append(
        model.encode(
            all_sentences[start:start + batch_size],
            convert_to_numpy=True,
            show_progress_bar=False,
            batch_size=batch_size
        )
    )

# Stack all embeddings into a single (num_sentences, dimension) array
embeddings = np.vstack(embeddings_list)
print(f"\n✓ Encoding complete!")
print(f"   Embeddings shape: {embeddings.shape}")
print(f"   Memory usage: {embeddings.nbytes / (1024**2):.1f} MB")

# Normalize embeddings for cosine similarity (normalize_L2 works in place)
print(f"\n🔧 Normalizing embeddings for cosine similarity...")
faiss.normalize_L2(embeddings)

# Build FAISS index
print(f"🔧 Building FAISS index...")
dimension = embeddings.shape[1]
# Inner product = cosine similarity after normalization
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)  # Add vectors to index

print(f"\n✓ FAISS index built successfully!")
print(f"   Index type: IndexFlatIP (exact search)")
print(f"   Vectors in index: {index.ntotal}")
print(f"   Dimension: {dimension}")


BUILDING FAISS INDEX

📊 Total sentences to encode: 307
📦 Starting encoding (this may take a few minutes)...



Encoding batches:   0%|          | 0/5 [00:00<?, ?it/s]


✓ Encoding complete!
   Embeddings shape: (307, 384)
   Memory usage: 0.4 MB

🔧 Normalizing embeddings for cosine similarity...
🔧 Building FAISS index...

✓ FAISS index built successfully!
   Index type: IndexFlatIP (exact search)
   Vectors in index: 307
   Dimension: 384


In [None]:
# ============================================================================
# CELL 8: Test the Index
# ============================================================================
print(f"\n{'='*60}")
print("TESTING INDEX WITH SAMPLE QUERIES")
print(f"{'='*60}\n")

test_queries = [
    "Machine learning has revolutionized data analysis.",
    "Neural networks are powerful computational models.",
    "This research presents a novel approach to the problem."
]

# Search for top 3 matches, but never ask for more neighbours than the index
# holds: FAISS pads missing results with index -1, and all_sentences[-1] would
# then silently show the last sentence as if it were a match.
k = min(3, index.ntotal)

for query_idx, test_query in enumerate(test_queries, 1):
    print(f"Query {query_idx}: {test_query}")

    # Encode query and normalize it the same way as the indexed vectors so the
    # inner product returned by the index is a cosine similarity.
    query_embedding = model.encode([test_query], convert_to_numpy=True)
    faiss.normalize_L2(query_embedding)

    similarities, indices = index.search(query_embedding, k)

    print("Top matches:")
    for rank, (sim, idx) in enumerate(zip(similarities[0], indices[0]), 1):
        # Defensive: skip any padded "no result" slot.
        if idx < 0:
            continue
        matched_sentence = all_sentences[idx]
        meta = metadata[idx]
        print(f"  {rank}. Similarity: {sim:.3f}")
        print(f"     Source: {meta['filename']}")
        print(f"     Text: {matched_sentence[:80]}...")
    print()


TESTING INDEX WITH SAMPLE QUERIES

Query 1: Machine learning has revolutionized data analysis.
Top matches:
  1. Similarity: 0.370
     Source: EJ1172284.pdf
     Text: It should also be noted that the obtained data were analyzed quantitatively....
  2. Similarity: 0.370
     Source: EJ1172284.pdf
     Text: The analysis started with partial transcription of the important parts of the da...
  3. Similarity: 0.361
     Source: EJ1172284.pdf
     Text: The gathered data were subjected to qualitative and quantitative analysis....

Query 2: Neural networks are powerful computational models.
Top matches:
  1. Similarity: 0.393
     Source: EJ1172284.pdf
     Text: self-directed learning)....
  2. Similarity: 0.322
     Source: EJ1172284.pdf
     Text: At the close of this section, a few words are in order on the notion of autonomo...
  3. Similarity: 0.295
     Source: EJ1172284.pdf
     Text: As stated by Benson and Chik (2010), the latest generations of new technologies,...

Query 3: Thi

In [None]:
# ============================================================================
# CELL 9: Save Index and Metadata
# ============================================================================
OUTPUT_DIR = './semantic_index'
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"{'='*60}")
print("SAVING INDEX FILES")
print(f"{'='*60}\n")

# Destination of every artifact written by this cell.
index_path = f"{OUTPUT_DIR}/faiss_index.bin"
sentences_path = f"{OUTPUT_DIR}/reference_sentences.pkl"
metadata_path = f"{OUTPUT_DIR}/reference_metadata.pkl"
docs_path = f"{OUTPUT_DIR}/documents_info.json"
config_path = f"{OUTPUT_DIR}/config.json"

BYTES_PER_MB = 1024**2

# Save FAISS index
faiss.write_index(index, index_path)
index_size = os.path.getsize(index_path) / BYTES_PER_MB
print(f"✓ Saved FAISS index ({index_size:.1f} MB)")

# Save sentences (list position matches the vector position in the index)
with open(sentences_path, 'wb') as f:
    pickle.dump(all_sentences, f)
sentences_size = os.path.getsize(sentences_path) / BYTES_PER_MB
print(f"✓ Saved reference sentences ({sentences_size:.1f} MB)")

# Save metadata (one entry per sentence, same ordering as the sentences)
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)
metadata_size = os.path.getsize(metadata_path) / BYTES_PER_MB
print(f"✓ Saved metadata ({metadata_size:.1f} MB)")

# Save document info (for reference); this size is reported in KB
with open(docs_path, 'w') as f:
    json.dump(reference_documents, f, indent=2)
docs_size = os.path.getsize(docs_path) / 1024
print(f"✓ Saved document info ({docs_size:.1f} KB)")

# Save configuration describing how the index was built
config = {
    'model_name': MODEL_NAME,
    'total_documents': len(reference_documents),
    'total_sentences': len(all_sentences),
    'embedding_dimension': dimension,
    'index_size': index.ntotal,
    'creation_date': str(np.datetime64('now')),
    'gpu_used': torch.cuda.is_available()
}
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)
print(f"✓ Saved configuration")

# docs_size is in KB, so convert it to MB before summing; the config file is
# not included in this total.
total_size = index_size + sentences_size + metadata_size + (docs_size/1024)
print(f"\nTotal size: {total_size:.1f} MB")

SAVING INDEX FILES

✓ Saved FAISS index (0.4 MB)
✓ Saved reference sentences (0.0 MB)
✓ Saved metadata (0.0 MB)
✓ Saved document info (42.9 KB)
✓ Saved configuration

Total size: 0.5 MB


In [None]:
# ============================================================================
# CELL 10: Create Summary Report
# ============================================================================
print(f"\n{'='*60}")
print("INDEX SUMMARY REPORT")
print(f"{'='*60}\n")

# Corpus-level counts taken from the objects built in the earlier cells.
print(f"📚 Corpus Statistics:")
print(f"   Documents processed: {len(reference_documents)}")
print(f"   Total sentences: {len(all_sentences)}")
print(f"   Avg sentences per doc: {len(all_sentences)/len(reference_documents):.1f}")
print(f"   Embedding dimension: {dimension}")
print()

# The conditional applies to the whole concatenation: either
# "GPU - <device name>" or just "CPU".
print(f"🔧 Model Information:")
print(f"   Model: {MODEL_NAME}")
print(f"   Device: {'GPU - ' + torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print()

print(f"💾 Output Files:")
print(f"   Location: {OUTPUT_DIR}/")
print(f"   Total size: {total_size:.1f} MB")
print()

# Rank documents by sentence count (descending) on a copy, leaving
# reference_documents in its original order.
print(f"📊 Top 5 Largest Documents:")
sorted_docs = list(reference_documents)
sorted_docs.sort(key=lambda x: x['num_sentences'], reverse=True)
for i, doc in zip(range(1, 6), sorted_docs):
    print(f"   {i}. {doc['filename']}: {doc['num_sentences']} sentences")


INDEX SUMMARY REPORT

📚 Corpus Statistics:
   Documents processed: 1
   Total sentences: 307
   Avg sentences per doc: 307.0
   Embedding dimension: 384

🔧 Model Information:
   Model: all-MiniLM-L6-v2
   Device: GPU - Tesla T4

💾 Output Files:
   Location: ./semantic_index/
   Total size: 0.5 MB

📊 Top 5 Largest Documents:
   1. EJ1172284.pdf: 307 sentences


In [None]:
# ============================================================================
# CELL 11: Download Index Files
# ============================================================================
import shutil

from google.colab import files

print(f"\n{'='*60}")
print("PREPARING DOWNLOAD")
print(f"{'='*60}\n")

# Create zip file.
# shutil.make_archive is used instead of the `zip` shell command: it writes a
# fresh archive on every run (`zip -r` updates an existing archive, so files
# from an earlier run would linger) and avoids interpolating OUTPUT_DIR into a
# shell command. Archiving relative to the parent directory keeps the
# top-level folder inside the zip, matching the layout printed below.
print("📦 Creating zip archive...")
output_dir_abs = os.path.abspath(OUTPUT_DIR)
zip_path = shutil.make_archive(
    'semantic_index',
    'zip',
    root_dir=os.path.dirname(output_dir_abs),
    base_dir=os.path.basename(output_dir_abs),
)

zip_size = os.path.getsize(zip_path) / (1024**2)
print(f"✓ Archive created: semantic_index.zip ({zip_size:.1f} MB)")

# Download: triggers a browser download of the archive from the Colab runtime
print(f"\n📥 Starting download...")
files.download(zip_path)

print(f"\n{'='*60}")
print("✅ INDEX CREATION COMPLETE!")
print(f"{'='*60}\n")

print("📥 Download Instructions:")
print("   1. semantic_index.zip should be downloading")
print("   2. Extract in your VSCode project")
print("   3. Place in: ./models/semantic_index/")
print()

print("📂 Expected structure after extraction:")
print("   ./models/semantic_index/")
print("   ├── faiss_index.bin")
print("   ├── reference_sentences.pkl")
print("   ├── reference_metadata.pkl")
print("   ├── documents_info.json")
print("   └── config.json")
print()

print("🚀 Next Steps:")
print("   1. Extract semantic_index.zip")
print("   2. Move to your VSCode project: ./models/semantic_index/")
print("   3. Run your application: python app.py")
print()

print("✨ Your plagiarism detection index is ready to use!")


PREPARING DOWNLOAD

📦 Creating zip archive...
✓ Archive created: semantic_index.zip (0.4 MB)

📥 Starting download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ INDEX CREATION COMPLETE!

📥 Download Instructions:
   1. semantic_index.zip should be downloading
   2. Extract in your VSCode project
   3. Place in: ./models/semantic_index/

📂 Expected structure after extraction:
   ./models/semantic_index/
   ├── faiss_index.bin
   ├── reference_sentences.pkl
   ├── reference_metadata.pkl
   ├── documents_info.json
   └── config.json

🚀 Next Steps:
   1. Extract semantic_index.zip
   2. Move to your VSCode project: ./models/semantic_index/
   3. Run your application: python app.py

✨ Your plagiarism detection index is ready to use!
