spaCy is like the Swiss Army knife of NLP, while Transformers is more akin to a sledgehammer.

spaCy is fast and lightweight. Transformers (e.g., Sentence Transformers) let you use state-of-the-art models, but the trade-off is usually slower inference and higher memory usage.

Another important distinction is that spaCy has tools for more linguistics-focused tasks, such as dependency parsing and annotation, while Transformers has tools for tasks that extend beyond NLP.

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Limitation of fuzzy string matching: it compares characters/tokens, not semantic meaning,
# so a high ratio can still be a very bad match -- exact wording is key in TORs.
str1 = 'Oracle             database'
str2 = 'Oracle database'
# Score the same pair with both scorers, in this order:
#   1. token_sort_ratio -- token based; order and extra whitespace matter little as long as the words are the same
#   2. ratio            -- compares the raw strings; order matters and whitespace also affects the output
for scorer in (fuzz.token_sort_ratio, fuzz.ratio):
    display(scorer(str1, str2))

100

71

In [3]:
# test using sentence models -> pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util

# Pre-trained sentence-embedding model used for this comparison.
model = SentenceTransformer('all-MiniLM-L6-v2')

# The pair under test: near-identical wording, but the second sentence is negated
# and ends with a deliberately misspelled word.
sentence1 = 'I want to really eat some ice cream at the store'
sentence2 = 'I want to really not eat some ice cream at the storesssssssssss'

# Encode each sentence on its own (one encode() call per sentence) into a tensor embedding.
embedding1, embedding2 = (
    model.encode(text, convert_to_tensor=True)
    for text in (sentence1, sentence2)
)

# Cosine similarity of the two embeddings; .item() unwraps the single value for printing.
cosine_score = util.pytorch_cos_sim(embedding1, embedding2)
print("Cosine similarity:", cosine_score.item())

Cosine similarity: 0.8878823518753052


In [None]:
# Create the spaCy nlp object.
# English pipelines considered: en_core_web_md (medium), en_core_web_lg (large),
# en_core_web_trf (transformer-based; the one loaded below).
# To remove a previously installed model: pip uninstall en-core-web-lg
# Alternative pipeline: nlp = spacy.load("en_core_web_lg")

import spacy
nlp = spacy.load("en_core_web_trf")
print(nlp.meta)  # dump the pipeline metadata (name, version, components, labels, vectors, ...)


  from .autonotebook import tqdm as notebook_tqdm


{'lang': 'en', 'name': 'core_web_trf', 'version': '3.8.0', 'description': "English transformer pipeline (Transformer(name='roberta-base', piece_encoder='byte-bpe', stride=104, type='roberta', width=768, window=144, vocab_size=50265)). Components: transformer, tagger, parser, ner, attribute_ruler, lemmatizer.", 'author': 'Explosion', 'email': 'contact@explosion.ai', 'url': 'https://explosion.ai', 'license': 'MIT', 'spacy_version': '>=3.8.0,<3.9.0', 'spacy_git_version': '5010fcbd3', 'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None, 'mode': 'default'}, 'labels': {'transformer': [], 'tagger': ['$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '``'], 'parser': ['ROOT', 'acl', 'acomp', 'advcl', 'advmod', 'agent', 'am

In [None]:
# Statement accuracy rate: compare sentence-transformers vs. spaCy vs. fuzzywuzzy

In [9]:
import pandas as pd # for data manipulation
from sentence_transformers import SentenceTransformer, util

# Load both workbooks: the input file (Main) and the reference file (Compare).
df_main, df_compare = (
    pd.read_excel('Excel_file/Main.xlsx'),
    pd.read_excel('Excel_file/Compare.xlsx'),
)

# Multilingual paraphrase model, chosen because it is Thai compatible.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Embed every statement from Main.xlsx in a single batched encode() call.
main_statements = df_main['Statement'].tolist()
main_embeddings = model.encode(
    main_statements,
    show_progress_bar=True,
    convert_to_tensor=True,
)
print(type(main_embeddings))
# Last expression of the cell: the notebook displays the embedding tensor's shape.
main_embeddings.shape

Batches: 100%|██████████| 11/11 [00:01<00:00,  6.43it/s]

<class 'torch.Tensor'>





torch.Size([341, 384])

In [4]:
import pickle # for caching main embeddings

# Testing pickle: serialize a small list to disk.
# '.pkl' is only a naming convention -- any file name works.
student_names = ['Kay', 'Bob', 'Elena', 'Jane', 'Kyle']
with open('student_file.pkl', 'wb') as f:  # 'wb': pickle writes bytes, so open in binary mode
    pickle.dump(student_names, f)  # serialize the list
# No explicit f.close(): the with-block already closed the file on exit.

In [5]:
# Read the pickled list back from disk.
# NOTE: pickle.load can execute arbitrary code while deserializing -- only load
# files this notebook wrote itself, never untrusted input.
with open('student_file.pkl', 'rb') as f:  # 'rb': pickle data is binary, not text
    list_name = pickle.load(f)  # deserialize the list
# No explicit f.close(): the with-block already closed the file on exit.
print(list_name)

['Kay', 'Bob', 'Elena', 'Jane', 'Kyle']


In [None]:
# selecting excel test
import pandas as pd
import re

def excel_cell_to_indices(cell_str):
    """
    Converts an Excel cell address (e.g., "A5") to zero-based (row, column) indices.

    Column letters are case-insensitive, surrounding whitespace is ignored and
    absolute-reference markers (e.g., "$A$5") are accepted.

    Args:
        cell_str (str): A single cell address in A1 notation.

    Returns:
        (int, int): Zero-based (row_idx, col_idx), e.g. "A1" -> (0, 0), "AA1" -> (0, 26).

    Raises:
        ValueError: If cell_str is not exactly one cell address (e.g. "A5:B6",
            "A5x", "5A") or if its row number is 0 (Excel rows start at 1).
    """
    # fullmatch instead of match: the whole string must be one address, so
    # trailing junk such as "A5:B6" is rejected rather than silently read as "A5".
    match = re.fullmatch(r"\s*\$?([A-Za-z]+)\$?([0-9]+)\s*", cell_str)
    if not match:
        raise ValueError("Invalid cell format: " + cell_str)
    col_str, row_str = match.groups()

    row_number = int(row_str)
    if row_number < 1:
        # Row 0 would turn into index -1, i.e. a negative index that silently
        # addresses the wrong row instead of failing.
        raise ValueError("Invalid cell format: " + cell_str)

    # Column letters form a bijective base-26 number: A=1 ... Z=26, AA=27, ...
    col_number = 0
    for char in col_str.upper():
        col_number = col_number * 26 + (ord(char) - ord('A') + 1)

    # Both Excel coordinates are one-based; shift to zero-based indices.
    return row_number - 1, col_number - 1

def slice_excel_by_cells(df, num_start, num_end, stmt_start, stmt_end):
    """
    Pulls the TOR number range and the TOR statement range out of a sheet.

    Each range is described by two Excel cell addresses (first cell, last cell).
    The column comes from the first cell of each pair; from the last cell only
    the row is used -- its column letter is ignored.

    Args:
        df (pd.DataFrame): DataFrame read from the Excel file.
        num_start (str): First cell of the TOR comply numbers (e.g., "A5").
        num_end (str): Last cell of the TOR comply numbers (e.g., "A23").
        stmt_start (str): First cell of the TOR comply statements (e.g., "B5").
        stmt_end (str): Last cell of the TOR comply statements (e.g., "B23").

    Returns:
        (pd.Series, pd.Series): (numbers, statements); both include their last row.
    """
    # Resolve all four addresses first (same order as the arguments), then slice.
    (num_top, num_col), (num_bottom, _) = map(excel_cell_to_indices, (num_start, num_end))
    (stmt_top, stmt_col), (stmt_bottom, _) = map(excel_cell_to_indices, (stmt_start, stmt_end))

    # Positional slices are end-exclusive, so extend each by one to keep the last row.
    number_rows = slice(num_top, num_bottom + 1)
    statement_rows = slice(stmt_top, stmt_bottom + 1)

    numbers = df.iloc[number_rows, num_col]
    statements = df.iloc[statement_rows, stmt_col]
    return numbers, statements

# Example usage:
# Load the sheet without a header row (adjust header settings if needed).
df = pd.read_excel("Excel_file/Unformat_test.xlsx", header=None)

# Pick the number range (A6..A23) and the statement range (B6..B23) via Excel cell notation.
tor_numbers, tor_statements = slice_excel_by_cells(df, "A6", "A23", "B6", "B23")

# Assemble both series into one DataFrame with descriptive column names.
# NOTE(review): the printed result shows 4.1 right after 4.9, so clause "4.10"
# appears to be read as the float 4.1 -- confirm how that cell is stored in the workbook.
column_names = ("TOR comply number", "TOR comply statement")
result_df = pd.DataFrame(dict(zip(column_names, (tor_numbers, tor_statements))))

print(result_df)


   TOR comply number                               TOR comply statement
5                4.2  สามารถเลือกทำงานบนระบบปฏิบัติการ Windows หรือ ...
6                4.3  เป็นฐานข้อมูลที่มีระบบ Lock ข้อมูลในระดับ Row ...
7                4.4  มีคุณสมบัติในการทำ Multi-Version Read Consiste...
8                4.5  สามารถทำการเก็บข้อมูลและแสดงผลได้ทั้งภาษาไทยแล...
9                4.6  มีการทำงานแบบ Machine Learning เพื่อช่วยเพิ่มป...
10               4.7  มีการทำงานแบบ Query Optimization และสามารถทำงา...
11               4.8  สามารถรองรับการจัดเก็บข้อมูลในรูปแบบ JSON โดยส...
12               4.9       สามารถทำงานในรูปแบบระบบฐานข้อมูลแบบ Graph ได
13               4.1  มีเครื่องมือรองรับในการจัดการระบบไฟล์สำหรับไฟล...
14               NaN  4.10.1 รองรับการช่วยกระจาย I/O ไปยังดิสก์ข้อมู...
15               NaN  4.10.2 รองรับการเพิ่มหรือลดจำนวน disk ได้โดยไม...
16               NaN  4.10.3 รองรับการจัดเรียงการกระจายของข้อมูลใหม่...
17               NaN  4.10.4 รองรับการ Mirror Resync ข้อมูลระหว่