In [2]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm





In [154]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [3]:
import pandas as pd
import os

file_path = '../Data/merged_dataset.csv'

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully.")
    # print(df.head())
else:
    print(f"File not found: {file_path}")

df.info()

File loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88124 entries, 0 to 88123
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   text1         88124 non-null  object 
 1   text2         21624 non-null  object 
 2   is_duplicate  21624 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.0+ MB


In [6]:
len(df)

88124

In [5]:
for index, row in df.iterrows():
    if pd.isnull(row.text2) and not pd.isnull(row.is_duplicate):
        print(row)

### Dataset modifications
Use 256 as the token limit, split all rows inside the datset into chunks less than 256 tokens (sentence-based)

we can use the same dataset for every model if it falls within the token limit

Max length:
- Sent-T5: 256
- InstructorEmbedding: 512
- e5-base-v2: 512
- bge-base-en-v1.5: 512


Instructor Embedding does matter when spacing is irregular, especially for tokens like `., !,..etc`, but as long as we only split based on empty spaces we should be fine

Since we can only modify `text1` and not `text2` without losing ground truth, we need to make sure all samples that have ground truth be less than 256 tokens

Violations:
- With all-miniLM-L6-V2: we have one violations at index 71032
    - Remove index 71032
- With Instructor Embeddings (this have the max length of 512 tokens, so it can easily work with the text we currently for all-miniLM-L6-V2):
    - No violations
- With e5-base-v2 (This model is less sensitive so different `., !` spacing won't affect it):
    - No violations
- with bge-base-en-v1.5 (Same goes for this model -- not as sensitive)
    - No violations

Next step create a seperate dataset for each model (✅):
- Sent-T5 ✅
- InstructorEmbedding ✅
- e5-base-v2 ✅
- bge-base-en-v1.5 ✅


In [None]:
ground_truth_df = df[(df['text2'].notnull()) & (df['is_duplicate'].notnull())]

21624

In [None]:
model = SentenceTransformer('BAAI/bge-base-en-v1.5')
tokenizer = model.tokenizer

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


### New dataframe creation

In [170]:
def split_string(s, tokenizer, verbose=False) -> list:
    """ Splits long text into smaller chunks based on sentence boundaries"""
    import nltk
    nltk.download('punkt', quiet=True)
    from nltk.tokenize import sent_tokenize
    sentences = sent_tokenize(s)
    
    cap = tokenizer.model_max_length
    chunks = []
    current_chunk = ""
    current_tokens = 0

    if verbose:
        total = 0
        print(f"Splitting string of length {len(tokenizer.tokenize(s))} into chunks with max {cap} tokens.")
    
    for sentence in sentences:
        tokenized = tokenizer.tokenize(sentence)
        if verbose:
            total += len(tokenized)
            print(f"Sentence length: {len(tokenized)}, Sequence: {sentence} Total so far: {total}")
        if current_tokens + len(tokenized) > cap:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_tokens = len(tokenized)
        else:
            current_chunk += " " + sentence if current_chunk else sentence
            current_tokens += len(tokenized)
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

def reformat_dataframe(df: pd.DataFrame, tokenizer, verbose=False) -> pd.DataFrame:
    """ Reformats the dataframe by splitting long texts into smaller chunks"""

    result_df = pd.DataFrame(columns=['text1', 'text2', 'is_duplicate'])
    
    for index, row in df.iterrows():
        if not pd.isnull(row.text2): 
            """Appends the full row"""
            result_df = pd.concat([result_df, pd.DataFrame([row])], ignore_index=True)
            continue
        text = row['text1']
        tokenized = tokenizer.tokenize(text)
        if len(tokenized) > tokenizer.model_max_length:
            if verbose:
                print(f"Index {index} text length {len(tokenized)} exceeds max length {tokenizer.model_max_length}. Splitting...")
            # split_string should return a list of text chunks instead
            chunks = split_string(text, tokenizer, verbose=verbose)
            if verbose:
                print(f"Split into {len(chunks)} chunks.")
            for chunk in chunks:
                new_row = {'text1': chunk, 'text2': None, 'is_duplicate': None}
                result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)
        else:
            result_df = pd.concat([result_df, pd.DataFrame([row])], ignore_index=True)
    
    return result_df

In [184]:
model_names = [
    'BAAI/bge-base-en-v1.5',
    'sentence-transformers/all-MiniLM-L6-v2',
    'hkunlp/instructor-base',
    'intfloat/e5-base-v2'
]

# resulting_dfs = {}

# for model_name in model_names:

#     reference_df = df.copy()

#     if model_name == 'sentence-transformers/all-MiniLM-L6-v2':
#         # remove the 71032 index row which causes issues with this model
#         reference_df = reference_df.drop(index=71032)
#         reference_df.reset_index(drop=True, inplace=True)

#     print(f"Processing for model: {model_name}")
#     model = SentenceTransformer(model_name)
#     tokenizer = model.tokenizer
#     reformatted_df = reformat_dataframe(reference_df, tokenizer, verbose=False)

#     resulting_dfs[model_name] = reformatted_df

#     output_file = f'../Data/{model_name.split("/")[-1]}_dataset.xlsx'
    
#     with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
#         reformatted_df.to_excel(writer, index=False, sheet_name='Sheet1')

#     print(f"Reformatted data saved to {output_file}")

In [2]:
import os
import pandas as pd

file_names = [
    'bge-base-en-v1.5_dataset.xlsx',
    'all-MiniLM-L6-v2_dataset.xlsx',
    'instructor-base_dataset.xlsx',
    'e5-base-v2_dataset.xlsx'
]

for index, file_name in enumerate(file_names):
    file_path = os.path.join('../Data/', file_name)
    if os.path.exists(file_path):
        df = pd.read_excel(file_path)
        print(f"Loaded {file_name} with {len(df)} rows.")
    else:
        print(f"File not found: {file_path}")
    

    count_text2 = 0
    count_null = 0
    # model = SentenceTransformer(model_names[index])
    # tokenizer = model.tokenizer

    # dropped = False

    for idx, row in df.iterrows():
        if pd.notnull(row.text2):
            count_text2 += 1
        if pd.notnull(row.is_duplicate):
            count_null += 1
    
    print(f"In file {file_name}, count of non-null text2: {count_text2}, count of non-null isduplicate: {count_null}")

    # if dropped:
    #     print(f"Dropped rows exceeding token limit in {file_name}. Updating file...")
    #     df.reset_index(drop=True, inplace=True)
    #     #update the excel file
    #     with pd.ExcelWriter(file_path + '-version2.xlsx', engine='xlsxwriter') as writer:
    #         df.to_excel(writer, index=False, sheet_name='Sheet1')

Loaded bge-base-en-v1.5_dataset.xlsx with 93630 rows.
In file bge-base-en-v1.5_dataset.xlsx, count of non-null text2: 21624, count of non-null isduplicate: 21624
Loaded all-MiniLM-L6-v2_dataset.xlsx with 101942 rows.
In file all-MiniLM-L6-v2_dataset.xlsx, count of non-null text2: 21623, count of non-null isduplicate: 21623
Loaded instructor-base_dataset.xlsx with 94621 rows.
In file instructor-base_dataset.xlsx, count of non-null text2: 21624, count of non-null isduplicate: 21624
Loaded e5-base-v2_dataset.xlsx with 93630 rows.
In file e5-base-v2_dataset.xlsx, count of non-null text2: 21624, count of non-null isduplicate: 21624


### Apply a text - sentence-based split using the tokenizer of the corresponding model

### All-minilm-L6-V2

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')
"""This model is suitable for general-purpose sentence embeddings 
and it can extend to paragraphs"""

tokenizer = model.tokenizer
max_length = tokenizer.model_max_length

In [None]:
"""Stores the embeddings in a new column dataframe, that has columns 'embeddings' and 'no' 
is the row number in the original dataframe"""


"""Verify the final embeddings df have the same number of rows as the original df"""
from tqdm import tqdm

target_df = pd.DataFrame(columns=['embeddings', 'no'])

embeddings_list = []
no_list = []


for row in tqdm(df.itertuples(), desc="Generating embeddings", total=len(df)):
    text1_embedding = model.encode(row.text1)
    embeddings_list.append(text1_embedding)
    no_list.append(row.Index)

    if not pd.isnull(row.text2) and row.text2.strip() != "":
        text2_embedding = model.encode(row.text2)
        embeddings_list.append(text2_embedding)
        no_list.append(row.Index)

target_df = pd.DataFrame({'embeddings': embeddings_list, 'no': no_list})

"""We can save using either .pkl or .parquet, pkl is easier to use in pandas, parquet is more efficient in storage"""
output_path = '../Data/embeddings.pkl'
target_df.to_pickle(output_path)

Generating embeddings: 100%|██████████| 88127/88127 [22:34<00:00, 65.07it/s]


In [5]:
# Install InstructorEmbedding if not already installed
from InstructorEmbedding import INSTRUCTOR
import pandas as pd
from tqdm import tqdm

# Load the instructor model with compatibility fix
try:
    instructor_model = INSTRUCTOR('hkunlp/instructor-large')
except TypeError as e:
    # Alternative approach if there are compatibility issues
    print(f"Using alternative initialization due to error: {e}")
    try:
        instructor_model = INSTRUCTOR('hkunlp/instructor-large', cache_folder=None)
    except:
        print("Fallback: Using sentence-transformers directly")
        from sentence_transformers import SentenceTransformer
        instructor_model = SentenceTransformer('hkunlp/instructor-large')

# Create a new dataframe for instructor embeddings
instructor_df = pd.DataFrame(columns=['embeddings', 'no'])

instructor_embeddings_list = []
instructor_no_list = []

# Define instruction for the embedding task
instruction = "Represent the text for similarity comparison:"

for row in tqdm(df.itertuples(), desc="Generating Instructor embeddings", total=len(df)):
    # Encode text1 with instruction
    text1_instructor_embedding = instructor_model.encode([[instruction, row.text1]])
    instructor_embeddings_list.append(text1_instructor_embedding[0])  # Get the first (and only) embedding
    instructor_no_list.append(row.Index)
    
    # Encode text2 if it exists and is not empty
    if not pd.isnull(row.text2) and row.text2.strip() != "":
        text2_instructor_embedding = instructor_model.encode([[instruction, row.text2]])
        instructor_embeddings_list.append(text2_instructor_embedding[0])
        instructor_no_list.append(row.Index)

# Create the instructor embeddings dataframe
instructor_df = pd.DataFrame({'embeddings': instructor_embeddings_list, 'no': instructor_no_list})

# Save the instructor embeddings
instructor_output_path = '../Data/instructor_embeddings.pkl'
instructor_df.to_pickle(instructor_output_path)

print(f"Instructor embeddings saved to {instructor_output_path}")
print(f"Shape of instructor embeddings dataframe: {instructor_df.shape}")
print(f"Shape of original dataframe: {df.shape}")

Using alternative initialization due to error: INSTRUCTOR._load_sbert_model() got an unexpected keyword argument 'token'
Fallback: Using sentence-transformers directly


Generating Instructor embeddings: 100%|██████████| 88127/88127 [2:53:47<00:00,  8.45it/s]      


Instructor embeddings saved to ../Data/instructor_embeddings.pkl
Shape of instructor embeddings dataframe: (109753, 2)
Shape of original dataframe: (88127, 3)
