In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
"""This model is suitable for general-purpose sentence embeddings 
and it can extend to paragraphs"""

  from .autonotebook import tqdm as notebook_tqdm





In [4]:
import pandas as pd
import os

file_path = '../Data/merged_dataset.csv'

if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("File loaded successfully.")
    # print(df.head())
else:
    print(f"File not found: {file_path}")

df.info()

File loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88127 entries, 0 to 88126
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   text1         88127 non-null  object 
 1   text2         21626 non-null  object 
 2   is_duplicate  21627 non-null  float64
dtypes: float64(1), object(2)
memory usage: 2.0+ MB


In [29]:
df.iloc[0:20]

Unnamed: 0,text1,text2,is_duplicate
0,My son got this toy for his birthday. The kids...,,
1,A person I work with highly recommended it. He...,,
2,This DVD is real old school and cheezeball. Ba...,,
3,Mr. Spong has attempted to reduce God to base ...,,
4,I am the fourth review and the third to compla...,,
5,I think that the whole aspect of hitting the r...,,
6,"I own all the cd's, been to 3 excellent concer...",,
7,Excellent first and only effort from this band...,,
8,"Sean proves at least one point in this book, a...",,
9,The first two volumes can be rather slow in pa...,,


In [34]:
"""Stores the embeddings in a new column dataframe, that has columns 'embeddings' and 'no' 
is the row number in the original dataframe"""

"""Verify the final embeddings df have the same number of rows as the original df"""
from tqdm import tqdm

target_df = pd.DataFrame(columns=['embeddings', 'no'])

embeddings_list = []
no_list = []


for row in tqdm(df.itertuples(), desc="Generating embeddings", total=len(df)):
    text1_embedding = model.encode(row.text1)
    embeddings_list.append(text1_embedding)
    no_list.append(row.Index)

    if not pd.isnull(row.text2) and row.text2.strip() != "":
        text2_embedding = model.encode(row.text2)
        embeddings_list.append(text2_embedding)
        no_list.append(row.Index)

target_df = pd.DataFrame({'embeddings': embeddings_list, 'no': no_list})

"""We can save using either .pkl or .parquet, pkl is easier to use in pandas, parquet is more efficient in storage"""
output_path = '../Data/embeddings.pkl'
target_df.to_pickle(output_path)

Generating embeddings: 100%|██████████| 88127/88127 [22:34<00:00, 65.07it/s]


In [5]:
# Install InstructorEmbedding if not already installed
from InstructorEmbedding import INSTRUCTOR
import pandas as pd
from tqdm import tqdm

# Load the instructor model with compatibility fix
try:
    instructor_model = INSTRUCTOR('hkunlp/instructor-large')
except TypeError as e:
    # Alternative approach if there are compatibility issues
    print(f"Using alternative initialization due to error: {e}")
    try:
        instructor_model = INSTRUCTOR('hkunlp/instructor-large', cache_folder=None)
    except:
        print("Fallback: Using sentence-transformers directly")
        from sentence_transformers import SentenceTransformer
        instructor_model = SentenceTransformer('hkunlp/instructor-large')

# Create a new dataframe for instructor embeddings
instructor_df = pd.DataFrame(columns=['embeddings', 'no'])

instructor_embeddings_list = []
instructor_no_list = []

# Define instruction for the embedding task
instruction = "Represent the text for similarity comparison:"

for row in tqdm(df.itertuples(), desc="Generating Instructor embeddings", total=len(df)):
    # Encode text1 with instruction
    text1_instructor_embedding = instructor_model.encode([[instruction, row.text1]])
    instructor_embeddings_list.append(text1_instructor_embedding[0])  # Get the first (and only) embedding
    instructor_no_list.append(row.Index)
    
    # Encode text2 if it exists and is not empty
    if not pd.isnull(row.text2) and row.text2.strip() != "":
        text2_instructor_embedding = instructor_model.encode([[instruction, row.text2]])
        instructor_embeddings_list.append(text2_instructor_embedding[0])
        instructor_no_list.append(row.Index)

# Create the instructor embeddings dataframe
instructor_df = pd.DataFrame({'embeddings': instructor_embeddings_list, 'no': instructor_no_list})

# Save the instructor embeddings
instructor_output_path = '../Data/instructor_embeddings.pkl'
instructor_df.to_pickle(instructor_output_path)

print(f"Instructor embeddings saved to {instructor_output_path}")
print(f"Shape of instructor embeddings dataframe: {instructor_df.shape}")
print(f"Shape of original dataframe: {df.shape}")

Using alternative initialization due to error: INSTRUCTOR._load_sbert_model() got an unexpected keyword argument 'token'
Fallback: Using sentence-transformers directly


Generating Instructor embeddings: 100%|██████████| 88127/88127 [2:53:47<00:00,  8.45it/s]      


Instructor embeddings saved to ../Data/instructor_embeddings.pkl
Shape of instructor embeddings dataframe: (109753, 2)
Shape of original dataframe: (88127, 3)
