In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import random

def get_norms(path, sheet_name, header):
    """Load one worksheet and return norm names paired with a combined BOQ text.

    The sheet is read with ``pd.read_excel``. Rows with a missing value in any
    of the required columns are dropped, and the four descriptive columns are
    joined with single spaces into ``"Combined BOQ Description"``.

    Args:
        path: Workbook location, passed straight to ``pd.read_excel``.
        sheet_name: Worksheet to read, passed straight to ``pd.read_excel``.
        header: Header row selector, passed straight to ``pd.read_excel``.

    Returns:
        A new DataFrame with the columns ``"Norm Name"`` and
        ``"Combined BOQ Description"``. Surviving rows keep the index labels
        they had in the sheet. A sheet with no complete rows yields an empty
        DataFrame with those two columns.

    Raises:
        KeyError: If the sheet lacks any of the required columns.
    """
    text_columns = ["BOQ Description", "Item Name", "Specification", "SubSpec"]
    required_columns = ["Norm Name"] + text_columns

    sheet = pd.read_excel(path, sheet_name=sheet_name, header=header)

    missing = [column for column in required_columns if column not in sheet.columns]
    if missing:
        raise KeyError(f"Sheet {sheet_name!r} is missing required column(s): {missing}")

    # Keep only rows where every required column is populated
    complete_rows = sheet[required_columns].dropna(subset=required_columns)

    # Column-wise string concatenation instead of a row-wise apply: it also
    # works when no rows survive, where apply(axis=1) hands back a DataFrame
    # that cannot be assigned to a single column.
    as_text = [complete_rows[column].astype(str) for column in text_columns]
    combined = as_text[0].str.cat(as_text[1:], sep=" ")

    # Build a fresh frame rather than adding a column to a filtered slice,
    # which is what triggers pandas' SettingWithCopyWarning.
    return pd.DataFrame({
        "Norm Name": complete_rows["Norm Name"],
        "Combined BOQ Description": combined,
    })

# Load the norm/BOQ pairs from every training sheet and stack them into one DataFrame
workbook_path = "data For training.xlsx"
# Sheet names are kept exactly as written, including the trailing space in "data "
sheet_names = ["Project 1", "data ", "Project 2", "Project 3", "Project 4"]
combined_df = pd.concat(
    [get_norms(workbook_path, sheet_name=sheet, header=0) for sheet in sheet_names],
    ignore_index=True,
)

# Seed Python's RNG so the negative sampling below is repeatable between runs
random.seed(42)

# Cast norm names to str so non-text cells (e.g. numeric codes) can still be tokenized
norm_names = combined_df["Norm Name"].astype(str).tolist()
boq_descriptions = combined_df["Combined BOQ Description"].tolist()

# Positive pairs: every norm with its own BOQ description (target similarity 1.0)
train_examples = [
    InputExample(texts=[norm, description], label=1.0)
    for norm, description in zip(norm_names, boq_descriptions)
]

# All descriptions that are a correct match for each norm. A negative must avoid
# every one of them, not just the current row's description: a norm usually
# appears on several rows, and pairing it with another of its own descriptions
# under label 0.0 would contradict the positive pairs.
matching_descriptions = {}
for norm, description in zip(norm_names, boq_descriptions):
    matching_descriptions.setdefault(norm, set()).add(description)

# Negative pairs: each norm with a random description belonging to a different norm.
# Sampling is retried a bounded number of times so an unlucky draw does not drop
# the negative, while a dataset with a single norm cannot loop forever.
max_sampling_attempts = 20
negative_examples = []
for norm in norm_names:
    for _ in range(max_sampling_attempts):
        candidate = random.choice(boq_descriptions)
        if candidate not in matching_descriptions[norm]:
            negative_examples.append(InputExample(texts=[norm, candidate], label=0.0))
            break

# Combine positive and negative examples
train_examples.extend(negative_examples)

if not train_examples:
    raise ValueError("No training examples were built; check the workbook sheets and required columns.")

# Load a pre-trained sentence transformer model
model_name = 'distilbert-base-nli-stsb-mean-tokens'
model = SentenceTransformer(model_name)

# Shuffle so each batch mixes positives and negatives; the list above holds all
# positives first and all negatives last.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Define the loss function (CosineSimilarityLoss regresses cosine similarity onto the labels)
train_loss = losses.CosineSimilarityLoss(model)

# Fine-tune the model.
# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'Dataset' is not defined". That looks like sentence-transformers
# v3 running fit() without the `datasets` package installed -- confirm, and
# install `datasets` (and `accelerate`) in the environment if so.
num_epochs = 20
output_dir = './trained_model'
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=100,  # Ramp the learning rate up gradually at the start
    output_path=output_dir,  # Where to save the model
    use_amp=False  # Automatic mixed precision stays disabled
)

# Save the trained model
model.save(output_dir)

print(f"Model training complete and saved at '{output_dir}'.")


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.05k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NameError: name 'Dataset' is not defined

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Load the fine-tuned model from the directory the training cell saved it to
model = SentenceTransformer('./trained_model')

# Re-load the same sheets that were used for training so the candidate norms
# are the ones the model was fine-tuned on
workbook_path = "data For training.xlsx"
# Sheet names are kept exactly as written, including the trailing space in "data "
sheet_names = ["Project 1", "data ", "Project 2", "Project 3", "Project 4"]
combined_df = pd.concat(
    [get_norms(workbook_path, sheet_name=sheet, header=0) for sheet in sheet_names],
    ignore_index=True,
)

# Encode each distinct norm once. The same norm name appears on many rows, and
# duplicates produce identical embeddings, so dropping them (first occurrence
# kept, order preserved) leaves the best match unchanged while avoiding
# redundant encoding work.
norms = combined_df["Norm Name"].astype(str).drop_duplicates().tolist()
if not norms:
    raise ValueError("No norms were loaded; check the workbook sheets and required columns.")
norm_embeddings = model.encode(norms, convert_to_tensor=True)

def find_most_similar_norm(boq_description):
    """Return the norm closest to ``boq_description`` and its cosine similarity.

    The description is embedded with the module-level ``model`` and compared
    against the module-level ``norm_embeddings``; the winning position is then
    looked up in the module-level ``norms`` list.

    Args:
        boq_description: The BOQ text to match against the known norms.

    Returns:
        A ``(norm, score)`` tuple: the entry of ``norms`` with the highest
        cosine similarity, and that similarity as a plain Python number.
    """
    query_embedding = model.encode([boq_description], convert_to_tensor=True)

    # Only one query is encoded, so only the first row of the similarity
    # matrix is relevant; take it once and work on that row.
    scores = util.pytorch_cos_sim(query_embedding, norm_embeddings)[0]

    best_idx = scores.argmax().item()
    return norms[best_idx], scores[best_idx].item()

# Sample BOQ text used to sanity-check the fine-tuned model
boq_description = "floor Tile Works in Terrace & Balcony Tiles "

# Look up the closest norm, then report it together with its cosine score
most_similar_norm, similarity_score = find_most_similar_norm(boq_description)
print(
    f"The most similar norm for '{boq_description}' is '{most_similar_norm}' "
    f"with a similarity score of {similarity_score:.4f}"
)


The most similar norm for 'floor Tile Works in Terrace & Balcony Tiles ' is 'Tile skirting works in 1:4 cement sand mortar' with a similarity score of 0.9150
