In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 1) Read the CSV files
df1 = pd.read_csv("tree_species.csv")
df2 = pd.read_csv("merged_df1_df2.csv")

# 2) Extract the relevant columns into separate DataFrames
df1_names = df1[['Botanical Name']].copy()
df2_names = df2[['plant_name']].copy()

# 3) Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 4) Create embeddings for each botanical name in both DataFrames
df1_names['embedding'] = df1_names['Botanical Name'].apply(lambda x: model.encode(str(x)))
df2_names['embedding'] = df2_names['plant_name'].apply(lambda x: model.encode(str(x)))

# 5) Set a cosine similarity threshold for matching
threshold = 0.80

# 6) Find plants in df1 that do NOT have any similar match in df2 above the threshold
unmatched_plants = []

for i, row1 in df1_names.iterrows():
    emb1 = row1['embedding'].reshape(1, -1)
    match_found = False

    # Check against each plant in df2
    for j, row2 in df2_names.iterrows():
        emb2 = row2['embedding'].reshape(1, -1)
        sim = cosine_similarity(emb1, emb2)[0][0]
        if sim >= threshold:
            match_found = True
            break  # No need to check further if a match is found

    # If no match was found, add the plant to the unmatched list
    if not match_found:
        unmatched_plants.append(row1['Botanical Name'])

# 7) Print the list of unmatched plants
print("Plants in tree_species.csv not present in merged_df1_df2.csv (based on vector similarity):")
for plant in unmatched_plants:
    print(plant)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Plants in tree_species.csv not present in merged_df1_df2.csv (based on vector similarity):
Terminalia Chebula
Terminalia Arjuna
Terminalia Tomentosa
Hardwickia Binata
Dalbergia Sissoo
Albizia Procera
Putranjiva Roxburghii Wall
Spathodea Campanulata
Anogeissus Latifolia
Oroxylum Indicum
Senegalia Catechu
Putranjiva Roxburghii Wall
Saraca Asoca
Jacaranda Mimosifolia
Dalbergia Gardenia
Cestrum Nocturnum
Handroanthus Impetiginosus
Rauvolfia Serpentina


In [25]:
df2.head()

Unnamed: 0,plant_name,embedding,source
0,Aegle marmelos (L.) Corr.,[-4.91057150e-02 6.15545809e-02 -2.40750611e-...,df2 (intersection)
1,Alpinia galanga (L.) Willd.,[-2.07084678e-02 6.79903664e-03 -6.37864247e-...,df2 (intersection)
2,Amaranthus viridis L.,[-1.44480709e-02 4.78543751e-02 -7.27312267e-...,df2 (intersection)
3,Artocarpus heterophyllus Lam.,[ 1.08356848e-02 9.64791998e-02 -7.25910738e-...,df2 (intersection)
4,Azadirachta indica (L.) A.Juss.,[-3.44188362e-02 1.09954931e-01 -9.46124271e-...,df2 (intersection)


In [26]:
import pandas as pd

# Suppose df2 already exists in memory with columns: ['plant_name', 'embedding', 'source']

# 1) Drop unwanted columns
df2_final = df2.drop(['embedding', 'source'], axis=1)

# 2) List of new plant names to add
new_plants = [
    "Terminalia Chebula",
    "Terminalia Arjuna",
    "Terminalia Tomentosa",
    "Hardwickia Binata",
    "Dalbergia Sissoo",
    "Albizia Procera",
    "Putranjiva Roxburghii Wall",
    "Spathodea Campanulata",
    "Anogeissus Latifolia",
    "Oroxylum Indicum",
    "Senegalia Catechu",
    "Putranjiva Roxburghii Wall",
    "Saraca Asoca",
    "Jacaranda Mimosifolia",
    "Dalbergia Gardenia",
    "Cestrum Nocturnum",
    "Handroanthus Impetiginosus",
    "Rauvolfia Serpentina"
]

# 3) Create a new DataFrame with these additional names
df_new_plants = pd.DataFrame(new_plants, columns=["plant_name"])

# 4) Concatenate with the df2_final
df2_final = pd.concat([df2_final, df_new_plants], ignore_index=True)

# 5) Export to CSV
df2_final.to_csv("finalCSV_Major_Project.csv", index=False)

print("finalCSV_Major_Project.csv created successfully!")


finalCSV_Major_Project.csv created successfully!


In [27]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# 1) Read the CSV file
df = pd.read_csv("finalCSV_Major_Project.csv")

# 2) Load the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 3) Create embeddings for each plant_name
df['embedding'] = df['plant_name'].apply(lambda x: model.encode(str(x)))

# 4) Compare all pairs using cosine similarity
duplicates = []
threshold = 0.80  # Adjust as needed

for i in range(len(df) - 1):
    emb_i = df.loc[i, 'embedding'].reshape(1, -1)
    for j in range(i + 1, len(df)):
        emb_j = df.loc[j, 'embedding'].reshape(1, -1)
        sim = cosine_similarity(emb_i, emb_j)[0][0]
        if sim >= threshold:
            duplicates.append({
                "name1": df.loc[i, 'plant_name'],
                "name2": df.loc[j, 'plant_name'],
                "similarity": sim
            })

# 5) Check and print results
if len(duplicates) > 0:
    print("Potential duplicates found (similarity >= {}):".format(threshold))
    for d in duplicates:
        print(d)
else:
    print(0)


Potential duplicates found (similarity >= 0.8):
{'name1': 'Amaranthus viridis L.', 'name2': 'Amaranthus lividus L.', 'similarity': 0.8192979}
{'name1': 'Amaranthus viridis L.', 'name2': 'Amaranthus caudatus L.', 'similarity': 0.8715073}
{'name1': 'Amaranthus viridis L.', 'name2': 'Amaranthus spinosus L.', 'similarity': 0.8272556}
{'name1': 'Crotalaria juncea L.', 'name2': 'Crotalaria retusa L.', 'similarity': 0.8603959}
{'name1': 'Hibiscus rosa-sinensis L.', 'name2': 'Hibiscus surattensis L.', 'similarity': 0.8669882}
{'name1': 'Abelmoschus manihot (L.) Medik.', 'name2': 'Abelmoschus moschatus Medik.', 'similarity': 0.82750934}
{'name1': 'Acacia catechu (L.f.) Willd.', 'name2': 'Acacia farnesiana (L.) Willd.', 'similarity': 0.82852256}
{'name1': 'Acacia catechu (L.f.) Willd.', 'name2': 'Acacia leucophloea (Roxb.) Willd.', 'similarity': 0.8015326}
{'name1': 'Acacia catechu (L.f.) Willd.', 'name2': 'Acacia pennata (L.) Willd.', 'similarity': 0.85371786}
{'name1': 'Acacia catechu (L.f.) W

In [35]:
# Iterate over the duplicates and print only those with similarity > 0.9
for dup in duplicates:
    if dup["similarity"] > 0.9:
        print(f"Duplicate pair: {dup['name1']} and {dup['name2']} (Similarity: {dup['similarity']:.3f})")


Duplicate pair: Alhagi pseudoalhagi (Bieb.) Desv. and Alhagi pseudoalhagi (Bieb.) Desv. (Similarity: 1.000)
Duplicate pair: Cassia alata L. and Cassia alata L. (Similarity: 1.000)
Duplicate pair: Cordia myxa Roxb. and Cordia myxa Roxb. (non L.) (Similarity: 0.934)
Duplicate pair: Elaeagnus angustifolia L. and Elaeagnus latifolia L. (Similarity: 0.937)
Duplicate pair: Euphorbia nivulea Buch.-Ham. and Euphorbia nivulea Buch.-Ham. (Similarity: 1.000)
Duplicate pair: Garcinia cambogia (Gaertn.) Desr. and Garcinia morella (Gaertn.) Desr. (Similarity: 0.910)
Duplicate pair: Heliotropium bacciferum Forsk. and Heliotropium ovalifolium Forsk. (Similarity: 0.905)
Duplicate pair: Rumex acetosa L. and Rumex acetosella L. (Similarity: 0.902)
Duplicate pair: Putranjiva Roxburghii Wall and Putranjiva Roxburghii Wall (Similarity: 1.000)
