<a href="https://colab.research.google.com/github/Mwadz/Sematic-Text-Similarity/blob/main/16th_Sept_Semantic_text_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [32]:
! pip install datasets
! pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Libraries and STSB dataset

In [33]:
# Imports for the dataset-loading step
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from datasets import load_dataset


# Fetch the English STSB dataset and turn its train/test splits into DataFrames
stsb_dataset = load_dataset('stsb_multi_mt', 'en')
stsb_train, stsb_test = (
    pd.DataFrame(stsb_dataset[split_name]) for split_name in ('train', 'test')
)

# Sanity check: print both split shapes, then preview the test split
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()



  0%|          | 0/3 [00:00<?, ?it/s]

(5749, 3) (1379, 3)


Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


## Creating helper functions
* The first function pre-processes text: it lemmatizes and lowercases each word, and removes stop words and any non-alphabetic tokens (such as numbers).
* The second function takes in two columns of text embeddings and returns the row-wise cosine similarity between the two columns.

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy
# Load the spaCy "en_core_web_sm" pipeline once at module level;
# text_processing() below calls `nlp` on every sentence it receives.
nlp = spacy.load("en_core_web_sm")

def text_processing(sentence):
    """
    Reduce a sentence to a list of normalized words.

    The sentence is run through the module-level spaCy pipeline ``nlp``.
    Tokens that are not purely alphabetic, or that are stop words, are
    skipped; every remaining token contributes its lowercased lemma.

    Args:
      sentence: The sentence we want to process.

    Returns:
      A list of processed words
    """
    processed_words = []
    for token in nlp(sentence):
        # Skip numbers/punctuation (non-alphabetic) and stop words
        if not token.is_alpha or token.is_stop:
            continue
        processed_words.append(token.lemma_.lower())

    return processed_words


def cos_sim(sentence1_emb, sentence2_emb):
    """
    Row-wise cosine similarity between two columns of sentence embeddings.

    Only the matching rows are compared, so time and memory grow linearly with
    the number of rows. (Building the full pairwise similarity matrix and then
    reading its diagonal would need O(n^2) memory for the same result.)

    Args:
      sentence1_emb: sentence1 embedding column, array-like of shape
        (n_rows, n_dims)
      sentence2_emb: sentence2 embedding column, array-like of the same shape

    Returns:
      A 1-D float array with the row-wise cosine similarity between the two
      columns. For instance, if sentence1_emb=[a,b,c] and sentence2_emb=[x,y,z]
      then the result is
      [cosine_similarity(a,x), cosine_similarity(b,y), cosine_similarity(c,z)].
      A row whose embedding is all zeros on either side gets similarity 0.

    Raises:
      ValueError: if either input is not 2-D or the two shapes differ.
    """
    emb1 = np.asarray(sentence1_emb, dtype=float)
    emb2 = np.asarray(sentence2_emb, dtype=float)

    if emb1.ndim != 2 or emb2.ndim != 2:
        raise ValueError(
            "Expected 2-D embedding arrays, got shapes "
            f"{emb1.shape} and {emb2.shape}"
        )
    if emb1.shape != emb2.shape:
        raise ValueError(
            "Row-wise cosine similarity needs matching shapes, got "
            f"{emb1.shape} and {emb2.shape}"
        )

    # Dot product of row i of emb1 with row i of emb2, for every i
    dots = np.einsum("ij,ij->i", emb1, emb2)
    norm_products = np.linalg.norm(emb1, axis=1) * np.linalg.norm(emb2, axis=1)

    # Divide only where the norm product is non-zero; zero vectors keep the 0.0
    # pre-filled in `out` instead of producing NaN.
    return np.divide(dots, norm_products, out=np.zeros_like(dots), where=norm_products > 0)

# Data Setup

In [35]:
# Read the SBERT data CSV and discard its saved index column
data = pd.read_csv("/content/SBERT_data.csv").drop(columns=['Unnamed: 0'])

prompt = input("Enter prompt: ")

# Copy the prompt onto every row, switch to the sentence1/sentence2 naming,
# and force both text columns to plain strings.
data = (
    data
    .assign(prompt=prompt)
    .rename(columns={'target_text': 'sentence2', 'prompt': 'sentence1'})
    .astype({'sentence2': 'str', 'sentence1': 'str'})
)

data.head()

Enter prompt:  property name


Unnamed: 0,input_text,sentence2,sentence1
0,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,When true indicates the purpose of the extensi...,property name
1,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,Specifies the creditor organization type exemp...,property name
2,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,A free-form text field used to collect additio...,property name
3,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,Specifies the Loan Program that exempts the tr...,property name
4,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,A free-form text field used to collect additio...,property name


# Scoring each prompt–description pair

In [36]:
from sentence_transformers import CrossEncoder

# Cross-encoder used to score every (sentence1, sentence2) pair
XpathFinder = CrossEncoder("cross-encoder/stsb-roberta-base")

# One [sentence1, sentence2] pair per row of `data`
sentence_pairs = [
    [first, second]
    for first, second in zip(data['sentence1'], data['sentence2'])
]

data['SBERT CrossEncoder_Score'] = XpathFinder.predict(sentence_pairs, show_progress_bar = True)

Batches:   0%|          | 0/681 [00:00<?, ?it/s]

In [37]:
#@title Sort 
# Display rows from highest to lowest score; the result is only shown, `data` is not reassigned
data.sort_values('SBERT CrossEncoder_Score', ascending=False)


Unnamed: 0,input_text,sentence2,sentence1,SBERT CrossEncoder_Score
21158,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/ASSETS/...,An unparsed legal description of a parcel of r...,property name,0.724039
21155,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/ASSETS/...,An unparsed legal description of a parcel of r...,property name,0.715632
21160,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/COLLATE...,An unparsed legal description of a parcel of r...,property name,0.701064
21157,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/COLLATE...,An unparsed legal description of a parcel of r...,property name,0.699485
11725,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,The monetary amount of a charge an adjustment ...,property name,0.698021
...,...,...,...,...
20898,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,The number of months the trail plan was extend...,property name,0.051092
15790,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,The dollar amount of principal reduction as re...,property name,0.050820
14518,DOCUMENT/DEAL_SETS/DEAL_SET/DEALS/DEAL/LOANS/L...,How often described as number of times per yea...,property name,0.049390
91,DOCUMENT/DEAL_SETS/DEAL_SET/ACH/ACHPendingDraf...,Identifies the date on which automated draftin...,property name,0.043005


### Save the model

In [38]:
import pickle

filename = 'XpathFinder1.sav'

# Serialize the fitted cross-encoder to disk. The context manager closes the
# file handle (and flushes the write) even if pickling raises.
# NOTE: pickle files must only ever be loaded from a trusted source.
with open(filename, 'wb') as model_file:
    pickle.dump(XpathFinder, model_file)

# App

In [39]:
!pip install -q streamlit

In [48]:
%%writefile app.py
import io
import netrc
import pickle
import sys
import pandas as pd
import numpy as np
import streamlit as st
# let's import sentence transformer
import sentence_transformers
import torch
#######################################

st.markdown(
    f"""
<style>
    .reportview-container .main .block-container{{
        max-width: 90%;
        padding-top: 5rem;
        padding-right: 5rem;
        padding-left: 5rem;
        padding-bottom: 5rem;
    }}
    img{{
    	max-width:40%;
    	margin-bottom:40px;
    }}
</style>
""",
    unsafe_allow_html=True,
)

# # let's load the saved model
loaded_model = pickle.load(open('XpathFinder1.sav', 'rb'))
#loaded_model = pickle.load('XpathFinder1.sav', map_location='cpu')


#class CPU_Unpickler(pickle.Unpickler):
#    def find_class(self, module, name):
#        if module == 'torch.storage' and name == '_load_from_bytes':
#            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
#        else:
#            return super().find_class(module, name)
#

#loaded_model = CPU_Unpickler(open('XpathFinder1.sav', 'rb')).load()


# Containers
header_container = st.container()
mod_container = st.container()

# Header
with header_container:

    # different levels of text you can include in your app
    st.title("Xpath Finder App")


# model container
with mod_container:
    # collecting input from user
    prompt = st.text_input("Enter your description below ...")

    # Loading e data
    data = (pd.read_csv("/content/SBERT_data.csv")).drop(['Unnamed: 0'], axis = 1)

    data['prompt']= prompt
    data.rename(columns = {'target_text':'sentence2', 'prompt':'sentence1'}, inplace = True)
    data['sentence2'] = data['sentence2'].astype('str')
    data['sentence1']  = data['sentence1'].astype('str')

    # let's pass the input to the loaded_model with torch compiled with cuda
    if prompt:
        # let's get the result
        simscore = XpathFinder.predict([prompt])
        from sentence_transformers import CrossEncoder
        XpathFinder = CrossEncoder("cross-encoder/stsb-roberta-base")
        sentence_pairs = []
        for sentence1, sentence2 in zip(data['sentence1'],data['sentence2']):
          sentence_pairs.append([sentence1, sentence2])
        
        # sorting the df to get highest scoring xpath_container
        data['SBERT CrossEncoder_Score'] = XpathFinder.predict(sentence_pairs)
        most_acc = data.head(5)
        # predictions
        st.write("Highest Similarity score: ", simscore)
        st.text("Is this one of these the Xpath you're looking for?")
        st.write(st.write(most_acc["input_text"])) 
        

Overwriting app.py


In [41]:
!pip install pyngrok 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [42]:
from pyngrok import ngrok

In [43]:
import os
from getpass import getpass

# Never hardcode the ngrok auth token in the notebook: read it from the
# NGROK_AUTH_TOKEN environment variable, or prompt for it without echoing.
# The token that was previously committed in this cell is exposed in the
# repository history and should be revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)

In [44]:
!nohup streamlit run app.py --server.port 80 &
url = ngrok.connect(port = '80')
print(url)

nohup: appending output to 'nohup.out'




PyngrokNgrokHTTPError: ignored