In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from numpy import array

In [2]:
# Read the FAQ dump and keep only the two columns the rest of the notebook uses.
data = pd.read_csv("new.csv").loc[:, ["question", "answer"]]
data

Unnamed: 0,question,answer
0,crushes ice?,yes :) you can get nice slushies for sure ;)
1,Does the fan have feet? In other words how doe...,The base is broad enough that it can stay upri...
2,Can you use cold water to make the espresso???...,Yes. It won't be as robust or bitey as when us...
3,Does this work as an air purifier as well? We ...,Let me say this: When I use my rainbow to vacu...
4,What hand held air deflation pump should I buy...,I have a Quick Fill Electric Pump that I broug...
...,...,...
206502,Anyone else get one where the lid over the sha...,I have two and both are that way.
206503,Can this be used as a watermaker for a saltwat...,"No, salt is dissolved in the water, you have t..."
206504,"What are the overall dimensions, total hieght ...",17 inches high 38 inches long and 18 1/2 inche...
206505,can the latter be put on the other side,I don't think so.


In [3]:
from bs4 import BeautifulSoup
import string 
from nltk.stem.snowball import SnowballStemmer 

In [4]:
def lower_text(text):
    """Lower-case every whitespace-separated token of ``text``.

    Because the text is split on whitespace and re-joined with single spaces,
    leading/trailing whitespace is dropped and runs of whitespace collapse to
    one space.
    """
    lowered_tokens = map(str.lower, text.split())
    return " ".join(lowered_tokens)


def stemmer(text):
    """Stem each whitespace-separated token with the English Snowball stemmer.

    Returns the stems joined by single spaces.
    """
    snowball = SnowballStemmer(language='english')
    stems = []
    for token in text.split():
        stems.append(snowball.stem(token))
    return ' '.join(stems)

def remove_noise(text):
    """Strip HTML markup and square-bracketed spans from ``text``.

    The text is parsed with BeautifulSoup's "html.parser" and reduced to its
    text content; afterwards every ``[...]`` span (brackets included) is
    deleted.
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    # Raw string on purpose: in a normal literal "\[" and "\]" are invalid
    # escape sequences, which Python reports as a warning (SyntaxWarning on
    # 3.12+). The compiled pattern is the same as before.
    return re.sub(r"\[[^]]*\]", "", plain_text)


def remove_punc(text):
    """Delete ASCII punctuation and keep only purely alphabetic tokens.

    Each whitespace-separated token has every character of
    ``string.punctuation`` removed. Tokens that are not entirely alphabetic
    afterwards (numbers, mixed alphanumerics, tokens that became empty) are
    dropped. The survivors are joined with single spaces.
    """
    punct_pattern = re.compile("[%s]"%re.escape(string.punctuation))
    kept = []
    for raw_token in text.split():
        stripped = punct_pattern.sub("", raw_token)
        if stripped.isalpha():
            kept.append(stripped)
    return " ".join(kept)

# English stop words, loaded on first use by remove_stop() and then reused.
_ENGLISH_STOPWORDS = None


def remove_stop(text):
    """Drop NLTK English stop words from ``text``.

    The text is split on whitespace, tokens that exactly match a stop word are
    removed, and the rest are joined with single spaces. Matching is exact and
    case-sensitive, so a token with punctuation attached (e.g. "the,") is kept.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Load the corpus once instead of on every call (this function runs
        # once per row), and use a set so membership tests are O(1).
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return " ".join(
        word for word in text.split() if word not in _ENGLISH_STOPWORDS
    )

def preprocess_data(text):
    """Run the full text-cleaning pipeline on one string.

    Steps, in order: lower-casing, HTML/bracket noise removal, stop-word
    removal, punctuation removal, stemming. The order matters: stop words are
    removed before punctuation is stripped.
    """
    pipeline = (lower_text, remove_noise, remove_stop, remove_punc, stemmer)
    for step in pipeline:
        text = step(text)
    return text

# Overwrites the raw question text with its preprocessed form.
data["question"] = data["question"].map(preprocess_data)



In [5]:
# Inspect the preprocessed questions.
data["question"]

0                                                 crush ice
1                 fan feet word stay upright obvious pictur
2                        use cold water make espresso thank
3         work air purifi well recent shown rainbow work...
4                         hand held air deflat pump buy bag
                                ...                        
206502    anyon els get one lid shaker screen snap secur...
206503                            use watermak saltwat boat
206504               overal dimens total hieght width depth
206505                                      latter put side
206506                                    grate chocol tool
Name: question, Length: 206507, dtype: object

In [6]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used for both the FAQ corpus and the test questions.
model = SentenceTransformer('all-mpnet-base-v2')



  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# encode() is documented to take a string or a list of strings. Pass a plain
# list so the result does not depend on how a pandas Series resolves integer
# lookups against its index labels.
FAQs_embeddings = model.encode(data["question"].tolist())

In [8]:
# Hand-written probe questions; they go through the same preprocessing as the
# FAQ corpus before being embedded.
raw_test_questions = [
    'is stainless a good choice for me ?',
    "does the  Microwave Steamer have a recipe guide and an instruction booklet?",
    "We're concerned about the fabric not 'Breathing' in borderline weather -  Any comments?",
    "Is the trundle adjustable and sturdy in a lower height in addition to using it as a king?",
    '''Is the attachment with the blade made of metal or not?''',
    "Does this book have branding on the outside of it?",
    "Is there a lid on it? it's not mentioned in product details",
    "I want to know if the pockets have zippers or drops?",
    "Can you drink  from thermos or must you pour it into lid to drink?",
    "can i replace my product with another one ",
]
FAQs_test = pd.Series(raw_test_questions).apply(preprocess_data)

In [9]:
# Pass a plain list of strings (the documented input type of encode()) rather
# than a pandas Series, whose integer lookups go by index label.
FAQs_test_embeddings = model.encode(FAQs_test.tolist())

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# One row per test question, one column per FAQ question.
similarity_score = cosine_similarity(FAQs_test_embeddings, FAQs_embeddings)

# Best-match similarity for each test question. Deliberately not named `max`:
# rebinding the builtin would break any later max(...) call in this kernel.
best_scores = similarity_score.max(axis=1)

# NOTE: despite its name this is the mean top-1 cosine similarity of the test
# questions, not a classification accuracy (no ground-truth labels are used).
Accuracy = best_scores.mean()
Accuracy

0.8824389457702637

In [13]:
# Column position of the highest similarity score in each row.
index = similarity_score.argmax(axis=1)

In [88]:
# `index` comes from argmax, i.e. it holds positions, so fetch the answers
# positionally with .iloc instead of looking them up by index label. One
# vectorised lookup replaces the append loop; the result is still a list.
df2 = data["answer"].iloc[index].tolist()



In [89]:
# Single-column frame holding the preprocessed test questions.
df1 = FAQs_test.to_frame(name="FAQs_test")
df1

Unnamed: 0,FAQs_test
0,stainless good choic


In [90]:
# Rebinds df2 from the collected answers to a one-column DataFrame.
df2 = pd.DataFrame(df2, columns=["FAQs_Answer"])
df2

Unnamed: 0,FAQs_Answer
0,It is 18/10. I will also add that it is solidl...


In [91]:
# Place each test question next to its retrieved answer; the inner join keeps
# only index labels present in both frames.
result = pd.concat(objs=[df1, df2], join="inner", axis="columns")

In [92]:
# Show the full, untruncated answer retrieved for the row labelled 0.
display(result.loc[0, "FAQs_Answer"])

"It is 18/10. I will also add that it is solidly made and not a cheap looking item at all. The mesh is ultra fine and well secured with a neat stainless rim as you can see in the picture. Again, this is a very fine mesh so perfect for skimming soups, etc. which is how I use it. I keep my utensils hanging from stainless bars and this is attractive enough for me to hang there as well. So far, I'm very pleased with it."

In [93]:
# First rows of the frame; "question" holds the preprocessed text at this point.
data.head()

Unnamed: 0,question,answer
0,crush ice,yes :) you can get nice slushies for sure ;)
1,fan feet word stay upright obvious pictur,The base is broad enough that it can stay upri...
2,use cold water make espresso thank,Yes. It won't be as robust or bitey as when us...
3,work air purifi well recent shown rainbow work...,Let me say this: When I use my rainbow to vacu...
4,hand held air deflat pump buy bag,I have a Quick Fill Electric Pump that I broug...
