In [9]:
import pandas as pd
import numpy as np
import re
import multiprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.csr import csr_matrix
from typing import Tuple, NamedTuple, List, Optional
from sparse_dot_topn import awesome_cossim_topn
from functools import wraps

In [57]:
def get_top_n_match(row, n_top=5):
    """
    :param row:
    :param n_top: number of results to be determined
    :return: list of tuples with index of the match and the cosine similarity score
    """

    row_count = row.getnnz()
    if row_count == 0:
        return None
    elif row_count <= n_top:
        result = zip(row.indices, row.data)
    else:
        arg_idx = np.argpartition(row.data, -n_top)[-n_top:]
        result = zip(row.indices[arg_idx], row.data[arg_idx])
    return sorted(result, key=(lambda x: -x[1]))


def match_company_name(input_name, vectorizer, comp_name_vectors, comp_name_df):
    """
    :param input_name: input company name whose matches need to be found
    :param vectorizer: TFIDF vectorizer which was initialized earlier
    :param comp_name_vectors: the company names' vectors of the whole data set
    :param comp_name_df: the company names dataframe
    :return: a dataframe with top N matching names with match score
    """

    input_name_vector = vectorizer.transform([input_name])
    result_vector = input_name_vector.dot(comp_name_vectors.T)
    matched_data = [get_top_n_match(row) for row in result_vector]
    flat_matched_data = [tup for data_row in matched_data for tup in data_row]
    lkp_idx, lkp_sim = zip(*flat_matched_data)
    nr_matches = len(lkp_idx)
    matched_names = np.empty([nr_matches], dtype=object)
    sim = np.zeros(nr_matches)
    for i in range(nr_matches):
        matched_names[i] = comp_name_df['Que'][lkp_idx[i]]
        sim[i] = lkp_sim[i]
    return pd.DataFrame({"Matching company name": matched_names,
                         "Match Score (%)": sim*100})
  
  # Example


In [2]:
f = '/home/ifte/alechat_core/assets/dataset/general_conversation.csv'

In [59]:
frame1 = pd.read_csv(f)

In [60]:
frame1['Ans'][0]

'Both are excellent technology they are helpful in many ways. For the security purpose both are super.'

In [23]:
frame2 = '/home/ifte/alechat_core/assets/dataset/wiki_QA.csv'

In [24]:
frame2 = pd.read_csv(frame2)

In [46]:
merged = pd.concat([frame1, frame2], axis=0, ignore_index=True)

In [61]:
merged.head()

Unnamed: 0.1,Unnamed: 0,Que,Ans,Sent
0,0,Are you a fan of Google or Microsoft?,Both are excellent technology they are helpful...,Curious to dive deeper
1,1,Both are excellent technology they are helpful...,"I'm not a huge fan of Google, but I use it a ...",Curious to dive deeper
2,2,"I'm not a huge fan of Google, but I use it a ...",Google provides online related services and pr...,Curious to dive deeper
3,3,Google provides online related services and pr...,"Yeah, their services are good. I'm just not a ...",Curious to dive deeper
4,4,"Yeah, their services are good. I'm just not a ...",Google is leading the alphabet subsidiary and ...,Curious to dive deeper


In [1]:
x = {}

In [2]:
if x:
    print('ok')

In [48]:
corpus = merged.Que.values.tolist()

In [11]:
corpus[0]

'Are you a fan of Google or Microsoft?'

In [49]:
vectorizer = TfidfVectorizer(max_df=0.2, analyzer='word', stop_words='english')

vectors = vectorizer.fit_transform(corpus)

In [58]:
result_df = match_company_name("ADVISORY U S EQUITY MARKET", vectorizer, vectors, corpus)
print(result_df)

TypeError: list indices must be integers or slices, not str

In [4]:
search = 'Do you like Microsoft?'

In [51]:
    input_name_vector = vectorizer.transform([search])
    result_vector = input_name_vector.dot(vectors.T)
    matched_data = [get_top_n_match(row) for row in result_vector]
    flat_matched_data = [tup for data_row in matched_data for tup in data_row]
    lkp_idx, lkp_sim = zip(*flat_matched_data)
    nr_matches = len(lkp_idx)
    matched_names = np.empty([nr_matches], dtype=object)
    sim = np.zeros(nr_matches)

In [52]:
    for i in range(nr_matches):
        matched_names[i] = merged['Que'][lkp_idx[i]]
        sim[i] = lkp_sim[i]

In [None]:
df['Que'][lkp_idx[i]]

In [None]:
df['Que'][188378]

In [38]:
lkp_idx[i]

188797

In [24]:
df['Que'][lkp_idx[0]]

'Are you a fan of Google or Microsoft?'

In [26]:
lkp_idx[1]

6773

In [54]:
matched_names

array(['Are you a fan of Google or Microsoft?',
       'what kind of company is Microsoft?',
       'Actually I read that they are behind Apple and Microsoft.',
       "No I don't. I use a cheap phone. Microsoft once held a funeral to mark the death of the iPhone when it launched its own line of phones. Microsoft really can't beat Apple at that game. ",
       'Yeah, do you use Cortana? The microsoft thing? '], dtype=object)

### awesome cossin top

In [49]:
import pandas as pd

In [88]:
main = pd.read_pickle('/home/ifte/alechat_core/corpus/raw/combined_raw_corpus.pkl')

In [52]:
df = pd.read_pickle('/home/ifte/alechat_core/corpus/nlu/raw/nlu_raw_data_frame.pkl')

In [70]:
main.head()

Unnamed: 0,intent,text
0,talking_weather,What is the weather forecast tomorrow? Is it h...
1,affirm,yes of course sure yeah ok cool yep yep will d...
2,ask_builder,can you share your boss with me? i want to get...
3,ask_howbuilt,How were you built? Tell me how you were made?...
4,ask_howdoing,Ahoy matey how are you? are you alright are yo...


In [36]:
df

Unnamed: 0,text,intent
763,nah thanks,deny
650,we started working with chatbot but now we nee...,contact_staffs
718,lets talk to sales,contact_staffs
485,What is your birthplace?,ask_wherefrom
1206,Do you charge one time price,priceInfo
...,...,...
143,who made you?,ask_builder
607,and that is it?,canthelp
562,ciao,bye
707,I would like to contact your sales team please,contact_staffs


In [54]:
from sklearn.feature_extraction.text import CountVectorizer

In [55]:
corpus = df.text.values.tolist()

In [56]:
count_vectorizer = CountVectorizer()
# Learn a vocabulary dictionary of all tokens in the raw documents.
vocabulary = count_vectorizer.fit(corpus).vocabulary_

In [None]:
dict(sorted(vocabulary.items(), key=lambda item: item[1], reverse=True))

In [57]:
search ='how are you'

In [58]:
vectorizer = TfidfVectorizer(vocabulary=vocabulary)

tfidf_spend_vendor = vectorizer.fit_transform([search])

tfidf_vendor = vectorizer.fit_transform(corpus).transpose()

results = awesome_cossim_topn(tfidf_spend_vendor, tfidf_vendor, 30, 0)

In [59]:
search

'how are you'

In [62]:
df.index[i]

236

In [66]:
collector = []
for index, i in enumerate(results.indices):
#     print('{}: {} '.format(corpus[i], results.data[index]), df['intent'][i])
    collector.append((results.data[index], df['intent'][i]))

In [90]:
sync = []
save = 1
for score, intent in collector:
    sync.append((main[main['intent'] == intent].index.values[0], score))
    save = intent
    

In [94]:
sorted(sync, key = lambda x: x[1], reverse=True)

[(32, 0.9837066467635389),
 (32, 0.9837066467635389),
 (32, 0.9837066467635389),
 (32, 0.9837066467635389),
 (32, 0.8164766225589803),
 (32, 0.6537221290201787),
 (32, 0.6537221290201787),
 (32, 0.6537221290201787),
 (34, 0.6307362077170526),
 (32, 0.6280127516074062),
 (33, 0.6007800850716278),
 (32, 0.5712156055114965),
 (32, 0.5712156055114965),
 (40, 0.568975198508233),
 (40, 0.568975198508233),
 (34, 0.5594742981902705),
 (34, 0.5594742981902705),
 (32, 0.5359731144772885),
 (34, 0.5287974241727714),
 (31, 0.5212182166965337),
 (34, 0.5201725628917543),
 (34, 0.5201725628917543),
 (34, 0.5201725628917543),
 (34, 0.5201725628917543),
 (34, 0.5201725628917543),
 (34, 0.5201725628917543),
 (34, 0.5201725628917543),
 (40, 0.5201725628917543),
 (38, 0.49967714818589715),
 (32, 0.4839472839072857)]

In [93]:
sorted(collector, key = lambda x: x[1])

[(0.5212182166965337, 'ask_howbuilt'),
 (0.9837066467635389, 'ask_howdoing'),
 (0.9837066467635389, 'ask_howdoing'),
 (0.9837066467635389, 'ask_howdoing'),
 (0.9837066467635389, 'ask_howdoing'),
 (0.8164766225589803, 'ask_howdoing'),
 (0.6537221290201787, 'ask_howdoing'),
 (0.6537221290201787, 'ask_howdoing'),
 (0.6537221290201787, 'ask_howdoing'),
 (0.6280127516074062, 'ask_howdoing'),
 (0.5712156055114965, 'ask_howdoing'),
 (0.5712156055114965, 'ask_howdoing'),
 (0.5359731144772885, 'ask_howdoing'),
 (0.4839472839072857, 'ask_howdoing'),
 (0.6007800850716278, 'ask_howold'),
 (0.6307362077170526, 'ask_isbot'),
 (0.5594742981902705, 'ask_isbot'),
 (0.5594742981902705, 'ask_isbot'),
 (0.5287974241727714, 'ask_isbot'),
 (0.5201725628917543, 'ask_isbot'),
 (0.5201725628917543, 'ask_isbot'),
 (0.5201725628917543, 'ask_isbot'),
 (0.5201725628917543, 'ask_isbot'),
 (0.5201725628917543, 'ask_isbot'),
 (0.5201725628917543, 'ask_isbot'),
 (0.5201725628917543, 'ask_isbot'),
 (0.49967714818589715

In [47]:
for index, i in enumerate(results.indices):
    print('{}: {} '.format(corpus[i], results.data[index]), corpus['text'][i])

IndexError: tuple index out of range

In [51]:
x = [(1, 2.662512538434358)]

In [53]:
p=[]

In [54]:
p.append((2,3))

In [56]:
p.append((4,5))

In [59]:
p = {'page_relevance': [{'doc_id': 32, 'doc_index': 32, 'score': 0.6388156550707736, 'keywords': [], 'positions': [], 'snippet': 'Ahoy matey how are you? are you alright are you having a good day Are you ok? are you okay Do you feel good? how are things going how are things with you? How are things? how are you doing this morning how are you feeling how are you today how is it going how is your day how is your day going how is...'}, {'doc_id': 34, 'doc_index': 34, 'score': 0.6309240716123865, 'keywords': [], 'positions': [], 'snippet': 'are you a bot? are you a real bot? are you a bot are you really a bot are you a robot are you a robot are you a chatbot bot? are you a chatbot bot? what are you a bot? tell me are you a bot? are you a Skynet ? are you a chatbot are you ai are you artificial intelligence are you artificial i guess yo...'}, {'doc_id': 31, 'doc_index': 31, 'score': 0.4973787650072693, 'keywords': [], 'positions': [], 'snippet': "How were you built? Tell me how you were made? Let me know how you were made exactly I'd like to know how you were created Can you give me an idea as to how you were created? I want to know how you were formed What was the process for making you? Can you explain how you were created? Specify how you..."}], 'NLU-Reply': ['It was nice meeting you!', "Yep, I'm a bot!", 'I was built with a lot of love and patience.'], 'WEB-Reply': {}, 'Imp-Phrases': [], 'status_code': 1}

In [58]:
import json

In [61]:
json.dumps(p)

'{"page_relevance": [{"doc_id": 32, "doc_index": 32, "score": 0.6388156550707736, "keywords": [], "positions": [], "snippet": "Ahoy matey how are you? are you alright are you having a good day Are you ok? are you okay Do you feel good? how are things going how are things with you? How are things? how are you doing this morning how are you feeling how are you today how is it going how is your day how is your day going how is..."}, {"doc_id": 34, "doc_index": 34, "score": 0.6309240716123865, "keywords": [], "positions": [], "snippet": "are you a bot? are you a real bot? are you a bot are you really a bot are you a robot are you a robot are you a chatbot bot? are you a chatbot bot? what are you a bot? tell me are you a bot? are you a Skynet ? are you a chatbot are you ai are you artificial intelligence are you artificial i guess yo..."}, {"doc_id": 31, "doc_index": 31, "score": 0.4973787650072693, "keywords": [], "positions": [], "snippet": "How were you built? Tell me how you were made? 