In [19]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer


In [2]:
df = pd.read_csv('/home/ifte-home/work/AI-system/chatbot/Models/Simple-Chatbot-NLTK/processed_data.csv')

In [12]:
df.columns

Index(['Unnamed: 0', 'storytitle', 'story', 'story_lines'], dtype='object')

In [11]:
# Use a CountVectorizer to learn the terms and term frequencies across all of the documents (carols) 
cv = CountVectorizer()
doc_term_matrix = cv.fit_transform(df['story_lines'])

In [13]:
# Get the terms - unique words excluding single char words like "a"
cv.get_feature_names()
# Check the number of terms
len(cv.get_feature_names())
# View the word counts across all of the documents
word_counts = pd.DataFrame(doc_term_matrix.toarray(), index=df["storytitle"], columns=cv.get_feature_names())
word_counts

Unnamed: 0_level_0,00,000,00am,00pm,078,10,100,1000,10000,100k,...,zoom,zoomi,zooney,zoot,zootopia,zucchini,zugbo,zumba,zuri,zyah
storytitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
David Drops the Weight,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Frustration,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Marcus Buys Khakis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Different Opinions,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Overcoming shortcomings,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Flavor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
After Death,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Janice breaks her wrist,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Jamie marries for love,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# View the most and least frequent words
word_counts.sum().sort_values(ascending=False)

wa              63584
hi              34520
day             11737
one             11413
went            10842
                ...  
nautic              1
cpu                 1
cr                  1
nauseatingli        1
poochi              1
Length: 17398, dtype: int64

In [17]:
# View the word counts for certain words
word_counts[["cpu", "girl"]]

Unnamed: 0_level_0,cpu,girl
storytitle,Unnamed: 1_level_1,Unnamed: 2_level_1
David Drops the Weight,0,0
Frustration,0,0
Marcus Buys Khakis,0,0
Different Opinions,0,0
Overcoming shortcomings,0,0
...,...,...
Flavor,0,0
After Death,0,0
Janice breaks her wrist,0,0
Jamie marries for love,0,1


In [20]:
# We have the term frequencies, now determine the inverse document frequencies (IDFs)
idfs = TfidfTransformer() 
idfs.fit(doc_term_matrix)
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
# Create a data frame with the IDF values 
idfs_df = pd.DataFrame(idfs.idf_, index=cv.get_feature_names(), columns=["idfs"]) 
 
# Sort ascending and display
# High IDF (1/DF) terms are less frequent across all documents; low IDF terms are more frequent 
idfs_df.sort_values(by=['idfs'], ascending=False)

Unnamed: 0,idfs
zyah,11.178578
doorstop,11.178578
emelia,11.178578
emce,11.178578
embryo,11.178578
...,...
went,2.694425
day,2.652029
one,2.648567
hi,2.024015


In [21]:
# We have the term frequencies and inverse document frequencies - now calculate the TF-IDF scores
tf_idfs = idfs.transform(doc_term_matrix)

# Create a data frame to view the TF-IDF scores for the first document, doc = 0
doc = 0
col = "tf-idf for doc {}".format(doc)
tf_idf_doc = pd.DataFrame(tf_idfs[doc].T.todense(), index=cv.get_feature_names(), columns=[col])
tf_idf_doc.sort_values(by=[col], ascending=False)

Unnamed: 0,tf-idf for doc 0
vegetarian,0.292694
much,0.282698
examin,0.261558
habit,0.248663
start,0.235896
...,...
flumpti,0.000000
flung,0.000000
flunk,0.000000
fluoresc,0.000000


In [22]:
# Create a data frame to view all of the TF-IDF scores
tf_idf_all_docs = pd.DataFrame(tf_idfs.T.todense(), index=cv.get_feature_names())
tf_idf_all_docs

# Nicer if we re-orientate the scores so they're displayed in the same way as the term frequencies at the top
# Use np.transpose to swap array rows and columns
tf_idf_all_docs_nicer = pd.DataFrame(np.transpose(tf_idfs.T.toarray()), index=df["Carol"], columns=cv.get_feature_names())
tf_idf_all_docs_nicer

# Even better, let's just display the TF-IDFs for certain words of interest
tf_idf_all_docs_nicer[["christmas", "jingle"]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52655,52656,52657,52658,52659,52660,52661,52662,52663,52664
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00am,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00pm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zucchini,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zugbo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zumba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zuri,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Now let's perform a simple query that looks for the following words
query = "girl mexican"

# Calculate term frequencies for the query using terms found across all of the documents
query_term_matrix = cv.transform([query])

In [24]:
# Across all of the terms, view the word counts for the query
query_counts = pd.DataFrame(query_term_matrix.toarray(), columns=cv.get_feature_names())

# Query term counts, showing all terms within the documents
# query_counts

# Query term counts, showing just the query terms (shows what we know already of course)
query_counts[query.split(" ")]

Unnamed: 0,girl,mexican
0,1,1


In [25]:
# Calculate the cosine similarity between the vector of each document and the query vector
results = cosine_similarity(tf_idfs, query_term_matrix)
results

array([[0.        ],
       [0.        ],
       [0.        ],
       ...,
       [0.        ],
       [0.20681501],
       [0.20258624]])

In [26]:
results = results.reshape((-1,))
results

array([0.        , 0.        , 0.        , ..., 0.        , 0.20681501,
       0.20258624])

In [30]:
# Print the top search results - voila, hopefully!
# Dan's note to self:
# argsort sorts an array in asc order, and then returns the indexes of the sorted values
# Useful slice notation reference: https://stackoverflow.com/questions/509211/understanding-slice-notation 
# [:-11:-1] returns the last 10 items, in reverse order
print("Search results for: '{}'".format(query))
for i in results.argsort()[:-11:-1]:
    if results[i] > 0:
        print("Carol {}. {} {}%".format(i, df.iloc[i,0], round(100*results[i])))
        print(df.iloc[i,3])



Search results for: 'girl mexican'
Carol 27015. 27015 49%
joe crave mexican bread went mexican bakeri got dozen mexican bread went store bought milk enjoy hi breakfast
Carol 17578. 17578 48%
shane walk home work notic girl lay ground tri help girl girl unrespons girl ha murder
Carol 16340. 16340 46%
boy teas girl girl got mad punch boy boy told girl girl got troubl
Carol 15593. 15593 46%
girl fun told mom mom took girl girl appreci girl savor memori fun mom
Carol 12925. 12925 46%
girl sat swing man push girl girl want go higher man tri push girl higher girl fell swing
Carol 18490. 18490 45%
boy like girl boy ask girl girl refus boy persist girl got restrain order
Carol 9523. 9523 41%
jane met girl school girl wa littl weird jane parent said girl sleepov turn girl wa perfectli normal jane becam best friend
Carol 45830. 45830 41%
tom love eat restaur hi favorit wa mexican food new mexican restaur open near immedi went tri love fajita
Carol 24531. 24531 40%
alli saw new girl school decid 