# Notebook for Week 3 lecture

In [None]:
import re

import nltk
import pandas as pd
from nlpia.data.loaders import get_data
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.casual import casual_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosine

## Example: Using tf-idf to do a search

### Review: Starting with documents, compute some tfidf vectors

Let's make some documents:

In [None]:
# Four product descriptions used as a toy corpus: two smartphones (s1, s2) and two toys (s3, s4).
s1 = "Samsung Galaxy S7 32GB - Black - Unlocked (Renewed) 5.1-inch touchscreen display with a resolution of 1440 pixels by 2560 pixels at a PPI of 577 pixels per inch Samsung Exynos 8890 Octa-core 64-bit processor12-megapixel primary camera on the rear and a 5-megapixel front camera 32GB of internal storage can be expanded (MicroSD)"
s2 = "Samsung Smartphone Galaxy S8 64GB UK Version - Midnight Black 5.8” QuadHD + sAMOLED display – Rounded-corner Infinity Display and symmetrical metal sides that blend effortlessly for a seamless look(unlocked) IP68-rated water and dust resistant with a powerful 10nm mobile AP for multi-tasking Dual pixel 12MP camera with F1.7 lens and enhanced image processing. 8MP front camera with facial recognition for smart autofocus 3,000mAh battery with fast charging capabilities via USB Type-C. Wireless Charger Convertible Expandable memory up to 256GB for storing all your photos, movies and music"
s3 = "JOYIN Play-act Pretend Play Smart Phone, Keyfob Key Toy and Credit Cards Set Kids Toddler Cellphone Key Toys Great Value. Set includes Electronic Toy Keyfob, Electronic Toy Phone, Driver's License and Debit Card. Each Toy Accessory Provides Different Play Patterns Adding-up to Endless Hands-on Playtime. SOUND EFFECTS. Toy Mobile Phone (Requires 3 AAA Batteries Not Included) Talks Back to Kids with14 Different Unique Phrases and Music When Touched. Electronic Toy Keyfob (Requires 3 L44 Batteries Included) Features Colorful Press Buttons with Three Different Car Sounds INCREDIBLE DETAILS. Driver License and Debit Cards are all Designed with Details and Fashion.It’s Handy and Perfect for Toddler Fashionistas to Play PREMIUM QUALITY & SAFETY. Child Safe: Non-Toxic. Meet US toy standard. Safety test approved. CUSTOMER SATISFACTION. Providing a 100% satisfaction experience is our main priority to our customers. Feel free to message us through “contact sellers” if products don't meet your expectations. The celebrations start at JOYIN!"
# Note: the backslash in "EN71\\EN62115" is escaped explicitly; a bare "\E" is an invalid escape sequence.
# The resulting text is unchanged (a single backslash).
s4 = "Ricco Kids TWO MOTORS Battery Powered i8 Style Sports Coupe Electric Ride On Toy Car (Model: KL1888) (WHITE) Electric Ride On Car with Excellent Quality and Comfort, Parental Remote Control Included Two 6V 4.5AH Batteries and Two 20W Motors, Four Wheels Suspension, LED Light, Horn, Power Level Display MP3 Music Input Interface on steering wheel, Foot Pedal Accelerator, Forward, reverse and neutral gears. Designed for Age: 3-6 Years Old, Max Capacity: 30 KGS, Max Speed: 5 KM/Hour Product Size: 102*68*44 CM, Net Weight: 13 KGS, part assembled, Charging Time:8~12 hours, Driving Time: about 45 minutes. Standards complied: GB6675 GB19865 EN71\\EN62115 ASTM-F963"

# The corpus: one entry per document, in a fixed order (document i is documents[i]).
documents = [s1, s2, s3, s4]

Let's use our own tokenizer for now:

In [None]:
#Louis' tokenizer from last week

lem = WordNetLemmatizer()

# Token delimiters: any run of whitespace, hyphens or the punctuation . , ; ! ?
# Compiled once here so the pattern is not looked up again for every document.
_TOKEN_DELIMITERS = re.compile(r'[-\s.,;!?]+')


def my_tokeniser(doc):
    """Split a document into lower-cased, lemmatised tokens with stop words removed.

    Args:
        doc: The text of one document (a string).

    Returns:
        A list of token strings, in the order they appear in ``doc``.
        Empty strings (which the split produces when ``doc`` starts or ends
        with a delimiter, or is empty) are never returned.
    """
    processed = []
    for raw_token in _TOKEN_DELIMITERS.split(doc):
        # A leading/trailing delimiter makes split() emit '' -- that is not a word, so skip it
        if not raw_token:
            continue
        #Lemmatise and make lowercase
        t = lem.lemmatize(raw_token.lower()) #Can try changing this line to see how it impacts results (e.g., remove lemmatising, keep lowercase)
        #Remove stop words
        if t not in stop_words.ENGLISH_STOP_WORDS:
            processed.append(t)
    #Return an array of tokens for that document
    return processed

Let's compute the tfidf vectors for each document in our corpus:

In [None]:
# Step 1: term counts per document, tokenised with our own tokenizer and converted to a dense array
cv = CountVectorizer(tokenizer=my_tokeniser, min_df=1)
count_array = cv.fit_transform(documents).toarray()

# Step 2: fit the tfidf weighting on those counts, then apply it to the same counts
transformer = TfidfTransformer()
tfidfs = transformer.fit(count_array).transform(count_array)

It's helpful to store our tfidfs in a pandas dataframe, which is easy to display and work with. Note though that the representation the transformer gives us is a 'compressed sparse row' matrix, which means it's stored in a compact representation. We need to convert it to a dense representation (i.e., the sort of matrix you're used to seeing) to create a dataframe from it.

In [None]:
# The vocabulary terms become the column labels of the table
vocab = cv.get_feature_names()
# Densify the sparse tfidf matrix, then wrap it in a data frame (one row per document) so it is easy to work with
dense_tfidfs = tfidfs.todense()
data = pd.DataFrame(dense_tfidfs, columns=vocab)

In [None]:
data #just displays our tfidf data as a table (a bare name on the last line of a cell is shown as the cell's output)

In [None]:
#More helpful to print top 10 words for each document (i.e., words with top tfidf values)
num_words = 10
# Iterate over the rows of the data frame directly; there is no need to densify
# the sparse matrix again just to count how many documents there are.
for i, doc_scores in data.iterrows():
    print("doc", i)
    # Highest tfidf scores first, keep only the first num_words of them
    print(doc_scores.sort_values(ascending=False).head(num_words))

Let's compute cosine similarities between all pairs of documents using this tfidf representation, like Louis showed last week:

In [None]:
#Convert to array
# .toarray() returns a plain NumPy ndarray. (.todense() returns the legacy np.matrix type,
# which newer scikit-learn releases refuse to accept.)
tfidfs_dense = tfidfs.toarray()
#Find similarities: called with a single argument, this compares every row (document) with every other row
result = cosine(tfidfs_dense)
#Put the result in a dataframe
df = pd.DataFrame(result)
#Show with heatmap style gradients
df.style.background_gradient(cmap='Greens')

### Using tfidf vectors to query



In [None]:
query = ["phone"] #Replace this with whatever word(s) you want to use as your query
query_vector = cv.transform(query) #produce our count vectors, using the same method (including tokenizer) as we used on the corpus
query_tfidf = transformer.transform(query_vector) #produce tfidf scores for this query, using the same method as we used on the corpus

#query_tfidf is now a sparse (CSR) matrix; we need to get the "dense" version to compare it to our other tfidf vectors
# .toarray() gives a plain NumPy ndarray (.todense() would give the legacy np.matrix type, which newer scikit-learn releases reject)
dense_query = query_tfidf.toarray()

#This will compute cosine similarity between dense_query and each element of the vector array
#The first element is the similarity with document 1, the second is the similarity with document 2, and so on
cosine(dense_query, tfidfs_dense)


## Latent Semantic Analysis

To explore LSA, we'll use a dataset of SMS messages -- some spam, others not -- from the NLPIA book. To get this data:

In [None]:
# Load the 'sms-spam' dataset through nlpia's get_data loader
sms = get_data('sms-spam')

In [None]:
#Take a look (a bare name on the last line of a cell is displayed as the cell's output):
sms

In [None]:
# Give every sms a readable name before rebuilding the dataframe:
# 'sms' + its row number, followed by '!' repeated once per unit of its spam flag (so spam names end in '!')
index = ['sms{}{}'.format(row_num, '!' * spam_flag) for row_num, spam_flag in enumerate(sms.spam)]

# Rebuild the dataframe with the same values and columns, but with the new names as the row index.
# Each row is one text message; the "text" column holds the text we actually want to analyse.
sms = pd.DataFrame(data=sms.values, index=index, columns=sms.columns)
sms #print it

As an alternative to the above, where we used a CountVectorizer with a custom tokenizer, followed by a TfidfTransformer, we can make a TfidfVectorizer that uses an existing tokenizer. Here, casual_tokenize (from the nltk toolkit) is good for tokenizing text like sms where the language may be casual, we may have emojis, etc. (https://www.nltk.org/_modules/nltk/tokenize/casual.html)

In [None]:
# One-step tfidf vectoriser that tokenises with nltk's casual_tokenize instead of its default tokenizer
tfidf_vectoriser = TfidfVectorizer(tokenizer=casual_tokenize)

Note that the documentation for TfidfVectorizer says it's "Equivalent to :class:`CountVectorizer` followed by
:class:`TfidfTransformer`." :

In [None]:
# IPython help syntax: a leading ? shows the documentation for the named object
?TfidfVectorizer

We can now use `tfidf_vectoriser.fit_transform` to transform the raw documents using this call; it returns a sparse matrix which we can convert into a normal matrix using `.toarray()`:

In [None]:
# Fit the vectoriser on the sms texts and transform them in one call, then convert the result to a dense array
tfidf_docs = tfidf_vectoriser.fit_transform(raw_documents=sms.text).toarray()

Here's some code that lets us explore the tfidf outputs a bit, doing some sanity checking before proceeding:

In [None]:
len(tfidf_vectoriser.vocabulary_) #shows us size of the vocabulary (number of entries in the fitted vocabulary_ mapping)

In [None]:
# Use the array's own .shape attribute: no standalone shape() function is imported in this notebook
tfidf_docs.shape #The rows and columns in our array (1 row per document, 1 column per term)

We'd like to make a dataframe from `tfidf_docs` for display/processing convenience, but we want to know which columns correspond to which terms. We do this complicated zip operation to get a list of column numbers (`column_nums`) and a list of corresponding terms (`terms`, in the same order). We can then use `terms` to set the column names in our data frame. Note that the row names in our data frame (i.e., the indexes) are the same as the original sms data frame.

In [None]:
# vocabulary_ maps term -> column number. Flip each pair to (column number, term) and sort,
# so the terms end up in the same order as the columns of tfidf_docs.
vocab_items = tfidf_vectoriser.vocabulary_.items()
ordered_pairs = sorted((col_num, term) for term, col_num in vocab_items)
# Un-zip the sorted pairs into two parallel tuples
column_nums, terms = zip(*ordered_pairs)

# Label the columns with the terms and reuse the sms row names as the index
tfidf_docs_df = pd.DataFrame(tfidf_docs, index=sms.index, columns=terms)
tfidf_docs_df #show it

In [None]:
#It looks like sms2 has some &, ' and ( characters based on the table above. Let's verify, by viewing it in the original dataset.
#The .iloc indexer (used with square brackets) selects dataframe data by integer position.
# Here, let's grab all the columns (":") from the row at position 2, then read its "text" field:
sms.iloc[2,:].text

## Apply LSA using TruncatedSVD

The first thing we need to do is subtract the mean of each tfidf column from each value in that column (this is called "centering"; it is also the first step of "whitening", which additionally rescales the data)

In [None]:
# Mean tfidf value of every term (one value per column)
column_means = tfidf_docs_df.mean()
# Centre the vectorized documents: subtract each column's mean from every value in that column
tfidf_docs_df = tfidf_docs_df.sub(column_means)

Now we can apply TruncatedSVD to the mean-subtracted values, using fit_transform. Unlike the vectorisers above, this gives us an ordinary dense array (one row per message, one column per topic) rather than a sparse matrix. Note that this may take a little while to compute.

In [None]:
# LSA model: reduce to 16 components (topics), with n_iter set to 1000
svd = TruncatedSVD(
    n_components=16,
    n_iter=1000,
)
# Fit on the tfidf values and get each message's coordinates in the 16 topic dimensions
tfidf_values = tfidf_docs_df.values
svd_topic_vectors = svd.fit_transform(tfidf_values)

In [None]:
#put it in a dataframe, again being good to ourselves by giving it row and column names
# One 'topicN' column label per column of svd_topic_vectors. The count is read from the array itself,
# so this cell keeps working if n_components is changed when the SVD is created.
svd_topic_columns = ['topic{}'.format(i) for i in range(svd_topic_vectors.shape[1])]
svd_topic_vectors_df = pd.DataFrame(svd_topic_vectors, index=sms.index, columns=svd_topic_columns)
svd_topic_vectors_df

Notice from the above that we've managed to represent each SMS message in just 16 numbers! :D wow!

But there's more!

We can look at the weights LSA has assigned to each word within each topic. `svd.components_` is a variable that gives us these weightings:

In [None]:
# Use the array's own .shape attribute: no standalone shape() function is imported in this notebook
svd.components_.shape #it has 16 rows (one per topic) and 9232 columns (one per word)

We can also peek at the singular values using `svd.singular_values_` if we're interested (usually we're not)

In [None]:
# Display the singular values stored on the fitted svd object
svd.singular_values_

In [None]:
#let's stick this in a friendly dataframe
#below, we use .T to transpose svd.components_ into the familiar form where *rows* correspond to words and *columns* to topics
# One 'topicN' label per row of svd.components_ (i.e., per topic). The count is read from the fitted model,
# so this cell keeps working if n_components is changed when the SVD is created.
weight_columns = ['topic{}'.format(i) for i in range(svd.components_.shape[0])]
topic_weights = pd.DataFrame(svd.components_.T, index=terms, columns=weight_columns)
topic_weights #display it

We can examine our topics by looking at which words are highly weighted in each topic. To do this for an individual topic, we can use the following code:

In [None]:
# Weights of every word in topic 2, largest first
topic2_ranked = topic_weights["topic2"].sort_values(ascending=False)
topic2_ranked.head(10) # show top 10 weighted words for topic 2

Or we can do this for every topic, using a for-loop:

In [None]:
#Do this for all topics
# Loop over the columns that actually exist in topic_weights rather than a hard-coded count,
# and index with [] so that a missing column fails with a clear KeyError.
for i, topicName in enumerate(topic_weights.columns):
    print("topic " + str(i) + ":")
    # Top 10 weighted words for this topic, largest weight first
    weightedlist = topic_weights[topicName].sort_values(ascending=False)[:10]
    print(weightedlist)

We can also see the topic spread for each word in a new fake sms that we construct:

In [None]:
# Let pandas show up to 16 columns, just to make sure we can see everything
pd.options.display.max_columns = 16

# The tokens of our made-up sms, split on whitespace
new_sms_tokens = '! ;) :) half off crazy deal discount'.split()
# Transpose so that words are columns, keep only the columns for our tokens,
# then round to 3 decimals and scale by 100 for readability
per_token_weights = topic_weights.T[new_sms_tokens]
weights_for_new_SMS = per_token_weights.round(3) * 100
weights_for_new_SMS

The code above just shows the topic weights for each individual token in the fake new SMS. Can you figure out how to construct a query vector from this SMS -- i.e., a single vector that you could compare with other messages using cosine similarity?

## Now let's try Latent Dirichlet Allocation on the same dataset

Don't forget, we apply LDiA to the word count vectors, **not to the tf-idf vectors**.

In [None]:
# Set things up
# The topic model itself: 16 topics, a fixed random_state, batch learning
lda = LatentDirichletAllocation(
    n_components=16,
    learning_method='batch',
    random_state=123,
)

# Word counts for the sms texts (counts, not tfidf), tokenised with casual_tokenize
# and configured with English stop words, max_df=.1 and at most 5000 features
lda_cv = CountVectorizer(
    tokenizer=casual_tokenize,
    stop_words='english',
    max_df=.1,
    max_features=5000,
)
count_data = lda_cv.fit_transform(sms.text)

In [None]:
# And run LDA. This could take a loooonggggg time....
lda_topics = lda.fit_transform(count_data)

In [None]:
#Now make a friendly data frame from the topics and display it
# One 'topicN' column label per column of lda_topics. The count is read from the array itself,
# so this cell keeps working if n_components is changed when the LDA model is created.
lda_topic_columns = ['topic{}'.format(i) for i in range(lda_topics.shape[1])]
lda_topic_vectors_df = pd.DataFrame(lda_topics, index=sms.index, columns=lda_topic_columns)
lda_topic_vectors_df

In [None]:
#Print the top words for each topic
#This uses some fancier Python than the way we printed out LSA topics above; either is fine!
n_top_words = 10
feature_names = lda_cv.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    # Topics are numbered from 0 so that "Topic 3" here is the 'topic3' column of lda_topic_vectors_df
    print("Topic %d:" % topic_idx)
    # argsort() orders word positions by ascending weight; reverse it and keep the first n_top_words
    top_word_positions = topic.argsort()[::-1][:n_top_words]
    print(" ".join([feature_names[i] for i in top_word_positions]))

In [None]:
#Explore topic distributions for existing documents. 
#Think about how dense/sparse this distribution is compared to LSA topics
doc_num = 9 #can change this
print("SMS: " + sms.text.get(doc_num))
print(lda_topic_vectors_df.iloc[doc_num,:]) #see topic weighting for document # doc_num

In [None]:
#Compare to sparseness/density of distribution over LSA topics for the same text message
#(remember that topics in LDiA will not correspond to topics in LSA)
print(svd_topic_vectors_df.iloc[doc_num,:]) #see LSA (svd) topic weighting for document # doc_num