## Exercise 1

In [5]:
# Download the NLTK resources this notebook needs before anything else runs.
# The stop-word list fetched here is used when the vocabulary is built.
import nltk
nltk.download('stopwords')

# Troubleshooting: if you see the error
#     Resource 'tokenizers/punkt/english.pickle' not found
# the tokenizer data is missing. Follow the instructions at
# https://stackoverflow.com/questions/4867197/failed-loading-english-pickle-with-nltk-data-load
# or uncomment the call below to fetch the missing resource.
# nltk.download()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tnguyen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import string
from nltk.corpus import stopwords
import math
from collections import Counter

# Shared Porter stemmer instance, reused for documents and queries
stemmer = PorterStemmer()

def tokenize(text):
    """Normalise a raw document into a space-separated string of stems.

    Punctuation characters (``string.punctuation``) are deleted, the remaining
    text is split with ``nltk.word_tokenize``, and each token is lower-cased
    and stemmed with the module-level Porter stemmer.

    Args:
        text: the raw document string.

    Returns:
        A single string with the stemmed tokens joined by one space.
    """
    # Delete every punctuation character in one pass
    without_punctuation = text.translate(str.maketrans("", "", string.punctuation))
    stems = []
    for token in nltk.word_tokenize(without_punctuation):
        stems.append(stemmer.stem(token.lower()))
    return " ".join(stems)

# Read a list of documents from a file. Each non-blank line in the file is a document
with open("bread.txt") as f:
    content = f.readlines()
# Strip surrounding whitespace and skip blank lines: a blank line would become an
# empty document, and vectorize() cannot normalise a document that has no terms.
original_documents = [line.strip() for line in content if line.strip()]
# Tokenized and stemmed form of every document (a list of terms per document),
# index-aligned with original_documents.
documents = [tokenize(d).split() for d in original_documents]

# Create the vocabulary: the sorted list of distinct terms that occur in the
# documents, with English stop words removed.
# The stop-word list is loaded once into a set. Calling stopwords.words('english')
# inside the comprehension would re-evaluate the call, and scan the resulting
# list linearly, for every candidate term.
english_stopwords = set(stopwords.words('english'))
vocabulary = set([item for sublist in documents for item in sublist])
vocabulary = sorted(word for word in vocabulary if word not in english_stopwords)

# compute IDF, storing idf values in a dictionary
def idf_values(vocabulary, documents):
    """Compute the inverse document frequency of every vocabulary term.

    For a term that occurs in ``df`` of the ``N`` documents the value is
    ``ln(N / df)`` (natural logarithm).

    Args:
        vocabulary: iterable of terms to compute IDF values for.
        documents: list of documents, each one a list of (hashable) terms.

    Returns:
        dict mapping each term to its IDF value. A term that occurs in no
        document gets 0.0 instead of raising ZeroDivisionError; such a term
        can never contribute to a document vector anyway.
    """
    num_documents = len(documents)
    # Build each document's term set once so the membership tests below are
    # constant-time instead of a list scan per (term, document) pair.
    document_term_sets = [set(document) for document in documents]
    idf = {}
    for term in vocabulary:
        document_frequency = sum(term in terms for terms in document_term_sets)
        if document_frequency == 0:
            idf[term] = 0.0
        else:
            idf[term] = math.log(num_documents / document_frequency)
    return idf

# Function to generate the vector for a document (with normalisation)
def vectorize(document, vocabulary, idf):
    """Build the TF-IDF vector of a document over the given vocabulary.

    The term frequency is normalised by the count of the most frequent term
    in the document, so component ``i`` is
    ``idf[vocabulary[i]] * count(vocabulary[i]) / max_count``.

    Args:
        document: list of terms (already tokenized/stemmed).
        vocabulary: ordered list of terms; defines the vector dimensions.
        idf: dict mapping every vocabulary term to its IDF value.

    Returns:
        A list with one weight per vocabulary term. An empty document yields
        an all-zero vector rather than raising IndexError.

    Raises:
        KeyError: if a vocabulary term is missing from ``idf``.
    """
    counts = Counter(document)
    if not counts:
        # Nothing to normalise by: an empty document has no term weights.
        return [0.0] * len(vocabulary)
    # Count of the most frequent term, including terms outside the vocabulary.
    max_count = max(counts.values())
    return [idf[term] * counts[term] / max_count for term in vocabulary]

# Build the IDF table once, then turn every tokenized document into its weighted vector
idf = idf_values(vocabulary, documents)
document_vectors = [
    vectorize(document_terms, vocabulary, idf)
    for document_terms in documents
]

# Function to compute cosine similarity
def cosine_similarity(v1,v2):
    """Return the cosine of the angle between two equal-length vectors.

    Args:
        v1: sequence of numbers; its length drives the iteration.
        v2: sequence of numbers, indexed at the same positions as ``v1``.

    Returns:
        ``dot(v1, v2) / sqrt(|v1|^2 * |v2|^2)``, or 0 when the dot product is
        zero (which also covers an all-zero vector, so no division by zero).
    """
    dot = 0
    norm1_squared = 0
    norm2_squared = 0
    for idx in range(len(v1)):
        a, b = v1[idx], v2[idx]
        dot += a*b
        norm1_squared += a*a
        norm2_squared += b*b
    # Orthogonal (or empty/zero) vectors: report 0 without dividing
    if dot == 0:
        return 0
    return dot/math.sqrt(norm1_squared*norm2_squared)

# computing the search result
def search_vec(query, k):
    """Print the documents most similar to a free-text query.

    The query goes through the same preprocessing as the documents
    (``tokenize``: punctuation removal, tokenization, lower-casing, stemming),
    is vectorized over the shared vocabulary, and is compared with every
    document vector using cosine similarity.

    Prints the full ``[score, document_index]`` list sorted by decreasing
    score, followed by the original text of the top ``k`` documents. If ``k``
    is larger than the number of documents, all documents are printed.

    Args:
        query: the search string.
        k: number of top-ranked documents to print.
    """
    # Preprocess the query exactly like the corpus so its terms match the vocabulary.
    query_terms = tokenize(query).split()
    if query_terms:
        query_vector = vectorize(query_terms, vocabulary, idf)
    else:
        # A query with no terms left after preprocessing matches nothing:
        # use a zero vector so every document scores 0.
        query_vector = [0] * len(vocabulary)
    scores = [[cosine_similarity(query_vector, document_vectors[d]), d] for d in range(len(documents))]
    # Stable sort: documents with equal scores keep their original order.
    scores.sort(key=lambda x: -x[0])
    print(scores)
    # Slicing (instead of indexing range(k)) tolerates k > number of documents.
    for _, doc_index in scores[:max(k, 0)]:
        print(original_documents[doc_index])

# HINTS

# Natural logarithm
#     math.log(n, math.e)    (equivalent to math.log(n))
# Count the term frequencies in a document
#     Counter(document)
# Most common element of a Counter, returned as a list of (element, count) pairs
#     counts.most_common(1)

### Compare with scikit-learn result

In [7]:
# Run the hand-written vector-space search: prints the sorted [score, index] list
# followed by the top 5 documents for the query 'baking'.
search_vec('baking',5)

[[0.6204317403802526, 0], [0.2600896774071498, 3], [0, 1], [0, 2], [0, 4]]
How to Bake Breads Without Baking Recipes
Breads, Pastries, Pies, and Cakes: Quantity Baking Recipes
Smith Pies: Best Pies in London
Numerical Recipes: The Art of Scientific Computing
Pastry: A Book of Best French Pastry Recipes


In [11]:
# Reference code using scikit-learn
# Fit a unigram TF-IDF model on the raw document strings (original_documents,
# not the stemmed token lists), using scikit-learn's built-in English stop-word list.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
features = tf.fit_transform(original_documents)
npm_tfidf = features.todense()  # dense copy of the TF-IDF matrix; not used in this cell
# Project the query into the TF-IDF space fitted on the documents.
# NOTE(review): this query is 'computer science', while the hand-written search
# is run with 'baking'. Use the same query in both cells to compare the rankings.
new_features = tf.transform(['computer science'])

# linear_kernel computes plain dot products; these equal cosine similarities only
# because TfidfVectorizer normalises rows by default (norm='l2' per the scikit-learn docs).
cosine_similarities = linear_kernel(new_features, features).flatten()
# Document indices ordered from most to least similar.
# NOTE(review): the recorded output lists the documents in exact reverse index order,
# which suggests every similarity is tied (presumably 0 because no query term is in
# the fitted vocabulary). Confirm by printing cosine_similarities.
related_docs_indices = cosine_similarities.argsort()[::-1]
topk = 5
# Guard against asking for more results than there are documents.
assert topk <= len(related_docs_indices)
for i in range(topk):
    print(original_documents[related_docs_indices[i]])

Pastry: A Book of Best French Pastry Recipes
Breads, Pastries, Pies, and Cakes: Quantity Baking Recipes
Numerical Recipes: The Art of Scientific Computing
Smith Pies: Best Pies in London
How to Bake Breads Without Baking Recipes
