# Natural Language Processing: information retrieval

This exercise works through a basic text-analytics problem. Given a list of biographies of famous jazz players, we will build a system that takes a text query and returns the person most relevant to that query.

#### Import libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
#nltk.download()  # Download text data sets, including stop words
#nltk.data.path.append('/home/[USER]/nltk_data/') # set another directory for nltk_data
from nltk.corpus import stopwords  # Import the stop word list
from nltk.stem.snowball import SnowballStemmer

#### Import data

We now import the biographies of jazz musicians found on Wikipedia. We manually removed the citation markers [] since we thought they did not add anything important to the text.

In [2]:
# Load one biography per line into a two-column frame:
#   name - the text before the first '(' on the line (not stripped)
#   bio  - the whole line, stripped of surrounding whitespace
descriptions = "Data/descriptions.txt"

# Rows are accumulated in a plain list and the DataFrame is built once at the
# end: DataFrame.append() was removed in pandas 2.0, and appending inside the
# loop copies the whole frame on every iteration.
rows = []

with open(descriptions) as f:
    for i, line in enumerate(f):
        # Split at the first open round bracket '(': the left part is the
        # person's name, the right part is the remaining text of the line.
        # partition() yields an empty remainder instead of raising when the
        # line contains no '('.
        name, _, remainder = line.partition('(')

        if i == 0:
            # Preview of the first record only
            print('Row number:', i, ') Author:', name, '\n')
            print('Row number:', i, ') Bio:', remainder[0:500])

        rows.append([name, line.strip()])

data = pd.DataFrame(rows, columns=['name', 'bio'])

print(data.shape)

Row number: 0 ) Author: Charles "Charlie" Parker, Jr.  

Row number: 0 ) Bio: August 29, 1920 - March 12, 1955), also known as Yardbird and Bird, was an American jazz saxophonist and composer.Parker was a highly influential jazz soloist and a leading figure in the development of bebop a form of jazz characterized by fast tempos, virtuosic technique and advanced harmonies. Parker was a blazingly fast virtuoso, and he introduced revolutionary harmonic ideas including rapid passing chords, new variants of altered chords, and chord substitutions. His tone ranged from clean an
(16, 2)


#### Clean the data
Create the function to clean the raw data text. In this function we optionally remove all non-letters, stem the word after lower-case conversion, and remove stop-words.

In [3]:
def _english_stop_words():
    """Return the NLTK English stop words as a set, built on first use only."""
    # The set is memoized on the function object so that repeated calls to
    # review_to_wordlist() do not re-read the corpus and rebuild the set.
    cache = _english_stop_words.__dict__
    if "stops" not in cache:
        cache["stops"] = frozenset(stopwords.words("english"))
    return cache["stops"]


def review_to_wordlist(review, regex="[^a-zA-Z]", remove_stopwords=False, stemmer=None, verbose=False):
    """Convert a document into a list of lower-case words.

    Processing steps, in order:
      1. If ``regex`` is not None, every match of it is replaced by a space
         (the default pattern matches anything that is not an ASCII letter).
      2. The text is lower-cased and split on whitespace.
      3. If ``stemmer`` is not None, each word is replaced by
         ``stemmer.stem(word)``.
      4. If ``remove_stopwords`` is true, words found in the NLTK English
         stop-word list are dropped.

    Args:
        review: The raw text to process.
        regex: Pattern (string or compiled) passed to ``re.sub``; None skips
            the substitution step.
        remove_stopwords: Whether to drop English stop words.
        stemmer: Object exposing a ``stem(word)`` method, or None to skip
            stemming.
        verbose: When true, print the first 100 characters/words after each
            step (the stage headers are printed even for skipped steps).

    Returns:
        The list of processed words.
    """
    # 1. Replace everything matched by the pattern with a space
    if regex is not None:
        review = re.sub(regex, " ", review)

    if verbose:
        print('\n* RAW BIO: ')
        print(review[0:100])

    # 2. Convert to lower case and split into words
    words = review.lower().split()

    if verbose:
        print('\n* LOWER CASE: ')
        print(words[0:100])

    # 3. Stemming
    if stemmer is not None:
        words = [stemmer.stem(w) for w in words]

    if verbose:
        print('\n * STEMMED WORDS: ')
        print(words[0:100])

    # 4. Optionally remove stop words (off by default). The stop words are
    # held in a set because membership tests on a set are much faster than
    # on a list, and the set itself is built only once across calls.
    if remove_stopwords:
        stops = _english_stop_words()
        words = [w for w in words if w not in stops]

    if verbose:
        print('\n * STOP WORDS REMOVED: ')
        print(words[0:100])

    # 5. Return the list of words
    return words

Call the function to clean the data

In [4]:
# Pattern matching every non-letter character. To retain only letters, pass
# it as the 'regex' argument of review_to_wordlist().
letters_only = re.compile("[^a-zA-Z]")

# English Snowball stemmer used for both the corpus and the queries
snowball_stemmer = SnowballStemmer("english", ignore_stopwords=True)

# Turn each biography into a single cleaned string: tokenize, stem and drop
# stop words (punctuation is kept because regex=None), then re-join the
# tokens with spaces. verbose=True prints the intermediate stages.
clean_documents = [
    " ".join(
        review_to_wordlist(
            bio,
            regex=None,
            remove_stopwords=True,
            stemmer=snowball_stemmer,
            verbose=True,
        )
    )
    for bio in data["bio"]
]


* RAW BIO: 
Charles "Charlie" Parker, Jr. (August 29, 1920 - March 12, 1955), also known as Yardbird and Bird, w

* LOWER CASE: 
['charles', '"charlie"', 'parker,', 'jr.', '(august', '29,', '1920', '-', 'march', '12,', '1955),', 'also', 'known', 'as', 'yardbird', 'and', 'bird,', 'was', 'an', 'american', 'jazz', 'saxophonist', 'and', 'composer.parker', 'was', 'a', 'highly', 'influential', 'jazz', 'soloist', 'and', 'a', 'leading', 'figure', 'in', 'the', 'development', 'of', 'bebop', 'a', 'form', 'of', 'jazz', 'characterized', 'by', 'fast', 'tempos,', 'virtuosic', 'technique', 'and', 'advanced', 'harmonies.', 'parker', 'was', 'a', 'blazingly', 'fast', 'virtuoso,', 'and', 'he', 'introduced', 'revolutionary', 'harmonic', 'ideas', 'including', 'rapid', 'passing', 'chords,', 'new', 'variants', 'of', 'altered', 'chords,', 'and', 'chord', 'substitutions.', 'his', 'tone', 'ranged', 'from', 'clean', 'and', 'penetrating', 'to', 'sweet', 'and', 'somber.', 'parker', 'acquired', 'the', 'nickname', '


 * STEMMED WORDS: 
['john', 'william', 'coltrane,', 'also', 'known', 'as', '"trane"', '(septemb', '23,', '1926', '-', 'juli', '17,', '1967),', 'was', 'an', 'american', 'jazz', 'saxophonist', 'and', 'composer.', 'work', 'in', 'the', 'bebop', 'and', 'hard', 'bop', 'idiom', 'earli', 'in', 'his', 'career,', 'coltran', 'help', 'pioneer', 'the', 'use', 'of', 'mode', 'in', 'jazz', 'and', 'was', 'later', 'at', 'the', 'forefront', 'of', 'free', 'jazz.', 'he', 'led', 'at', 'least', 'fifti', 'record', 'session', 'during', 'his', 'career,', 'and', 'appear', 'as', 'a', 'sideman', 'on', 'mani', 'album', 'by', 'other', 'musicians,', 'includ', 'trumpet', 'mile', 'davi', 'and', 'pianist', 'theloni', 'monk.', 'as', 'his', 'career', 'progressed,', 'coltran', 'and', 'his', 'music', 'took', 'on', 'an', 'increas', 'spiritu', 'dimension.', 'coltran', 'influenc', 'innumer', 'musicians,', 'and', 'remain']

 * STOP WORDS REMOVED: 
['john', 'william', 'coltrane,', 'also', 'known', '"trane"', '(septemb', '23,', 

#### TF-IDF

After the bag-of-words model creation, we create a TF-IDF representation of words.

In [5]:
# TF-IDF vectorizer with L2-normalized rows. Per the scikit-learn docs, float
# values of max_df / min_df are read as proportions of documents: terms above
# max_df or below min_df document frequency are left out of the vocabulary.
tfidf_vectorizer = TfidfVectorizer(norm='l2', max_df=0.95, min_df=0.1)

# Learn the vocabulary from the cleaned biographies and vectorize them
train_data_features = tfidf_vectorizer.fit_transform(clean_documents)

# Dense array (one row per biography) that the queries are compared against
search_db = train_data_features.toarray()
print('Dataset shape:', search_db.shape)


Dataset shape: (16, 1049)


#### Test the method
The method converts a new text query using the same processing used for building the corpus. Then, the vector of the query is compared with the vector of each jazz musician via cosine similarity. The musician closest to the query is returned as the answer to the search query.

In [6]:
query = "Famous jazz saxophonist born in Kansas who played bebop and latin"

# Clean the query with the same settings used for the biographies, then
# project it onto the vocabulary already fitted on the corpus.
query_document = [
    " ".join(
        review_to_wordlist(
            query,
            regex=None,
            remove_stopwords=True,
            stemmer=snowball_stemmer,
        )
    )
]
query_tfidf = tfidf_vectorizer.transform(query_document).toarray()

# Cosine similarity between the query vector and every biography vector
similarities = cosine_similarity(query_tfidf, search_db)
# Indices in ascending similarity order (not used further in this cell)
sorted_indexes = similarities.argsort()
# The name at the position of the highest similarity is the answer
results = data["name"][similarities.argmax()]
print("Query:", query)
print("Answer:", results)

Query: Famous jazz saxophonist born in Kansas who played bebop and latin
Answer: Charles "Charlie" Parker, Jr. 


In [7]:
query = "Trumpeter who sang wonderful world"

# Clean the query with the same settings used for the biographies, then
# project it onto the vocabulary already fitted on the corpus.
query_document = [
    " ".join(
        review_to_wordlist(
            query,
            regex=None,
            remove_stopwords=True,
            stemmer=snowball_stemmer,
        )
    )
]
query_tfidf = tfidf_vectorizer.transform(query_document).toarray()

# Cosine similarity between the query vector and every biography vector
similarities = cosine_similarity(query_tfidf, search_db)
# Indices in ascending similarity order (not used further in this cell)
sorted_indexes = similarities.argsort()
# The name at the position of the highest similarity is the answer
results = data["name"][similarities.argmax()]
print("Query:", query)
print("Answer:", results)

Query: Trumpeter who sang wonderful world
Answer: Louis Armstrong 


In [8]:
query = "Saxophonist who made giant steps"

# Clean the query with the same settings used for the biographies, then
# project it onto the vocabulary already fitted on the corpus.
query_document = [
    " ".join(
        review_to_wordlist(
            query,
            regex=None,
            remove_stopwords=True,
            stemmer=snowball_stemmer,
        )
    )
]
query_tfidf = tfidf_vectorizer.transform(query_document).toarray()

# Cosine similarity between the query vector and every biography vector
similarities = cosine_similarity(query_tfidf, search_db)
# Indices in ascending similarity order (not used further in this cell)
sorted_indexes = similarities.argsort()
# The name at the position of the highest similarity is the answer
results = data["name"][similarities.argmax()]
print("Query:", query)
print("Answer:", results)

Query: Saxophonist who made giant steps
Answer: John William Coltrane, also known as "Trane" 


In [9]:
query = "Woman who sang Feeling Good"

# Clean the query with the same settings used for the biographies, then
# project it onto the vocabulary already fitted on the corpus.
query_document = [
    " ".join(
        review_to_wordlist(
            query,
            regex=None,
            remove_stopwords=True,
            stemmer=snowball_stemmer,
        )
    )
]
query_tfidf = tfidf_vectorizer.transform(query_document).toarray()

# Cosine similarity between the query vector and every biography vector
similarities = cosine_similarity(query_tfidf, search_db)
# Indices in ascending similarity order (not used further in this cell)
sorted_indexes = similarities.argsort()
# The name at the position of the highest similarity is the answer
results = data["name"][similarities.argmax()]
print("Query:", query)
print("Answer:", results)

Query: Woman who sang Feeling Good
Answer: Nina Simone 
