In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from scipy.spatial.distance import cosine
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import euclidean, cityblock
import nltk

In [38]:
# Read the pickled DataFrame from the data_cleaning directory and bind it to
# both names (they refer to the same object, not to copies).
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
# NOTE(review): later cells read from a DataFrame called `bills`, which is not
# defined in any visible cell; presumably it is this same table -- confirm.
df_cleaned = pd.read_pickle('../data_cleaning/clusters_added.pkl')
df_clean = df_cleaned

### Vectorization and IDF scaling

In [33]:
# Fit a TF-IDF vectorizer on the `combined_title` column and build the dense
# document-term matrix that the search cells compare queries against.
# NOTE(review): `bills` is not defined in any visible cell; presumably it is the
# DataFrame loaded from clusters_added.pkl -- confirm before Restart & Run All.
# Float min_df / max_df are document-frequency proportions: terms appearing in
# fewer than 0.1% or more than 99.9% of the documents are dropped.
tfidf = TfidfVectorizer(stop_words='english', min_df=0.001, max_df=0.999)

# fit_transform learns the vocabulary / IDF weights and vectorizes in one pass.
# The result is densified because nearest_k hands individual rows to scipy
# distance functions, which expect 1-D dense arrays.
X_idf = tfidf.fit_transform(bills['combined_title']).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases; the
# result is kept as a plain list, as before.
if hasattr(tfidf, 'get_feature_names_out'):
    X_feat_names = list(tfidf.get_feature_names_out())
else:
    X_feat_names = tfidf.get_feature_names()

In [34]:
def nearest_k(query, objects, k, dist):
    """Return the indices of the `k` objects closest to `query`.

    Parameters
    ----------
    query : ndarray
        Query vector, in the same vector representation as the rows of
        `objects`.
    objects : ndarray
        Database of vector-represented objects; rows correspond to objects,
        columns correspond to features.
    k : int
        Number of most similar objects to return. If `k` exceeds the number
        of objects, all indices are returned.
    dist : function
        Accepts two ndarrays and returns their distance (smaller means more
        similar).

    Returns
    -------
    ndarray
        Indices into `objects`, ordered from smallest to largest distance.
        Objects at equal distance keep their original relative order.

    Raises
    ------
    ValueError
        If `k` is negative.
    """
    # A negative k would silently turn the slice below into "everything but
    # the last |k| results", which is never what a top-k search wants.
    if k is not None and k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    distances = [dist(query, obj) for obj in objects]

    # A stable sort makes tie-breaking deterministic (lowest index first);
    # the default sort kind gives no ordering guarantee for equal distances.
    return np.argsort(distances, kind='stable')[:k]

In [62]:
def inverse_transform(text_query):
    """Normalise a free-text query into a cleaned, space-joined token string.

    Adapted from the data_building_and_cleaning code (credit to its original
    author). Despite the name, nothing is inverted here: the text is cleaned
    in the forward direction.

    Steps, in order:
      1. lowercase the text;
      2. delete ASCII punctuation, keeping hyphens and apostrophes;
      3. split on whitespace and drop NLTK English stop words;
      4. lemmatize each remaining token with WordNet;
      5. join the tokens back together with single spaces.

    Parameters
    ----------
    text_query : str
        Raw query text.

    Returns
    -------
    str
        The cleaned query; an empty string if no tokens survive.
    """
    import string
    from nltk.corpus import stopwords
    # Requires the WordNet corpus: nltk.download('wordnet')
    from nltk.stem import WordNetLemmatizer

    # Every punctuation character except '-' and "'" is deleted outright.
    removable = string.punctuation.replace('-', '').replace("'", '')
    lowered = text_query.lower().translate(str.maketrans(' ', ' ', removable))

    english_stops = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    kept_tokens = [tok for tok in lowered.split() if tok not in english_stops]
    return " ".join(wordnet.lemmatize(tok) for tok in kept_tokens)

In [63]:
# Example free-text search query; it is cleaned and vectorized by the sample-query cell.
text_query = 'education coffee'

In [64]:
## sample querying

# inverse_transform already lemmatizes every token, so its output is used
# directly. The previous extra lemmatizer.lemmatize(...) wrapper referenced a
# `lemmatizer` object that only exists as a local inside inverse_transform,
# which raises NameError on a fresh kernel.
query = [inverse_transform(text_query)]

# no of results
k = 5

print(query[0])

# Vectorize the query with the fitted TF-IDF model (terms outside the fitted
# vocabulary contribute nothing), then rank every row of X_idf by Euclidean
# distance to it.
query_vec = tfidf.transform(query).toarray()[0]
search_results = nearest_k(query_vec, X_idf, k, euclidean)

education coffee


In [65]:
# Print the original titles of the retrieved bills, best match first.
# search_results holds positional row indices, hence .iloc.
print(f'Top {k} search results:\n\n')
top_titles = bills['title'].iloc[search_results].to_list()
# enumerate(start=1) supplies the 1-based rank without a hand-maintained counter
for rank, title in enumerate(top_titles, start=1):
    print(f'Top {rank}\n', title, '\n')
    

Top 5 search results:


Top 1
 PREPARATORY EDUCATION ACT 

Top 2
 PRESCHOOL EDUCATION ACT 

Top 3
 EDUCATION REVITALIZATION ACT 

Top 4
 PRESCHOOL EDUCATION ACT 

Top 5
 EDUCATION REVITALIZATION ACT OF 2007 

