In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from scipy.spatial.distance import cosine
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import euclidean, cityblock

In [2]:
# Load the cleaned bills data from the pickle in the data_cleaning directory.
# `df_cleaned` and `bills` are two names bound to the same object (no copy).
# NOTE(review): unpickling can execute arbitrary code - only load files
# from a trusted source.
df_cleaned = pd.read_pickle('../data_cleaning/clean_data.pkl')
bills = df_cleaned

### Vectorization and IDF scaling

In [3]:
# Build the TF-IDF representation of the bill titles.
#   token_pattern=r'[a-z]+' : a token is any run of the letters a-z
#   stop_words='english'    : drop scikit-learn's built-in English stop words
#   min_df / max_df         : ignore terms whose document frequency is below
#                             1% or above 99% of the titles
tfidf = TfidfVectorizer(token_pattern=r'[a-z]+', stop_words='english', min_df=0.01, max_df=0.99)

# fit_transform learns the vocabulary and IDF weights and vectorizes the same
# titles in one pass (documented as equivalent to fit followed by transform).
# The sparse result is converted to a dense array; rows follow the row order
# of bills['combined_title'].
X_idf = tfidf.fit_transform(bills['combined_title']).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() (available since 1.0) is its replacement. It returns
# an ndarray, so wrap it in list() to keep the list type this name used to hold.
X_feat_names = list(tfidf.get_feature_names_out())

In [4]:
def nearest_k(query, objects, k, dist):
    """Find the `k` objects closest to `query` under the metric `dist`.

    Parameters
    ----------
    query : ndarray
        The query, expressed in the same vector representation as each entry
        of `objects`.
    objects : ndarray
        Database of vector-represented objects, one object per row and one
        feature per column.
    k : int
        How many of the closest objects to report.
    dist : function
        Callable taking two ndarrays and returning the distance between them.

    Returns
    -------
    ndarray
        Positions (row indices into `objects`) of the closest objects, ordered
        from smallest to largest distance. It holds fewer than `k` entries
        when `objects` has fewer than `k` rows.
    """
    # Distance from the query to every stored object, in row order.
    distances = []
    for candidate in objects:
        distances.append(dist(query, candidate))

    # argsort yields row positions ordered by ascending distance.
    ranking = np.argsort(distances)
    return ranking[:k]


In [8]:
## sample querying

# Search text; kept in a one-element list because it is passed straight to
# tfidf.transform, and row [0] of the result is the single query vector.
query = ['education']

# Number of closest bills to retrieve.
k = 5

# Vectorize the query with the fitted TF-IDF model, then rank every row of
# X_idf by Euclidean distance to it. `search_results` holds the positional
# row indices of the k closest rows, nearest first.
search_results = nearest_k(
    tfidf.transform(query).toarray()[0],
    X_idf,
    k,
    euclidean,
)

In [9]:
# Print the titles of the retrieved bills, best match first.
print(f'Top {k} search results:\n\n')

# search_results holds positional indices, hence .iloc. enumerate(start=1)
# supplies the 1-based rank, so no hand-maintained counter is needed and the
# notebook-global `i` is no longer overwritten. Iterating the Series directly
# avoids the intermediate list copy.
for rank, title in enumerate(bills['combined_title'].iloc[search_results], start=1):
    print(f'Top {rank}\n', title, '\n')
    

Top 5 search results:


Top 1
 independent study distance education act act require independent evaluation distance education programs” 

Top 2
 environmental awareness education act 2007 

Top 3
 ladderized education act 2014 

Top 4
 open learning distance education act 2014 

Top 5
 techprep education act act providing technical preparation education 

