In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from scipy.spatial.distance import cosine
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import euclidean, cityblock
import nltk

In [63]:
# Load the bill table produced by the data-cleaning stage (the file name
# suggests cluster labels were already attached there -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
df_clean = pd.read_pickle('../data_cleaning/clusters_added.pkl')

In [66]:
# Restrict the search corpus to rows whose `congress` value is 18; the copy
# keeps later work on `filtered` from touching `df_clean`.
filtered = df_clean[df_clean['congress'] == 18].copy()

### Vectorization and IDF scaling

In [67]:
# Fit TF-IDF on the cleaned bill titles. English stop words are dropped, and
# min_df / max_df are given as floats (document-frequency proportions) to trim
# the rarest and the most ubiquitous terms.
tfidf = TfidfVectorizer(stop_words='english', min_df=0.001, max_df=0.999)

# Densify the matrix: nearest_k walks the rows one by one and hands each row
# to a distance function that takes two ndarrays.
X_idf = tfidf.fit_transform(filtered['combined_title']).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back to the old
# method on older installs; either way X_feat_names ends up a list of str.
if hasattr(tfidf, 'get_feature_names_out'):
    X_feat_names = tfidf.get_feature_names_out().tolist()
else:
    X_feat_names = tfidf.get_feature_names()

In [68]:
def nearest_k(query, objects, k, dist):
    """Rank database objects by distance to a query and keep the closest k.

    Parameters
    ----------
    query : ndarray
        The query, expressed in the same vector representation as the rows
        of `objects`.
    objects : ndarray
        Database of vectorised objects; one row per object, one column per
        feature.
    k : int
        How many of the closest objects to return.
    dist : function
        Callable taking two ndarrays and returning the distance between them.

    Returns
    -------
    ndarray
        Row indices into `objects`, ordered from smallest to largest
        distance, truncated to at most `k` entries.
    """
    # Distance from the query to every row, in row order.
    distances = []
    for candidate in objects:
        distances.append(dist(query, candidate))

    # argsort yields the row indices that would sort the distances ascending.
    ranking = np.argsort(distances)
    return ranking[:k]

In [69]:
def inverse_transform(text_query):
    """Clean a free-text query into a space-separated string of lemmas.

    Per the original author's note, these steps were copied from
    data_building_and_cleaning (credits to the rightful owner).

    Steps, in order:
      1. lowercase the text;
      2. delete every punctuation character except '-' and "'";
      3. split on whitespace and drop NLTK English stop words;
      4. lemmatize each remaining token with WordNetLemmatizer;
      5. join the tokens back together with single spaces.

    Parameters
    ----------
    text_query : str
        Raw search phrase.

    Returns
    -------
    str
        The cleaned phrase ('' when nothing survives the filtering).
    """
    import string
    from nltk.corpus import stopwords
    # Needs the WordNet corpus: nltk.download('wordnet')
    from nltk.stem import WordNetLemmatizer

    # Characters to delete: all ASCII punctuation minus hyphen and apostrophe.
    removable = string.punctuation.replace('-', '').replace("'", '')
    # The ' ' -> ' ' mapping is a no-op; only the deletion set has an effect.
    strip_table = str.maketrans(' ', ' ', removable)
    tokens = text_query.lower().translate(strip_table).split()

    english_stops = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = [wordnet.lemmatize(tok) for tok in tokens
              if tok not in english_stops]
    return " ".join(lemmas)

In [95]:
# Raw search phrase; it is cleaned with inverse_transform() before being
# vectorised for the nearest-neighbour lookup.
text_query = 'Health and Demography'

In [96]:
## sample querying

query = [inverse_transform(text_query)]

# no of results
k = 30
                              
print(query[0])
                              
search_results = nearest_k(tfidf.transform(query).toarray()[0], X_idf, k, euclidean)

health demography


In [97]:
# Lift the column-width limit so long bill titles are shown in full.
pd.set_option('display.max_colwidth', None)
print(f'Top {k} search results:\n\n')
# NOTE(review): `i` is not read anywhere in the visible notebook -- looks like
# a leftover; confirm nothing else depends on it before deleting.
i = 1
# search_results holds row positions into X_idf, which was built from
# `filtered` in row order, so positional .iloc lookup returns the matches.
# Kept as the last expression so the notebook renders the table.
filtered.iloc[search_results]

Top 30 search results:




Unnamed: 0,congress,type,bill_num,title,filed_on,filed_by,long_title,scope,subjects,primary_committee,combined_title,clustering_ward_a,clustering_ward_b,clustering_ward_c
15347,18,Senate,1443,PHILIPPINE HEALTH SECURITY ACT,2020-04-27,"[Cayetano, Pia S, ]",AN ACT PROVIDING FOR A NATIONAL HEALTH SECURITY AND APPROPRIATING FUNDS THEREFOR,National,Philippine Health SecurityCovid-19,Health and Demography,philippine health security act act providing national health security appropriating fund therefor,0,2,4
15416,18,Senate,1512,PHILIPPINE E-HEALTH SYSTEMS AND SERVICES ACT OF 2020,2020-05-04,"[Revilla Jr, , Ramon Bong]","AN ACT ESTABLISHING THE PHILIPPINE E-HEALTH SYSTEM IN THE DELIVERY OF HEALTH SERVICES WITH THE USE OF INFORMATION AND COMMUNICATIONS TECHNOLOGY IN THE PHILIPPINES, AND APPROPRIATING FUNDS THEREFOR",National,E-Health (Electronic Health)Health Care Delivery System,Health and Demography,philippine e-health system service act 2020 act establishing philippine e-health system delivery health service use information communication technology philippine appropriating fund therefor,0,1,2
14866,18,Senate,962,CREATING A DENTAL UNIT IN EVERY RURAL HEALTH UNIT,2019-08-27,"[Angara, Sonny]",AN ACT CREATING A DENTAL UNIT IN EVERY RURAL HEALTH UNIT UNDER THE DEPARTMENT OF HEALTH AS PART OF THE PRIMARY APPROACH IN THE DELIVERY OF HEALTH SERVICES AND PROVIDING FUNDS THEREFOR,National,Dental HealthRural DentistDepartment of Health (DOH),Health and Demography,act creating dental unit every rural health unit department health part primary approach delivery health service providing fund therefor,0,0,19
14220,18,Senate,316,BARANGAY HEALTH WORKERS AND SERVICES REFORM ACT OF 2019,2019-07-10,"[Poe, Grace]","AN ACT IMPROVING AND PROMOTING QUALITY DELIVERY OF HEALTH SERVICES BY BARANGAY HEALTH WORKERS IN THE BARANGAY LEVEL, PROVIDING FUNDS THEREFOR, AND FOR OTHER PURPOSES",National,Barangay Health WorkerHealth Services,Health and Demography,barangay health worker service reform act 2019 act improving promoting quality delivery health service barangay health worker barangay level providing fund therefor purpose,0,0,57
15340,18,Senate,1436,"MANDATORY PROTECTION OF HEALTH WORKERS, FRONTLINERS AND PATIENTS ACT",2020-04-16,"[Hontiveros, Risa]","AN ACT AMENDING REPUBLIC ACT NO. 11332, OTHERWISE KNOWN AS THE MANDATORY REPORTING OF NOTIFIABLE DISEASES AND HEALTH EVENTS OF PUBLIC HEALTH CONCERN ACT",National,Anti-DiscriminationHealthcare WorkersFrontlinersPatientsCovid-19,Health and Demography,mandatory protection health worker frontliners patient act act amending republic act 11332 otherwise known mandatory reporting notifiable disease health event public health concern act,0,0,19
15437,18,Senate,1533,MANDATORY REPORTING OF NOTIFIABLE DISEASES AND HEALTH OF PUBLIC HEALTH CONCERN,2020-05-18,"[Dela Rosa, Ronald ""Bato""]","AN ACT AMENDING REPUBLIC ACT NO. 11332, OTHERWISE KNOWN AS THE MANDATORY REPORTING OF NOTIFIABLE DISEASES AND HEALTH OF PUBLIC HEALTH CONCERN ACT, AND FOR OTHER PURPOSES",National,Health Care,Health and Demography,act amending republic act 11332 otherwise known mandatory reporting notifiable disease health public health concern act purpose,0,0,19
14069,18,Senate,165,BIBONG BHW ACT OF 2019,2019-07-02,"[Hontiveros, Risa]","AN ACT CREATING THE COMMUNITY HEALTH WORKER EDUCATION AND TRAINING PROGRAM, INCREASING COMPENSATION AND OTHER BENEFITS FOR BARANGAY HEALTH WORKERS (BHW), APPROPRIATING FUNDS THEREFOR AND FOR OTHER PURPOSES",National,Barangay Health WorkerCommunity Health WorkersBarangay Development Program,Health and Demography,bibong bhw act 2019 act creating community health worker education training program increasing compensation benefit barangay health worker bhw appropriating fund therefor purpose,0,0,57
14089,18,Senate,185,MAGNA CARTA FOR BARANGAY HEALTH WORKERS ACT OF 2019,2019-07-02,"[De Lima, Leila M, ]","AN ACT INSTITTUTING THE MAGNA CARTA FOR BARANGAY HEALTH WORKERS, REPEALING REPUBLIC ACT NO. 7883, OTHERWISE KNOWN AS THE BARANGAY HEALTH WORKER’S BENEFITS AND INCENTIVES ACT OF 1995, PROVIDING FUNDS THEREFOR, AND FOR OTHER PURPOSES",National,Barangay Health Worker,Health and Demography,magna carta barangay health worker act 2019 act instittuting magna carta barangay health worker repealing republic act 7883 otherwise known barangay health worker’s benefit incentive act 1995 providing fund therefor purpose,0,0,57
15375,18,Senate,1471,AMENDING R.A. NO. 11036 (THE MENTAL HEALTH ACT),2020-05-04,"[Angara, Sonny]","AN ACT AMENDING CERTAIN PROVISIONS OF REPUBLIC ACT NO. 11036, OTHERWISE KNOWN AS THE MENTAL HEALTH ACT",National,Mental Health,Health and Demography,amending ra 11036 mental health act act amending certain provision republic act 11036 otherwise known mental health act,0,0,19
15312,18,Senate,1408,HEALTH EMERGENCY LOCKDOWN ACT,2020-03-10,"[Tolentino, Francis ""Tol"" N, ]","AN ACT STRENGTHENING THE CAPACITY OF THE DEPARTMENT OF HEALTH IN THE DETECTION AND CONTAINMENT OF INFECTIOUS DISEASES, INCLUDING THE QUARANTINE OF INFECTED PERSONS AND LOCKDOWN OF INFECTED AREAS, AMENDING REPUBLIC ACT NO. 9271, AND FOR OTHER PURPOSES",National,Department of Health (DOH)Novel Coronavirus,Health and Demography,health emergency lockdown act act strengthening capacity department health detection containment infectious disease including quarantine infected person lockdown infected area amending republic act 9271 purpose,0,2,69


In [None]:
# improve query using filter
# Verify performance using AUC-PR of Scope and Primary Committee
# Search committee sample, AUC-PR
# Verify with 5 randomized search terms, manually. Compare from government website 
# Show power with 'minimum wage' manual




# congress filed_by scope filed_on primary_committee