In [1]:
import pandas as pd
import json
import bs4
import re
from tqdm import tqdm

In [2]:
# import bz2

# with bz2.open('../data_scraping/bills.json.bz2', 'r') as f:  
#     data = f.read()  
#     d = json.loads(data.decode("utf-8"))

In [3]:
# values = []

# for i in tqdm(list(d.values())):
#     dic = {}
#     soup = bs4.BeautifulSoup(i, 'html.parser')
#     content = soup.find('td', attrs = {'id':'content'}).text.strip()
#     try:
#         dic['congress'] = int(re.search('\d{2}(?=th Congress)',
#                                         content).group(0))
#         dic['type'] = re.search('(?<= Congress).+(?= Bill)', content).group(0)
#         bill_num = int(re.search('(?<=Bill No. )\d{1,}', content).group(0))
#         dic['bill_num'] = bill_num
#         dic['title'] = re.search(f'(?<=BillTi No. {bill_num}).+(?=Filed)',
#                                  content).group(0)
#         dic['filed_on'] = re.search('(?<=Filed on ).+(?= by)',
#                                     content).group(0)
#         dic['filed_by'] = re.search('(?<=by ).+', content).group(0)
#         dic['introduced_by'] = re.search('(?<=Introduced by ).+',
#                                          content).group(0)
#         dic['long_title'] = re.search('(?<=Long title).+', content).group(0)
#         dic['scope'] = re.search('(?<=Scope).+', content).group(0)
#         dic['subjects'] = re.search('(?<=Subject\(s\)).+', content).group(0)
#         dic['primary_committee'] = re.search('(?<=Primary committee).+',
#                                              content).group(0)
#         dic['secondary_committee'] = re.search('(?<=Secondary committee).+',
#                                                content).group(0)
#     except:
#         pass
#     values.append(dic)

In [4]:
df = pd.read_csv('bills.csv')

In [5]:
df.isna().sum()

Unnamed: 0                0
congress                  0
type                      0
bill_num                  0
title                     0
filed_on                  0
filed_by                  0
introduced_by           498
long_title              498
scope                   498
subjects                846
primary_committee       882
secondary_committee    5138
dtype: int64

In [6]:
df[df['introduced_by'].isna()].tail()

Unnamed: 0.1,Unnamed: 0,congress,type,bill_num,title,filed_on,filed_by,introduced_by,long_title,scope,subjects,primary_committee,secondary_committee
15444,15445,18,Senate,1541,AMENDING SECTION 3 OF RA 7797 (AN ACT TO LENGT...,"May 26, 2020","Sotto III, Vicente C., Villanueva, Joel, Tolen...",,,,,,
15465,15466,18,Senate,1562,RECRUITMENT QUOTA OF FEMALE PHILIPPINE NATIONA...,"May 28, 2020","Revilla Jr., Ramon Bong, De Lima, Leila M., De...",,,,,,
15466,15467,18,Senate,1563,"PNP, BFP, BJMP AND BUCOR HEIGHT EQUALITY ACT","May 28, 2020","Zubiri, Juan Miguel ""Migz"" F., Sotto III, Vice...",,,,,,
15467,15468,18,Senate,1564,BAYANIHAN TO RECOVER AS ONE ACT,"June 1, 2020","Marcos, Imee R., Angara, Sonny, Recto, Ralph G...",,,,,,
15485,15486,18,Senate,1582,SAFE PATHWAYS ACT,"June 3, 2020","Cayetano, Pia S., Tolentino, Francis ""Tol"" N.,...",,,,,,


Note: bill_num 1582 has a scope and long title but I think it is not really important.

In [7]:
df.dropna(subset=['filed_by'], inplace=True)

In [8]:
len(df)

15640

In [9]:
df['long_title'] = df['long_title'].fillna('')
df = df.fillna("N/A")

In [10]:
len(df)

15640

We will keep these bills and we can make Unknown as a catch all label for target variables. 

In [11]:
df.isna().sum()

Unnamed: 0             0
congress               0
type                   0
bill_num               0
title                  0
filed_on               0
filed_by               0
introduced_by          0
long_title             0
scope                  0
subjects               0
primary_committee      0
secondary_committee    0
dtype: int64

In [12]:
len(df)

15640

In [13]:
df.to_pickle('bills.pkl', protocol=3)

## Preprocessing Pipeline

1. Data Cleaning
    1. Categorical Features
        1. Normalize names of categories 
        2. Convert strings of sequences of words to list of categories.
    2. Text Features
        1. Combine title and long title
        2. Natural Language Processing
            1. Removing of punctiations
            2. Removing stop words
            3. Lemmatization
    3. Convert dates to datetime columns

3. TF-IDF Vectorization Text Features for Clustering

In [14]:
df = pd.read_pickle('bills.pkl')

In [15]:
cols_used = ['congress', 'bill_num', 'title', 'long_title', 'filed_on',
             'filed_by', 'introduced_by',  'primary_committee']
df = df[cols_used]

In [16]:
df.head()

Unnamed: 0,congress,bill_num,title,long_title,filed_on,filed_by,introduced_by,primary_committee
0,13,1,SPECIAL MEASURES TO LOWER THE COST OF MEDICINE,AN ACT PRESCRIBING SPECIAL MEASURES TO LOWER...,"June 30, 2004","Flavier, Juan M., Angara, Edgardo J.",Senator FLAVIER;,Health and Demography
1,13,2,THE CONSTITUTIONAL CONVENTION ACT,AN ACT CALLING FOR A CONSTITUTIONAL CONVENTIO...,"June 30, 2004","Flavier, Juan M.",Senator FLAVIER;,"Constitutional Amendments, Revision of Codes a..."
2,13,3,MAGNA CARTA OF PATIENT'S RIGHTS AND OBLIGATIONS,AN ACT DECLARING THE RIGHTS AND OBLIGATIONS O...,"June 30, 2004","Flavier, Juan M., Angara, Edgardo J.",Senator FLAVIER;,Health and Demography
3,13,4,HEALTH PROMOTION ACT OF 2004,AN ACT ESTABLISHING A NATIONAL HEALTH PROMOTI...,"June 30, 2004","Flavier, Juan M., Angara, Edgardo J.",Senator FLAVIER;,Government Corporations and Public Enterprises
4,13,5,INCREASING THE ALLOWABLE PERSONAL ADDITIONAL T...,AN ACT INCREASING THE ALLOWABLE PERSONAL ADDIT...,"June 30, 2004","Flavier, Juan M.",Senator FLAVIER;,Ways and Means


## Data Cleaning

In [17]:
df_clean = df.copy()

### Categorical Features

#### filed_by

In [18]:
# TODO: Still needs a lot of cleaning

df_clean['filed_by'] = df_clean['filed_by'].str.split('.')
df_clean['filed_by'].sample(5)

6697                            [Lapid, Manuel "Lito" M, ]
4597                                       [Villar, Manny]
15047                                 [De Lima, Leila M, ]
12412                       [Escudero, Francis "Chiz" G, ]
14468    [Binay, Maria Lourdes Nancy S, , De Lima, Leil...
Name: filed_by, dtype: object

#### introduced_ by

In [19]:
# TODO: Still needs a lot of cleaning
# less effort because column might not be used later

df_clean['introduced_by'] = df_clean['introduced_by'].str.replace(
    'Senator |Senators |;', '')

df_clean['introduced_by'] = df_clean['introduced_by'].str.split(r'and|,')

#### primary_committee

In [20]:
# No need for cleaning for now

# {'Amateur Sports Competitiveness': 'Games, Amusement and Sports',
#  'Climate Change':'Environment, Natural Resources and Climate Change', 
#  'Peace, Unification and Reconciliation': 'National Defense and Security, Peace, Unification and Reconciliation',
#  'Games and Amusement': 'Games, Amusement and Sports',
#  'Sports': 'Games, Amusement and Sports',
#  'Cooperatives': 'Cooperatives'
#  }

df_clean['primary_committee'].value_counts()

Health and Demography                                                   1495
Education, Arts and Culture                                             1479
Justice and Human Rights                                                1381
N/A                                                                      882
Labor, Employment and Human Resources Development                        804
Public Services                                                          668
Environment and Natural Resources                                        636
Local Government                                                         628
Ways and Means                                                           522
Agriculture and Food                                                     426
Civil Service and Government Reorganization                              415
Government Corporations and Public Enterprises                           409
Constitutional Amendments, Revision of Codes and Laws                    398

    2. Text Features
        1. Combine title and long title
        2. Natural Language Processing
            1. Removing of punctiations
            2. Removing stop words
            3. Lemmatization

### Text Features

#### Combine title and long title

In [39]:
df_clean['combined_title'] = df_clean["title"] + ' ' + df_clean['long_title']


df_clean['combined_title'] = df_clean['combined_title'].str.lower()

df_clean['combined_title'].head()

0    special measures to lower the cost of medicine...
1    the constitutional convention act an act calli...
2    magna carta of patient's rights and obligation...
3    health promotion act of 2004 an act establishi...
4    increasing the allowable personal additional t...
Name: combined_title, dtype: object

#### Natural Language Processing

##### Removing of punctuations

In [74]:
import string
punc = string.punctuation.replace('-', '').replace("'", '')
table = str.maketrans(' ', ' ', punc)
stripped = [w.translate(table) for w in df_clean['combined_title']]

##### Removing stop words

In [75]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zephy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [76]:
# filter out stop words
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

no_stop_words = []
for word_list in stripped:
    word_list = word_list.split()
    words = [w for w in word_list if not w in stop_words]
    no_stop_words += [words]

#### Lemmatization

In [77]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 

lematized = []
for word_list in no_stop_words:
    words = [lemmatizer.lemmatize(w) for w in word_list]
    lematized += [words]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zephy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [78]:
title_cleaned = []
for i in lematized:
    title_cleaned+=[" ".join(i)]

In [79]:
df_clean['combined_title'] = title_cleaned
df_clean['combined_title'].sample(5)

7003     philippine park miniparks development act act ...
13350    social medium awareness school university act ...
7448     plain language health insurance act act enhanc...
7251     std prevention act act instituting comprehensi...
15124    magna carta worker informal economy macwie act...
Name: combined_title, dtype: object

### Convert dates to datetime columns

In [80]:
df_clean['filed_on'] = pd.to_datetime(df_clean['filed_on'])
df_clean['filed_on'].head()

0   2004-06-30
1   2004-06-30
2   2004-06-30
3   2004-06-30
4   2004-06-30
Name: filed_on, dtype: datetime64[ns]

In [87]:
df_clean.to_pickle('clean_data.pkl', protocol=3)

## TF-IDF Vectorization Text Features for Clustering

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df_clean['combined_title'])
print(vectorizer.get_feature_names())
print(X.shape)

(15640, 1000)


## Clustering