<a href="https://colab.research.google.com/github/Ilank1/BillsPrediction/blob/main/DataPreparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re, os, wget
from docx import Document

In [None]:
# StatusID value that is compared against each bill's StatusID to build the label.
passed_status = 118

# Maps a bill's SubTypeDesc (Hebrew) to the numeric InitiatorType stored in the output.
SubTypeOptions = {
    "ועדה": 0,
    "פרטית": 1,
    "ממשלתית": 2,
}

# Input tables (CSV dumps) and the directory that holds the downloaded bill documents.
documents_db_path = "KNS_DocumentBill.csv"
db_Bill_path = "KNS_Bill.csv"
documents_dir = "Bills"

Download all the bill documents from the governmental OData API:
http://knesset.gov.il/Odata/ParliamentInfo.svc/

We dumped the following tables to prepare the data:
- KNS_DocumentBill - the bill documents
- KNS_Bill - the bills' info

In [None]:
# Load the KNS_DocumentBill dump; 'utf-8-sig' strips a leading BOM if the file has one.
# (The path variable is `documents_db_path`, as defined in the configuration cell.)
df_documents = pd.read_csv(documents_db_path, encoding='utf-8-sig')

In [None]:
# Make sure the target directory exists before anything is written into it.
os.makedirs(documents_dir, exist_ok=True)

for index, row in df_documents.iterrows():
    # Download only DOC files (doc or docx) and initial suggestion type of documents (type 1)
    if row["ApplicationDesc"] != "DOC" or row["GroupTypeID"] != 1:
        continue

    file_path = os.path.join(documents_dir, wget.filename_from_url(row["FilePath"]))

    # Skip files that already exist in the documents directory
    if os.path.isfile(file_path):
        continue

    # Save the file INTO documents_dir: without `out`, wget.download writes to the
    # current working directory, so the existence check above would never match and
    # later lookups under documents_dir would not find the file.
    file_name = wget.download(row["FilePath"], out=file_path)
    print(" : " + file_name)

print("Finished download")

In [None]:
def get_bill_text(bill_id):
    """Return the explanation text of the first usable document of a bill.

    Looks up all rows of ``df_documents`` whose ``BillID`` equals ``bill_id`` and
    walks them in order. A row is usable when it is a DOC document
    (``ApplicationDesc == "DOC"``), of ``GroupTypeID`` 1, and its file is present
    under ``documents_dir``. Unusable rows are skipped instead of aborting the
    whole lookup, so a bill is not rejected merely because its first listed
    document is of another type.

    Args:
        bill_id: Value to match against the ``BillID`` column.

    Returns:
        The text returned by ``extract_bill_explanation`` for the first usable row.

    Raises:
        Exception: If the bill has no document rows at all, or if none of its
            rows is usable (the error describing the first rejected row is
            raised). Errors raised by ``extract_bill_explanation`` propagate.
    """
    bill_df = df_documents.loc[df_documents['BillID'] == bill_id]
    if len(bill_df) == 0:
        raise Exception("No documents found for bill id {}".format(bill_id))

    # Remember why the first row was rejected; it is reported if no row is usable.
    first_error = None
    for i, r in bill_df.iterrows():
        # Use only DOC documents, those are the only type of documents we've downloaded
        if r["ApplicationDesc"] != "DOC":
            error = Exception("Bill document type {} not supported, only DOC supported".format(r["ApplicationDesc"]))
        # Use only type 1 documents, those are the only type of documents we've downloaded
        elif r["GroupTypeID"] != 1:
            error = Exception("Bill GroupTypeID {} not supported, only initial suggesstions are allowed".format(r["GroupTypeID"]))
        else:
            file_path = os.path.join(documents_dir, wget.filename_from_url(r["FilePath"]))
            if os.path.isfile(file_path):
                return extract_bill_explanation(file_path)
            error = Exception("Bill document {} doesn't exists in the directory, web path {}".format(file_path, r["FilePath"]))

        if first_error is None:
            first_error = error

    # bill_df is non-empty, so at least one rejection was recorded.
    raise first_error
    

In [None]:
def extract_bill_explanation(file_path):
    """Extract the explanation section of a bill document.

    The section is every paragraph strictly between the single paragraph that
    contains the explanation headline (one of three spacing variants of the
    Hebrew headline) and the single paragraph that contains ``'---'``.

    Args:
        file_path: Path handed to ``Document`` to open the file.

    Returns:
        The section's paragraphs, each followed by ``"\\r\\n"``, concatenated into
        one string. Empty when no paragraph lies between the two markers
        (including when the end marker precedes the headline).

    Raises:
        Exception: If the headline or the end marker is missing, or appears in
            more than one paragraph.
    """
    paragraphs = Document(file_path).paragraphs

    headline_variants = ('דברי הסבר', 'דברי  הסבר', 'ד ב ר י   ה ס ב ר')
    start_ind = [i for i, para in enumerate(paragraphs)
                 if any(variant in para.text for variant in headline_variants)]
    if len(start_ind) == 0:
        raise Exception("No Explanataion paragraph headline found")
    if len(start_ind) > 1:
        raise Exception("More than one Explanataion paragraph headline found")
    start_ind = start_ind[0]

    end_ind = [i for i, para in enumerate(paragraphs) if '---' in para.text]
    if len(end_ind) == 0:
        # This is the END marker check; it must not report a missing headline.
        raise Exception("No Explanataion paragraph end marker found")
    if len(end_ind) > 1:
        raise Exception("More than one Explanataion paragraph end marker found")
    end_ind = end_ind[0]

    # Slice by position instead of testing the indices for truthiness: index 0 is
    # a valid position (headline in the very first paragraph) and must not be
    # treated as "not found".
    return "".join(para.text + "\r\n" for para in paragraphs[start_ind + 1:end_ind])

In [None]:
df_bills = pd.read_csv(db_Bill_path, encoding='utf-8-sig')

output_columns = ['label', 'data', 'InitiatorType', 'cont_bit', 'bill_id']

# Collect one record per successfully processed bill and build the frame once at
# the end; appending to a DataFrame inside the loop is quadratic and
# DataFrame.append no longer exists in pandas 2.x.
bill_records = []

for index, row in df_bills.iterrows():
    try:
        bill_records.append({
            # Label is whether the bill's StatusID equals passed_status
            'label': row["StatusID"] == passed_status,
            # The bill id must be passed; get_bill_text has no default argument
            'data': get_bill_text(row["BillID"]),
            'InitiatorType': SubTypeOptions[row["SubTypeDesc"]],
            'cont_bit': row["IsContinuationBill"],
            'bill_id': row["BillID"],
        })
    except Exception as e:
        # Best effort: report the bill that could not be processed and carry on
        print(e)
        print("Failed bill id: ", row["BillID"])

outout_df = pd.DataFrame(bill_records, columns=output_columns)

# Store boolean values as 0/1
outout_df = outout_df.replace({False: 0, True: 1})

outout_df.to_excel(r"output.xlsx")

print("Finish")

Final stats regarding the data: 

In [None]:
all_bills = r"output.xlsx"
df_all_bills = pd.read_excel(all_bills)


def print_bill_stats(df):
    """Print continuation-bit and initiator-type counts for the bills in ``df``.

    ``df`` must have the columns written to output.xlsx: ``cont_bit`` (0/1) and
    ``InitiatorType`` (the numeric codes from ``SubTypeOptions``).
    """
    print("Continuation bit on ", len(df.loc[df["cont_bit"] == 1]))
    print("Continuation bit off ", len(df.loc[df["cont_bit"] == 0]))
    print("Committee bills: ", len(df.loc[df["InitiatorType"] == SubTypeOptions["ועדה"]]))
    print("Govermental bills: ", len(df.loc[df["InitiatorType"] == SubTypeOptions["ממשלתית"]]))
    print("Private bills: ", len(df.loc[df["InitiatorType"] == SubTypeOptions["פרטית"]]))


total = len(df_all_bills)
print("Total: ", total)

print("All extracted bills: ", len(df_all_bills))
print_bill_stats(df_all_bills)

# output.xlsx has no StatusID column; the passed flag is stored in `label` as 0/1.
df_passed_bills = df_all_bills.loc[df_all_bills["label"] == 1]
print("Passed: ", len(df_passed_bills))
print_bill_stats(df_passed_bills)