># Collect data from Zaubacorp

In [None]:
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import tqdm_notebook
from selenium.webdriver.common.by import By
from IPython.display import display
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
import sys
import re

In [None]:
# chromium
# Build the option set passed to Chrome when the driver is started.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)
# Put the chromedriver location at the front of the module search path.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')

In [None]:
# Zaubacorp login
# Address of the Zaubacorp login form.
url = 'https://www.zaubacorp.com/user/login?destination=node'
# Start Chrome with the options configured in `chrome_options`.
# NOTE(review): the first positional argument looks like the chromedriver
# executable name; its meaning depends on the installed Selenium version - confirm.
driver = webdriver.Chrome('chromedriver',options=chrome_options)
# Open the login page; the functions in this notebook read this module-level `driver`.
driver.get(url)

In [None]:
def login(cuser_text, pass_text, captcha_text):
    """Fill in the Zaubacorp login form and submit it.

    Args:
        cuser_text (string): username typed into the "edit-name" field
        pass_text (string): password typed into the "edit-pass" field
        captcha_text (string): captcha answer typed into the
            "edit-captcha-response" field
    """
    field_ids = ("edit-name", "edit-pass", "edit-captcha-response")
    field_texts = (cuser_text, pass_text, captcha_text)
    # Locate every input before typing, so a missing field fails before any
    # keystroke is sent.
    input_boxes = [driver.find_element(By.ID, field_id) for field_id in field_ids]
    for input_box, text in zip(input_boxes, field_texts):
        input_box.send_keys(text)
    submit_button = driver.find_element(By.ID, "edit-submit")
    submit_button.click()

In [None]:
# captcha location from webpage
t = driver.find_element(By.XPATH,"//div[@class='form-type-textfield form-item-captcha-response form-item form-group']")
# The pattern is a character class, so the element text is split on any run of
# the characters in "Math question *" (those letters, the space and '*').
# Empty pieces are filtered out, the last remaining piece is dropped and the
# rest is joined into one string.
# Presumably this leaves the arithmetic expression without its trailing '='
# - confirm against the live page.
variables = ''.join(list(filter(lambda x:len(x)!=0,re.split(r'[Math question *]*',t.text)))[:-1])
# ans -> captcha ans
# NOTE(review): eval() executes text scraped from a remote page as Python code;
# consider a restricted arithmetic parser instead.
ans = eval(variables)
# login
# NOTE(review): the credentials are hard-coded in the notebook; consider
# reading them from the environment or a prompt.
login('prince404','PRINCE@123',str(ans))

In [None]:
# Show the current URL before waiting.
print(driver.current_url)
# wait till it loads: poll document.readyState for up to 10 seconds
WebDriverWait(driver=driver, timeout=10).until(
    lambda x: x.execute_script('return document.readyState') == 'complete'
)
print(driver.current_url)
# Extra fixed pause; presumably gives the post-login redirect time to finish - confirm.
time.sleep(7)
print(driver.current_url)

In [None]:
def display_all(data):
    """Display every element of data in the notebook, without trimming.

    Each element is preceded by its index and rendered with IPython's
    ``display``.

    Args:
        data (sequence): indexable collection whose elements are displayed
    """
    total = len(data)
    for position in range(total):
        print(position)
        element = data[position]
        display(element)

In [None]:
def load_page(link):
    """Open a page in the shared webdriver and wait until it has loaded.

    After navigating, waits up to 10 seconds for ``document.readyState`` to
    become 'complete', prints the resulting URL and pauses for 3 seconds.

    Args:
        link (string): page link
    """
    def document_is_ready(browser):
        state = browser.execute_script('return document.readyState')
        return state == 'complete'

    driver.get(link)
    waiter = WebDriverWait(driver=driver, timeout=10)
    waiter.until(document_is_ready)
    print(driver.current_url)
    time.sleep(3)

def get_emailetc(d_company):
    """Extract Email, Website and Address from the loaded page and remove noise.

    The text of the page body is cut after the first 'Email ID:' label and
    before 'Director Details'. The remaining non-empty lines are read by
    position: line 0 is the email, line 1 the website line and line 3 the
    address. A key is only written when a value is found; nothing is written
    when the 'Email ID:' label is missing or fewer than three lines remain.

    Args:
        d_company (dictionary): dictionary containing company details,
            updated in place
    """
    all_text = driver.find_element(By.XPATH, "/html/body").text
    sections = all_text.split('Email ID:')
    if len(sections) < 2:
        # No contact section on this page. The previous code went on to call
        # .split() on the list itself and raised AttributeError here.
        return
    contact_text = sections[1].split('Director Details')[0]
    # split to get other details, dropping empty lines
    lines = [line for line in contact_text.split('\n') if len(line) > 0]
    if len(lines) <= 2:
        return
    # check if its an email
    if '@' in lines[0]:
        d_company['Email'] = lines[0].strip()
    # update address in the dictionary when the page has that line
    if len(lines) > 3:
        d_company['Address'] = lines[3].strip()
    website_line = lines[1]
    # remove noise: a line containing the word 'Click' is not a website
    if 'Click' not in website_line.split(' '):
        # Split on the first ':' only, so a URL such as "http://..." keeps
        # everything after its scheme.
        _, separator, website = website_line.partition(':')
        if separator:
            d_company['Website'] = website.strip()

def basic_details(data,d_company):
    """Copy the key/value rows of a basic-details table into d_company.

    Rows whose value is '-', empty or NaN are ignored. For the "Activity"
    row the value is cleaned first: it is split on 'Click' and the first
    non-empty piece is kept, that piece is split on '[' and its first
    non-empty piece is kept, then trailing whitespace is stripped.

    Args:
        data (list): basic details in tabular format; only the first table
            is read
        d_company (dictionary): dictionary containing company details,
            updated in place
    """
    rows = np.array(data)[0]
    for row in rows:
        key, value = row[0], row[1]
        has_value = (value != '-') and (value != '') and (str(value) != 'nan')
        if not has_value:
            continue
        if key == "Activity":
            # Keep only the text in front of the "Click to view ..." noise.
            before_click = [piece for piece in value.split('Click') if len(piece) != 0]
            if len(before_click) == 0:
                continue
            # Also drop the bracketed deep description.
            before_bracket = [piece for piece in before_click[0].split('[') if len(piece) != 0]
            if len(before_bracket) > 0:
                d_company[key] = before_bracket[0].rstrip()
        else:
            d_company[key] = value

def pre_data(data,d_company,val):
    """Store the first column of a table as a list under the key val.

    Used for the Previous CIN and Previous Names tables of zaubacorp.
    Nothing is stored when the table has no rows, or when its only row
    contains the word 'found' in its first cell (the "Not Found" row).

    Args:
        data (list): data from webpage converted in tabular format
        d_company (dictionary): dictionary containing company details,
            updated in place
        val (string): key
    """
    rows = np.array(data).tolist()[0]
    if len(rows) < 1:
        return
    # A lone row mentioning 'found' means there is no previous data.
    if len(rows) == 1 and 'found' in rows[0][0].split(' '):
        return
    d_company[val] = [row[0] for row in rows]

def establishments(data,d_company):
    """Handles Establishments owned by the Company.

    Stores a dictionary under d_company['Establishments'] that maps the row
    position to the details of that establishment. The columns are read by
    position as Establishment Name, City, Pincode and Address; cells that
    are empty, '-' or NaN are left out. Nothing is stored when the table has
    no rows, or when its only row contains the word 'found' in its first
    cell (the "not found" row).

    Args:
        data (list): data from webpage converted in tabular format
        d_company (dictionary): dictionary containing company details,
            updated in place
    """
    def has_value(cell):
        # NaN never compares equal to anything (not even float('nan')), so
        # missing cells are recognised through their text form instead.
        return str(cell) not in ('', '-', 'nan')

    rows = np.array(data).tolist()[0]
    # check if data is present
    if len(rows) < 1:
        return
    # str() keeps the check from failing when the first cell is not text.
    if len(rows) == 1 and 'found' in str(rows[0][0]).lower().split(' '):
        return
    field_names = ('Establishment Name', 'City', 'Pincode', 'Address')
    estab_dict = {}
    # update the value in dictionary as per location in the passed argument
    for position, row in enumerate(rows):
        estab_dict[position] = {
            name: cell for name, cell in zip(field_names, row) if has_value(cell)
        }
    d_company['Establishments'] = estab_dict

def charges(data,d_company):
    """Handles Charges lodged against the Company.

    Stores three keys in d_company: 'Charges' (details per charge, keyed by
    the first column of each row), 'total Charges/Borrowing Amount' (sum of
    all recognised amounts) and 'Number of Charges' (number of rows).
    Columns 1-6 are read by position as Creation Date, Modification Date,
    Closure Date, Assets Under Charge, Amount and Charge Holder; cells that
    are empty, '-' or NaN are left out. Amounts are stored as integers.
    Nothing is stored when the table has no rows, or when its only row
    contains the word 'found' in its first cell.

    Args:
        data (list): data from webpage converted in tabular format
        d_company (dictionary): dictionary containing company details,
            updated in place
    """
    def has_value(cell):
        # Missing cells arrive as '', '-' or float NaN (text form 'nan').
        return str(cell) not in ('', '-', 'nan')

    def parse_amount(cell):
        # Return the cell as a non-negative integer, or None if it is not one.
        text = str(cell).strip()
        if text.isdecimal():
            return int(text)
        # A column that contains NaN is read as floats, so 5000000 shows up
        # as 5000000.0; accept such whole-number floats as well.
        try:
            number = float(text)
        except ValueError:
            return None
        if number.is_integer() and number >= 0:
            return int(number)
        return None

    rows = np.array(data).tolist()[0]
    # check if data is present
    if len(rows) < 1:
        return
    if len(rows) == 1 and 'found' in str(rows[0][0]).split(' '):
        return
    text_columns = (
        (1, 'Creation Date'),
        (2, 'Modification Date'),
        (3, 'Closure Date'),
        (4, 'Assets Under Charge'),
    )
    # charges dictionary
    c_d = {}
    # holds total amount under charges
    amount = 0
    # update the value in dictionary as per location in the passed argument
    for row in rows:
        t_d = {
            label: row[column]
            for column, label in text_columns
            if has_value(row[column])
        }
        charge_amount = parse_amount(row[5])
        if charge_amount is not None:
            amount += charge_amount
            t_d['Amount'] = charge_amount
        if has_value(row[6]):
            t_d['Charge Holder'] = row[6]
        c_d[row[0]] = t_d
    d_company['Charges'] = c_d
    d_company['total Charges/Borrowing Amount'] = amount
    d_company['Number of Charges'] = len(rows)

def persecution(data,d_company):
    """Handles persecution against the Company.

    Stores 'Persecution' (details per row, keyed by row position) and
    'Number of Persecutions' (number of rows) in d_company. Columns 1-5 are
    read by position as Defaulting Entities, Court Name, Prosecution
    Section, Date Of Order and Status; cells that are empty, '-' or NaN are
    left out. Nothing is stored when the table has no rows, or when its only
    row contains the word 'found' in its first cell.

    Args:
        data (list): data from webpage converted in tabular format
        d_company (dictionary): dictionary containing company details,
            updated in place
    """
    def has_value(cell):
        # Missing cells arrive as '', '-' or float NaN (text form 'nan').
        return str(cell) not in ('', '-', 'nan')

    rows = np.array(data).tolist()[0]
    if len(rows) < 1:
        return
    # str() is needed because the first cell of a real row may be a number,
    # which has no .lower().
    if len(rows) == 1 and 'found' in str(rows[0][0]).lower().split(' '):
        return
    field_names = (
        'Defaulting Entities',
        'Court Name',
        'Prosecution Section',
        'Date Of Order',
        'Status',
    )
    # contains all persecution details with the row position as key
    p_d = {}
    for position, row in enumerate(rows):
        p_d[position] = {
            name: cell for name, cell in zip(field_names, row[1:6]) if has_value(cell)
        }
    d_company['Persecution'] = p_d
    d_company['Number of Persecutions'] = len(rows)

def cur_directors(data,d_company):
    """Handles current directors of the company.

    Stores a dictionary under d_company['Current Directors'], keyed by the
    first column of each row. Only rows whose first cell consists of
    decimal digits are used. Columns 1-3 are read by position as Name,
    Designation and Appointment Date; cells that are empty, '-' or NaN are
    left out. Nothing is stored when the table has no rows, or when its only
    row contains the word 'not' in its first cell.

    Args:
        data (list): data from webpage converted in tabular format; only
            the first table is read
        d_company (dictionary): dictionary containing company details,
            updated in place
    """
    def has_value(cell):
        # Missing cells arrive as '', '-' or float NaN (text form 'nan').
        return str(cell) not in ('', '-', 'nan')

    rows = np.array(data[0]).tolist()
    if len(rows) < 1:
        return
    # str() is needed because the first cell of a real director row may be
    # a number, which has no .lower().
    if len(rows) == 1 and 'not' in str(rows[0][0]).lower().split(' '):
        return
    field_names = ('Name', 'Designation', 'Appointment Date')
    d_d = {}
    for row in rows:
        # Rows whose first cell is not all digits are skipped.
        if not str(row[0]).isdecimal():
            continue
        d_d[row[0]] = {
            name: cell for name, cell in zip(field_names, row[1:4]) if has_value(cell)
        }
    d_company['Current Directors'] = d_d

In [None]:
def fetch_data(link):
    """Main function for calling other subordinate functions.

    Loads the company page, parses every <table> on it and passes the
    tables to the extractor functions by position. Tables 0, 3, 4, 5 and 6
    feed basic_details and are mandatory; the email/website/address block is
    read from the page text. Previous names/CIN (tables 1 and 2),
    establishments, charges and persecution (last three tables) and current
    directors (table 7) are optional. Each optional section is extracted
    independently, so a failure in one is reported and does not skip the
    others.

    Args:
        link (string): link of company details on zaubacorp

    Returns:
        dictionary: company details, or None when the page has fewer than
        five tables (company not found / page not fully loaded)

    Raises:
        IndexError: when the page has five or six tables, because tables 5
            and 6 are read unconditionally
    """
    from io import StringIO

    def read_table(index):
        # pandas deprecates passing literal HTML to read_html; a file-like
        # wrapper is accepted by old and new versions alike.
        return pd.read_html(StringIO(str(tables[index])), header=None)

    # call load page to load the current company page
    load_page(link)
    # parse the html page and find all table instances in it
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    tables = soup.find_all('table')
    # Check if company is present/Page is loaded fully
    if len(tables) < 5:
        print("Company Not Found!")
        return
    # dictionary containing all the data of the company
    d_company = {}
    for index in (0, 3, 4, 5, 6):
        basic_details(read_table(index), d_company)
    get_emailetc(d_company)
    # (label, table index, extractor, extra positional arguments)
    optional_sections = (
        ('Previous Names', 1, pre_data, ('Previous Names',)),
        ('Previous CIN', 2, pre_data, ('Previous CIN',)),
        ('Establishments', -1, establishments, ()),
        ('Charges', -2, charges, ()),
        ('Persecution', -3, persecution, ()),
        ('Current Directors', 7, cur_directors, ()),
    )
    for label, index, extractor, extra_args in optional_sections:
        # Best effort: a malformed or missing optional table must not cost
        # the sections that follow it, and should leave a trace.
        try:
            extractor(read_table(index), d_company, *extra_args)
        except Exception as error:
            print(f"Skipped {label} for {link}: {error!r}")
    return d_company

In [None]:
# load links
# The JSON file is expected to map a company key to an object with a 'link' entry
# (that is how company_links is indexed in the loop below).
with open('./temps/missing_links.json','r+') as jsonfp:
    company_links = json.load(jsonfp)

print("Length of Data :",len(company_links))
# Scraped details per company key; the value is None when fetch_data
# reports "Company Not Found!".
company_data = {}
# NOTE(review): count is initialised but not used in this cell.
count = 0 
# Loop for iterating over all the companies in loaded data
with tqdm_notebook(total=len(company_links)) as pbar:
    for i in company_links:
        try:
            json_data = fetch_data(company_links[i]['link'])
            company_data[i] = json_data
        except Exception as e:
            # A failing company is reported and skipped so the run continues.
            print(e)
        # Advance the progress bar whether or not the fetch succeeded.
        pbar.update()


In [None]:
# dump the data
# The with-block closes the file even if json.dump raises part-way through.
with open("./missing_links.json", "w+") as file:
    json.dump(company_data, file, indent=4)