># Wikipedia data collection

In [None]:
# from google.colab import drive
# drive.mount('/gdrive')
# %cd /gdrive

In [3]:
# !apt-get update # to update ubuntu to correctly run apt install
# !pip install fuzzywuzzy
# !pip install wikipedia

In [1]:
from tqdm.notebook import tqdm_notebook
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np
import wikipedia
import requests
import json
import time
import re
import os

In [2]:
# Load the JSON file that maps each company's CIN to its record.
#
# Data Format :
# {
#    "CIN": {
#        "Company": "Company Name",
#        "RoC": "RoC Location",
#        "Status": "Active/Strike Off",
#        "link": "https://company_link_on_zaubacorp.com"
#    }
# }

# The file is only read, so open it read-only ("r+" would also demand write
# permission and fail on read-only mounts) and pin the encoding so the result
# does not depend on the platform default.
with open("./company_link.json", "r", encoding="utf-8") as link:
    company_links = json.load(link)
print(len(company_links))

141


In [3]:
# Stopwords: legal-form words (and fragments of them) that are dropped from
# names before two names are compared.
list_remove = [
    'private', 'limited', 'limit', 'ed', 'limite', 'd', 'llp', 'company',
    'pvt', 'ltd', 'lim', 'limi', 'ted', '(opc)', '(india)',
]
def scoring_sys(val1,val2):
    """Score how similar two tokenised names are, on a 0-100 scale.

    The score is the sum of three parts:
      * a token part: 70 * (number of token pairs whose ``fuzz.ratio`` is
        above 67) / (half the length of the longer token list),
      * a prefix part: derived from ``fuzz.ratio`` of the first (at most) 7
        characters of each name once its tokens are concatenated,
      * a flat bonus of 5 when both token lists have the same length.

    Args:
        val1 (list of string): Tokens of the company name
        val2 (list of string): Tokens of a Wikipedia search result

    Returns:
        Float : Score, capped at 100 (0 when either name has no characters)
    """
    if len(val1) <= 0 and len(val2) <= 0:
        return 0

    longest = max(len(val1), len(val2))
    joined_a = ''.join(val1)
    joined_b = ''.join(val2)

    # Only the first 7 characters of each concatenated name are compared.
    prefix_a = min(len(joined_a), 7)
    prefix_b = min(len(joined_b), 7)
    if prefix_a == 0 or prefix_b == 0:
        return 0

    prefix_ratio = fuzz.ratio(joined_a[0:prefix_a], joined_b[0:prefix_b])

    # Count every (token, token) pair that is a close fuzzy match.
    close_pairs = sum(
        1
        for left in val1
        for right in val2
        if fuzz.ratio(left, right) > 67
    )

    token_part = 70*(close_pairs/(longest/2))
    prefix_part = 25*(prefix_ratio)/(max(prefix_a,prefix_b)*100)

    score = 5 if len(val1) == len(val2) else 0
    score += prefix_part
    score += token_part
    return 100 if score > 100 else score

def return_correctname(company_name,results):
    """Return the search result that best matches the company name.

    Both the company name and every result are split into tokens on any
    character that is not an ASCII letter, a digit or a dot. Results are
    lower-cased and stripped of the words in the module-level ``list_remove``
    before being scored against the company name with ``scoring_sys``.
    The company name is used as given (it is not lower-cased here).

    Args:
        company_name (string): Company's Name
        results (list): Wikipedia Search Results

    Returns:
        string : The first result with the highest score, or '' when no
        result scores above 90 (including when ``results`` is empty)
    """
    # The character class is spelled a-zA-Z on purpose: the range "A-z" also
    # spans the punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would then
    # be kept inside tokens instead of acting as separators.
    separator = re.compile(r"[^a-zA-Z0-9.]")

    def _tokens(text):
        # Split text on separators and drop the empty pieces.
        return [piece for piece in separator.split(text) if len(piece) > 0]

    comp_split = _tokens(company_name)
    max_score = 0
    best_match = ''
    for candidate in results:
        words = [word for word in _tokens(candidate.lower()) if word not in list_remove]
        score = scoring_sys(comp_split, words)
        # Strict '>' keeps the earliest result when several share the top score.
        if score > max_score:
            max_score = score
            best_match = candidate
    if max_score <= 90:
        return ''
    return best_match

def basic_work(title,company_temp_dict,table_key="table_0"):
    """Fetch a Wikipedia page and copy its infobox into the company dictionary.

    The page is downloaded, its infobox table (if any) is located and handed
    to ``ret_table_dict``, which writes the parsed fields into
    ``company_temp_dict``. The page URL reported by the ``wikipedia`` package
    is stored under the key 'wikipedia page url'.

    Args:
        title (string): Wikipedia page title
        company_temp_dict (dictionary): dictionary with company details; updated in place
        table_key (string): key passed to ``ret_table_dict`` for infoboxes that
            are stored as a list of row records. It is an explicit parameter so
            the function does not depend on a loop variable that happens to
            exist in the notebook's global namespace.
    """
    page_url = 'https://en.wikipedia.org/wiki/' + title.replace(' ', '_')
    # A timeout keeps one unresponsive request from stalling the whole run.
    page = requests.get(page_url, timeout=30).text
    soup = BeautifulSoup(page, 'html.parser')
    # Match on the individual CSS classes: searching for the exact string
    # "infobox vcard" misses tables that carry additional classes between or
    # around those two.
    infobox = soup.select_one('table.infobox.vcard')
    company_temp_dict['wikipedia page url']=wikipedia.WikipediaPage(title=title).url
    if infobox is not None:
        try:
            ret_table_dict(infobox,company_temp_dict,table_key)
        except Exception as e:
            # Best effort: a malformed infobox should not abort the caller.
            print(e)

def ret_parent_table(data):
    """Rebuild a table as HTML text without any nested (child) tables.

    Every row that contains a nested table is dropped together with the rows
    of that nested table. In the remaining rows, list markup and line breaks
    are flattened: ``<ul>``, ``</ul>`` and ``<li>`` become a space, ``</li>``
    and ``<br/>`` become a comma.

    Args:
        data: parsed table element exposing ``find_all`` (e.g. a BeautifulSoup tag)

    Returns:
        string: the cleaned rows wrapped in a single ``<table>`` element
    """
    rows = data.find_all('tr')
    kept_rows = []
    i = 0
    while i < len(rows):
        row = rows[i]
        if row.find_all('table'):
            # `rows` lists nested rows right after the row that contains them,
            # so skip exactly that many entries. Counting the descendants is
            # reliable; counting "<tr>" substrings also counts the outer row
            # and misses rows that carry attributes.
            i += len(row.find_all('tr'))
        else:
            kept_rows.append(str(row))
        i += 1
    table = '<table>' + ''.join(kept_rows) + '</table>'
    for markup, replacement in (
        ("<ul>", " "),
        ("</ul>", " "),
        ("<li>", " "),
        ("</li>", ","),
        ("<br/>", ","),
    ):
        table = table.replace(markup, replacement)
    return table

def ret_table_dict(data,d_company,val):
    """Convert a raw table into dictionary entries after removing noise.

    The table is cleaned with ``ret_parent_table`` and parsed with
    ``pd.read_html``. Two layouts are handled:

      * two columns whose first header is blank/numeric (a key/value table):
        every row becomes ``d_company[first cell] = second cell``;
      * anything else: every row becomes a ``{column header: cell}`` record and
        the list of records is stored under ``d_company[val]``.

    Cell text is lower-cased, stripped, and cleared of citation markers such
    as ``[12]``. Nothing is written if any column header is longer than 25
    characters.

    Args:
        data: Raw table element (passed on to ``ret_parent_table``)
        d_company (dictionary): Company dictionary, updated in place
        val: key under which row records are stored for non key/value tables
    """
    # Local import: recent pandas versions expect literal HTML to be wrapped
    # in a file-like object rather than passed as a plain string.
    from io import StringIO

    def _clean(cell):
        # Cells may be numbers, so coerce to text before the regex runs.
        return re.sub(r"\[[0-9]+\]", '', str(cell)).strip().lower()

    def _blank(cell):
        # Missing cells show up as NaN, whose text form is 'nan'.
        return str(cell) in ('nan', '')

    try:
        table = ret_parent_table(data)
        frame = pd.read_html(StringIO(table))[0]
    except Exception as e:
        print(e)
        # Retry with upper-cased tags as a fallback parse attempt.
        html_part = re.sub(r'<.*?>', lambda g: g.group(0).upper(), ret_parent_table(data))
        frame = pd.read_html(StringIO(html_part))[0]
    table_columns = list(frame.columns)
    data_list = frame.values.tolist()

    # used[k] is True when column k has a real (non-blank, non-numeric) header.
    used = []
    for column in table_columns:
        label = str(column)
        if len(label) > 25:
            return
        used.append(not (label in ('', 'nan') or label.isnumeric()))

    key_value_layout = len(used) == 2 and not used[0]
    temp_list = []
    for row in data_list:
        if key_value_layout:
            key_cell, value_cell = row[0], row[1]
            # Skip incomplete rows and rows where one cell spans both columns.
            if _blank(key_cell) or _blank(value_cell) or str(key_cell) == str(value_cell):
                continue
            d_company[_clean(key_cell)] = _clean(value_cell)
        else:
            record = {}
            for column, cell in zip(table_columns, row):
                if _blank(cell):
                    continue
                # Key each value by its column header (numeric headers stay ints).
                key = int(column) if str(column).isnumeric() else _clean(column)
                record[key] = _clean(cell)
            temp_list.append(record)
    if len(temp_list) > 0:
        d_company[val] = temp_list

In [4]:
#
# Main loop: look every company up on Wikipedia and keep the Hyderabad ones
#

# Collected results, keyed by CIN
temp_dict = {}
findings_dict = {}
# Stopword list used while normalising names
list_remove = ['private','limited','limit','ed','limite','d','llp','company','pvt','ltd','lim','limi','ted','(opc)','(india)','services']

with tqdm_notebook(total=len(company_links)) as pbar:
    # Keep the loop variable named `i`: it stays visible as a notebook global
    # and helper code in this notebook may read it.
    for i in company_links:
        try:
            raw_name = company_links[i]['Company']
            # Normalise the name for the matching algorithm: lower-case it and
            # drop stopwords and empty fragments.
            company_name = ' '.join(
                word
                for word in raw_name.lower().split(' ')
                if len(word) != 0 and word not in list_remove
            )
            # Search Wikipedia with the original, unmodified name
            results = wikipedia.search(raw_name)
            # Best matching title, or '' when nothing scores above 90
            final_res = return_correctname(company_name, results)
            if len(final_res) > 0:
                company_temp_dict = {
                    'name': raw_name,
                    'wikipedia page': final_res
                }
                basic_work(final_res, company_temp_dict)
                # Keep the company only if its headquarters mention Hyderabad
                # or Telangana (companies registered with RoC Hyderabad are
                # expected to be headquartered there).
                if 'headquarters' in company_temp_dict:
                    hq_words = company_temp_dict['headquarters'].lower().replace(",", "").split(" ")
                    if 'hyderabad' in hq_words or 'telangana' in hq_words:
                        temp_dict[i] = company_temp_dict
        except Exception as e:
            print(e)
        pbar.update()

  0%|          | 0/141 [00:00<?, ?it/s]

In [5]:
# Dump the collected dictionary to disk. The `with` block closes the file
# even if serialisation fails part-way through.
with open("./check_result.json", "w", encoding="utf-8") as file:
    json.dump(temp_dict, file, indent=4)

0
