In [1]:
!pip install dataset

Collecting dataset
  Downloading dataset-1.5.2-py2.py3-none-any.whl (18 kB)
Collecting banal>=1.0.1
  Downloading banal-1.0.6-py2.py3-none-any.whl (6.1 kB)
Installing collected packages: banal, dataset
Successfully installed banal-1.0.6 dataset-1.5.2


### Purpose :

Data science begins with data, and much of the world's data lives behind websites and APIs. This notebook shows the complete cycle from data gathering to analysis and conclusions. The emphasis is on data gathering using the BeautifulSoup and Requests libraries. As I went through the extraction process, an idea struck me: what will happen once Web 3.0 kicks in?

The goal is to explore the connections between the directors and board members of Indian companies. This gives information on the power centres and decision makers in Indian organisations. To begin with, we concentrate on the index stocks and then move on to more companies and their directors.

### Major Libraries Involved:

BeautifulSoup, Requests, NetworkX and Dataset libraries

### What to Expect:

The site is scraped with BeautifulSoup, and the company links are processed with helper functions. The process of identifying the data in the soup is outlined, for clarity, in the relevant cells. The majority of the challenge lies in getting the data and upserting it into the database.

Following that, all the board members and their companies are visualised as a graph using the NetworkX library.


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
import dataset
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
from joblib import Parallel, delayed

In [95]:
# Landing page that lists the stock-quote links of the companies to scrape
source_url = 'https://www.moneycontrol.com/india/stockpricequote/'

# SQLite database that holds the scraped companies and their management
db = dataset.connect('sqlite:///owners.db')

In [76]:
# Fetch the landing page once and parse it. Fail loudly on an HTTP error or a
# hung connection instead of silently parsing an error page.
session = requests.Session()
r = session.get(source_url, timeout=30)
r.raise_for_status()
html_soup = BeautifulSoup(r.text, 'html.parser')

In [96]:
# Collect the stock-quote link of every company listed in the quote table
quote_tables = html_soup.find_all(class_='pcq_tbl MT10')
if not quote_tables:
    raise RuntimeError("No 'pcq_tbl MT10' table found on {}; the page layout may have changed".format(source_url))
comp_link = quote_tables[0].find_all('td')
links = []
for company in comp_link:
    anchor = company.find('a')
    # A cell without an anchor (or an anchor without href) carries no company link;
    # calling .get() on the missing anchor would raise AttributeError
    if anchor is None or not anchor.get('href'):
        continue
    links.append(anchor.get('href'))

In [97]:
#Build the management-page URL for each scraped link and store it in the companies table
base_url = 'https://www.moneycontrol.com/company-facts/'
def full_link(base_url,url):
    """Derive a company's management-page URL from its quote URL and store it.

    The last two path segments of ``url`` are reused: the second-to-last is
    taken as the company name, the last as the trailing segment of the
    management URL ``<base_url><company>/management/<last>``.

    Parameters
    ----------
    base_url : str
        Prefix of the management-page URLs.
    url : str or None
        Link scraped from the landing page.

    Returns
    -------
    str or None
        The management-page URL that was stored, or ``None`` when ``url`` is
        missing or does not have two non-empty trailing path segments (nothing
        is written to the database in that case).
    """
    if not url:
        return None
    link_split = url.split('/')
    # Links such as 'javascript:;' or ones ending in '/' cannot be mapped to a company page
    if len(link_split) < 2 or not link_split[-1] or not link_split[-2]:
        return None
    company = link_split[-2]
    full_url = base_url+company+'/management/'+link_split[-1]
    # Upsert keyed on the company so re-running the cell does not duplicate rows
    db['companies'].upsert({'url': full_url,'entry_url': url,'company': company},['company'])
    return full_url
    
# Register every scraped quote link in the companies table
for quote_url in links:
    full_link(base_url, quote_url)

In [98]:
# Read the companies table back into a DataFrame and report how many rows it holds
company_link = pd.read_sql_table('companies', 'sqlite:///owners.db')
n_companies = len(company_link)
print('There are total {} companies in the database'.format(n_companies))

There are total 471 companies in the database


In [99]:
def get_data(company,link):
    """Scrape the management table(s) of one company into the ``mgmt`` table.

    Parameters
    ----------
    company
        Value stored in the ``company`` field of every inserted row.
    link
        URL of the company's management page.

    Notes
    -----
    Every ``table.commomtable`` on the page is read as a flat sequence of
    ``tr > td`` cells taken in consecutive pairs (name, designation). A
    trailing unpaired cell is ignored. One row per pair is inserted into
    ``db['mgmt']``. A page without such a table inserts nothing.
    """
    # A fresh session per company; the context manager closes its connections afterwards
    with requests.Session() as comp:
        comp.headers.update({'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Mobile Safari/537.36',
                               'Referer': 'https://www.moneycontrol.com/promo/mc_wap_interstitial_dfp.php?classic=true'})
        r = comp.get(link)
    comp_soup = BeautifulSoup(r.text, 'html.parser')
    for table in comp_soup.find_all('table',class_='commomtable'):
        # Run the selector once per table rather than twice per inserted row
        cells = table.select('tr > td')
        # Even positions hold the name, odd positions the designation
        for name_cell, role_cell in zip(cells[0::2], cells[1::2]):
            db['mgmt'].insert({'name': name_cell.get_text().strip(),
                               'designation': role_cell.get_text().strip(),
                               'company': company})

In [100]:
def fund_data(company,entry_link):
    """Scrape the overview figures of one company and upsert them into ``companies``.

    Parameters
    ----------
    company
        Company name; used as the upsert key, so the figures land on the
        row that already exists for this company.
    entry_link
        URL of the company's stock-quote page.

    Notes
    -----
    The figures are read by position from the ``li > div`` elements of the
    first ``overvw_mob`` block (indices 6, 11, 12, 14 and 15) and stored as the
    scraped text. When the block is missing or has fewer than 16 such elements,
    zeros are stored instead.
    """
    # A fresh session per company; the context manager closes its connections afterwards
    with requests.Session() as comp:
        comp.headers.update({'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Mobile Safari/537.36'})
        r = comp.get(entry_link)
    fund_soup = BeautifulSoup(r.text, 'html.parser')
    # find_all returns a (possibly empty) list, never None
    o_table = fund_soup.find_all(class_='overvw_mob')
    data = o_table[0].select('li > div') if o_table else []

    if len(data) < 16: # highest index read below is 15
        print('No Data Available from the webpage, so filling 0')
        row = {'company':company,
               'Market_Cap': float(0),
               'EPS': float(0),
               'PtoE': float(0),
               'Book_value': float(0),
               'PtoB': float(0)}
    else:
        # NOTE(review): these are stored as scraped text while the fallback stores floats
        row = {'company':company,
               'Market_Cap': data[6].get_text(),
               'EPS': data[11].get_text(),
               'PtoE': data[12].get_text(),
               'Book_value': data[14].get_text(),
               'PtoB': data[15].get_text()}
    # Key on the company in both cases; keying on 'id' (absent from the row)
    # would not match the existing row
    db['companies'].upsert(row,['company'])

In [101]:
from time import sleep
# Scrape the management table of every company.
# To throttle the requests, call sleep(...) inside the loops below
# (the earlier sleep(randint(30, 45)) hint needs `from random import randint`).
for company_name, mgmt_url in zip(company_link.company, company_link.url):
    get_data(company_name, mgmt_url)
print('Collecting Management data completed')

# Scrape the overview figures of every company
for company_name, quote_url in zip(company_link.company, company_link.entry_url):
    fund_data(company_name, quote_url)
print('Collecting Fundamental data completed')

Collecting Management data completed
Collecting Fundamental data completed


In [103]:
# Reload the companies table so the DataFrame includes the columns upserted by fund_data
company_link = pd.read_sql_table('companies', 'sqlite:///owners.db')
row_total = len(company_link)
print('There are total {} companies in the database'.format(row_total))

There are total 471 companies in the database


In [104]:
# Preview the first rows of the companies table, including the scraped figures
company_link.head()

Unnamed: 0,id,url,entry_url,company,Market_Cap,EPS,PtoE,Book_value,PtoB
0,1,https://www.moneycontrol.com/company-facts/3mi...,https://www.moneycontrol.com/india/stockpriceq...,3mindia,28185,198.99,125.73,1695.8,14.76
1,2,https://www.moneycontrol.com/company-facts/aar...,https://www.moneycontrol.com/india/stockpriceq...,aartidrugs,4653,22.79,22.05,98.58,5.1
2,3,https://www.moneycontrol.com/company-facts/aar...,https://www.moneycontrol.com/india/stockpriceq...,aartiindustries,36714,17.72,57.16,82.7,12.25
3,4,https://www.moneycontrol.com/company-facts/aav...,https://www.moneycontrol.com/india/stockpriceq...,aavasfinanciers,24579,41.03,75.89,304.22,10.24
4,5,https://www.moneycontrol.com/company-facts/adi...,https://www.moneycontrol.com/india/stockpriceq...,adityabirlacapital,29550,5.56,22.0,63.03,1.95


In [None]:
# Load the scraped management rows (name, designation, company) and preview the last few
management = pd.read_sql_table('mgmt','sqlite:///owners.db')
management.tail()

With the data on the directors and board members in hand, here are some questions that can be answered.

1) What is the maximum number of Board members a company can have?

2) Which board members are there in board of multiple companies?

3) How many types of board members are there?

4) Which companies have a certain type of board member?



In [None]:
# Number of mgmt rows per person, largest first, keeping only people with more than 2 rows
mul_dir = (
    management.groupby('name')['id']
    .count()
    .sort_values(ascending=False)
    .loc[lambda row_counts: row_counts.values > 2]
)

In [None]:
# Attach to every person in mul_dir the list of companies they appear in.
# (The previous version referenced the undefined name `mul_dirq` and overwrote a
# single Series entry on every loop iteration.)
companies_by_name = management.groupby('name')['company'].agg(
    lambda companies: sorted(set(companies)))
if isinstance(mul_dir, pd.Series): # keeps the cell re-runnable once mul_dir is a DataFrame
    mul_dir = mul_dir.to_frame(name='board_seats')
# Assignment aligns on the index (the person's name)
mul_dir['mul_companies'] = companies_by_name
mul_dir.head()

In [None]:
# Display the people that appear in multiple mgmt rows
mul_dir

In [None]:
import networkx
# Directed graph whose nodes are both the people and the companies in the mgmt table
G = networkx.DiGraph()

print('Building graph...')
for mgmt_row in db['mgmt'].all():
    # Person first, then company, for every row
    for field in ('name', 'company'):
        G.add_node(mgmt_row[field])

In [None]:
for mgmt_row in db['mgmt'].all():
    person, firm = mgmt_row['name'], mgmt_row['company']
    # Skip the row unless both endpoints are already nodes of the graph
    if not (G.has_node(person) and G.has_node(firm)):
        continue
    G.add_edge(person, firm)

In [None]:
# Number of distinct person -> company edges in the graph
G.number_of_edges()

In [None]:
# Drop nodes that have no edges; the isolates are materialised into a list
# first so the graph is not modified while it is being iterated
G.remove_nodes_from(list(networkx.isolates(G)))

In [None]:
import matplotlib.pyplot as plt
# Calculate node betweenness centrality as a measure of importance.
# Every edge points person -> company, so on the directed graph (almost) no
# shortest path passes *through* a node and the scores would all be 0.
# Measure betweenness on the undirected view instead.
print('Calculating betweenness...')
betweenness = networkx.betweenness_centrality(G.to_undirected(), endpoints=False)

print('Drawing graph...')

def squish(x):
    """Sigmoid that maps a centrality score into (0, 1) to make the colors (a little) more appealing."""
    return 1 / (1 + 0.5**(20*(x-0.1)))

colors = [(0, 0, squish(betweenness[n])) for n in G.nodes()]
# Nodes were added without attributes, so each node (a person or company name) is its own label
labels = {n: n for n in G.nodes()}
# Fixed seed so the layout is reproducible across runs
positions = networkx.spring_layout(G, seed=42)

networkx.draw(G, positions, node_color=colors, edge_color='#AEAEAE')

# Draw the labels manually to make them appear above the nodes
for k, v in positions.items():
    plt.text(v[0], v[1]+0.025, s=labels[k],
             horizontalalignment='center', size=8)

plt.show()