In [205]:
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html

# Start from one known sector page (Automotive); the following cells read the
# links to all other sector pages from it.
URL = 'https://www.moneycontrol.com/india/stockmarket/sector-classification/marketstatistics/nse/automotive.html'
# Bound the wait so a stalled connection cannot hang the kernel, and stop here
# on an HTTP error instead of parsing an error page in the next cell.
source = requests.get(URL, timeout=30)
source.raise_for_status()

# Sector table; the following cells fill it with one row per sector (Name, URL).
sector = pd.DataFrame()

In [206]:
# Seed the sector table with the first sector: its name is the text of the
# anchor in the second <li> of the sector list, and its link is the page that
# was just fetched.
tree = html.fromstring(source.content)
first_sector_anchors = tree.xpath('//*[@id="mc_mainWrapper"]/div[2]/div[1]/div[5]/div[2]/div[2]/ul/li[2]/a')
first_sector_name = first_sector_anchors[0].text_content()
sector['Name'] = [first_sector_name]
sector['URL'] = [URL]

In [207]:
# Append the remaining sectors: every <li> after the first two contributes one
# row, built from its first anchor (text -> Name, site root + href -> URL).
sector_items = tree.xpath('//*[@id="mc_mainWrapper"]/div[2]/div[1]/div[5]/div[2]/div[2]/ul/li')
for item in sector_items[2:]:
    anchor = item.xpath('.//a')[0]
    sector.loc[len(sector.index)] = [anchor.text_content(), 'https://www.moneycontrol.com' + anchor.get('href')]
# The row labelled 14 was identified by hand as faulty, so it is removed.
# NOTE(review): this label is hard-coded; re-check it if the site's list changes.
sector = sector.drop(14)

In [208]:
# Renumber the rows 0..n-1 after the manual drop above left a gap in the labels.
# drop=True discards the old labels instead of keeping them as an extra column.
sector = sector.reset_index(drop=True)
# Display the finished sector table (Name, URL).
sector

Unnamed: 0,Name,URL
0,Automotive,https://www.moneycontrol.com/india/stockmarket...
1,Banking/Finance,https://www.moneycontrol.com/india/stockmarket...
2,Cement/Construction,https://www.moneycontrol.com/india/stockmarket...
3,Chemicals,https://www.moneycontrol.com/india/stockmarket...
4,Conglomerates,https://www.moneycontrol.com/india/stockmarket...
5,Cons Durable,https://www.moneycontrol.com/india/stockmarket...
6,Cons Non-Durable,https://www.moneycontrol.com/india/stockmarket...
7,Engineering,https://www.moneycontrol.com/india/stockmarket...
8,Food & Beverage,https://www.moneycontrol.com/india/stockmarket...
9,Gold ETF,https://www.moneycontrol.com/india/stockmarket...


In [212]:
# Scrape every sector page into its own DataFrame and store the frames in a
# dict keyed by sector name (the variable is called sector_list, but it is a dict).
sector_list = {}
stock_columns = ['Stocks','Industry','LastPrice','Change','PctChange','MktCap','URL']
for sector_name, sector_url in zip(sector['Name'], sector['URL']):
    # Bound the wait so one stalled page cannot hang the whole scrape.
    sector_page = requests.get(sector_url, timeout=30)
    sector_tree = html.fromstring(sector_page.content)

    table_rows = sector_tree.xpath('//*[@id="mc_mainWrapper"]/div[2]/div[1]/div[5]/div[3]/div/table[@class="tbldata14 bdrtpg"]/tr')
    # Collect plain row lists and build the frame once at the end; appending to
    # a DataFrame row by row re-allocates it on every insertion.
    records = []
    for table_row in table_rows[1:]:  # the first <tr> is skipped (presumably the header row)
        cells = table_row.xpath('.//td')
        record = [cell.text_content() for cell in cells]
        # The last cell (MktCap) uses ',' as thousands separator; store it as a float.
        record[-1] = float(record[-1].replace(',', ''))
        # The first cell links to the stock's own page; the href is prefixed with the site root.
        record.append('https://www.moneycontrol.com' + cells[0].xpath('.//a')[0].get('href'))
        records.append(record)
    sector_list[sector_name] = pd.DataFrame(records, columns=stock_columns)

In [213]:
# Show the names of all scraped sectors (the keys of the sector_list dict)
sector_list.keys()

dict_keys(['Automotive', 'Banking/Finance', 'Cement/Construction', 'Chemicals', 'Conglomerates', 'Cons Durable', 'Cons Non-Durable', 'Engineering', 'Food & Beverage', 'Gold ETF', 'Technology', 'Manufacturing', 'Media', 'Metals & Mining', 'Miscellaneous', 'Oil & Gas', 'Pharmaceuticals', 'Retail/Real Estate', 'Services', 'Telecom', 'Tobacco', 'Utilities'])

In [214]:
# Inspect the table scraped for one sector (Automotive) as a sanity check
sector_list['Automotive']

Unnamed: 0,Stocks,Industry,LastPrice,Change,PctChange,MktCap,URL
0,Autoline Ind,Auto Ancillaries,79.65,7.20,9.94,310.0,https://www.moneycontrol.com/india/stockpriceq...
1,Exide Ind,Auto Ancillaries,172.40,13.00,8.16,14654.0,https://www.moneycontrol.com/india/stockpriceq...
2,Mah Scooters,Auto - 2 & 3 Wheelers,4484.40,327.60,7.88,5125.0,https://www.moneycontrol.com/india/stockpriceq...
3,Subros,Auto Ancillaries,357.95,24.20,7.25,2335.0,https://www.moneycontrol.com/india/stockpriceq...
4,Ndr Auto Compon,,469.25,28.35,6.43,279.0,https://www.moneycontrol.com/india/stockpriceq...
...,...,...,...,...,...,...,...
78,Bharat Gears,Auto Ancillaries,195.95,-4.95,-2.46,201.0,https://www.moneycontrol.com/india/stockpriceq...
79,Escorts Kubota,,2019.60,-55.50,-2.67,26647.0,https://www.moneycontrol.com/india/stockpriceq...
80,Shivam Auto,Auto Ancillaries,42.80,-2.00,-4.46,523.0,https://www.moneycontrol.com/india/stockpriceq...
81,Auto Stampings,Auto Ancillaries,471.45,-28.95,-5.79,748.0,https://www.moneycontrol.com/india/stockpriceq...


In [215]:
# Add the ticker of every company by visiting its moneycontrol page.
# Rows whose ticker cannot be obtained are removed from that sector's frame.
for sector_name, stocks in list(sector_list.items()):
    tickers = []
    rows_without_ticker = []
    # Iterate over (label, url) pairs so the lookup never relies on the index
    # being exactly 0..n-1.
    for row_label, stock_url in stocks['URL'].items():
        try:
            # Bound the wait so one stalled page cannot hang the whole scrape.
            stock_page = requests.get(stock_url, timeout=30)
            stock_tree = html.fromstring(stock_page.content)
            symbol = stock_tree.xpath('//*[@id="company_info"]/ul/li[5]/ul/li[2]/p/text()')[0]
        except Exception:
            # Best effort: any failure to obtain the symbol means the row is dropped.
            rows_without_ticker.append(row_label)
        else:
            tickers.append(symbol + '.NS')

    # Remove all failed rows in one pass, then attach the tickers; both now
    # have the same length and order.
    stocks = stocks.drop(index=rows_without_ticker)
    stocks['Ticker'] = tickers
    sector_list[sector_name] = stocks
    

In [216]:
# Renumber the rows of every sector frame 0..n-1, closing the gaps left where
# rows were deleted. A comprehension is used so that no loop variable leaks
# into the notebook namespace (a loop variable named `sector` would overwrite
# the sector DataFrame built earlier).
sector_list = {
    sector_name: stocks.reset_index(drop=True)
    for sector_name, stocks in sector_list.items()
}

In [217]:
# Save each sector's table as ./sector_data/<name>.csv
output_dir = Path('./sector_data')
# to_csv does not create missing folders, so make sure the target exists first.
output_dir.mkdir(parents=True, exist_ok=True)
for sector_name, stocks in sector_list.items():
    # Names such as 'Banking/Finance' contain '/', which would be read as a path
    # separator; only the part before the first '/' is used as the file name.
    file_stem = sector_name.split('/')[0]
    stocks.to_csv(output_dir / f'{file_stem}.csv')  # To read, do pd.read_csv('file_path')