In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Specifying the url of the website and getting the access
url = "https://en.wikipedia.org/wiki/NIFTY_50"

# Browser-like headers sent with the GET request.
# "Accept-Encoding" is deliberately not set here: requests then advertises only
# the encodings it is able to decode, whereas hard-coding "br" can produce an
# unreadable body when no Brotli decoder is installed.
accessors = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}

# Without a timeout requests waits indefinitely on a stalled connection,
# which would hang the notebook kernel.
response = requests.get(url, headers=accessors, timeout=30)

print(response.status_code)

# Raise on 4xx/5xx so the following cells never parse an error page.
response.raise_for_status()

200


In [3]:

# Parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [4]:
# First <title> element of the parsed page
title = soup.find('title')

In [5]:
title.text

'NIFTY 50 - Wikipedia'

In [6]:
# Locate the first table whose class attribute is exactly "wikitable sortable"
# (a multi-word class_ string is compared against the whole attribute value).
table = soup.find('table',class_="wikitable sortable")

# find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError in a later cell.
if table is None:
    raise ValueError('No <table class="wikitable sortable"> found in the page')

In [7]:
# The first <tr> of the table is the header row
header_row = table.find(name='tr')

In [8]:
header_row

<tr>
<th>Company name
</th>
<th>Symbol
</th>
<th>Sector<sup class="reference" id="cite_ref-15"><a href="#cite_note-15"><span class="cite-bracket">[</span>15<span class="cite-bracket">]</span></a></sup>
</th>
<th>Date added<sup class="reference" id="cite_ref-inclexcl_16-0"><a href="#cite_note-inclexcl-16"><span class="cite-bracket">[</span>16<span class="cite-bracket">]</span></a></sup>
</th></tr>

In [9]:
# Empty list that will receive the column header texts
table_headers = list()

In [10]:
# Grab the header row: the first <tr> found inside the table
header_row = table.tr

In [11]:
# Storing the headers into a list
# The list is rebuilt from scratch in this cell, so running the cell again
# does not append a second copy of every header.
table_headers = [th.text.strip() for th in header_row.find_all('th')]

print(table_headers)

['Company name', 'Symbol', 'Sector[15]', 'Date added[16]']


In [12]:
# Keep only the text before the first "[" so citation markers like "[15]" are dropped
table_headers = [header.partition('[')[0].strip() for header in table_headers]

In [13]:
table_headers

['Company name', 'Symbol', 'Sector', 'Date added']

In [14]:
# Empty list that will receive one list of cell texts per table row
data_rows = list()

In [15]:
# Extracting the table data
# Search from the table itself instead of table.tbody: table.tbody is None when
# the parsed markup contains no explicit <tbody>, and the header row was also
# taken with table.find('tr'), so skipping the table's first <tr> drops exactly
# that header row.
rows = table.find_all('tr')[1:]

In [16]:
# Storing the table rows into a list
# Reset the list inside this cell so that running the cell again does not
# append every row a second time.
data_rows = []

for row in rows:

    # Both <td> and <th> cells count, since a row may use <th> for its first cell.
    row_text = [cell.text.strip() for cell in row.find_all(['td', 'th'])]

    # Keep only rows that have exactly one value per header column.
    if len(row_text) == len(table_headers):
        data_rows.append(row_text)

In [17]:
data_rows

[['Adani Enterprises', 'ADANIENT', 'Metals & Mining', '30 September 2022'],
 ['Adani Ports & SEZ', 'ADANIPORTS', 'Services', '28 September 2015'],
 ['Apollo Hospitals', 'APOLLOHOSP', 'Healthcare', '31 March 2022'],
 ['Asian Paints', 'ASIANPAINT', 'Consumer Durables', '27 April 2012[a]'],
 ['Axis Bank', 'AXISBANK', 'Financial Services', '27 March 2009'],
 ['Bajaj Auto',
  'BAJAJ-AUTO',
  'Automobile and Auto Components',
  '1 October 2010[b]'],
 ['Bajaj Finance', 'BAJFINANCE', 'Financial Services', '29 September 2017'],
 ['Bajaj Finserv', 'BAJAJFINSV', 'Financial Services', '2 April 2018'],
 ['Bharat Electronics', 'BEL', 'Capital Goods', '30 September 2024'],
 ['Bharti Airtel', 'BHARTIARTL', 'Telecommunication', '1 March 2004'],
 ['Cipla', 'CIPLA', 'Healthcare', '7 October 1998'],
 ['Coal India', 'COALINDIA', 'Oil, Gas & Consumable Fuels', '10 October 2011'],
 ["Dr. Reddy's Laboratories", 'DRREDDY', 'Healthcare', '1 October 2010[c]'],
 ['Eicher Motors',
  'EICHERMOT',
  'Automobile and 

In [18]:
import pandas as pd

In [19]:
# Build the DataFrame: one row per scraped table row, columns named after the headers
df = pd.DataFrame(data=data_rows, columns=table_headers)

In [20]:
df

Unnamed: 0,Company name,Symbol,Sector,Date added
0,Adani Enterprises,ADANIENT,Metals & Mining,30 September 2022
1,Adani Ports & SEZ,ADANIPORTS,Services,28 September 2015
2,Apollo Hospitals,APOLLOHOSP,Healthcare,31 March 2022
3,Asian Paints,ASIANPAINT,Consumer Durables,27 April 2012[a]
4,Axis Bank,AXISBANK,Financial Services,27 March 2009
5,Bajaj Auto,BAJAJ-AUTO,Automobile and Auto Components,1 October 2010[b]
6,Bajaj Finance,BAJFINANCE,Financial Services,29 September 2017
7,Bajaj Finserv,BAJAJFINSV,Financial Services,2 April 2018
8,Bharat Electronics,BEL,Capital Goods,30 September 2024
9,Bharti Airtel,BHARTIARTL,Telecommunication,1 March 2004


In [21]:
# Replace the space in the two multi-word column names with an underscore

df = df.rename({"Company name": "Company_name", "Date added": "Date_added"}, axis="columns")

In [22]:
# Strip bracketed footnote markers such as "[a]" from the Date_added values
df['Date_added'] = df['Date_added'].str.replace(pat=r'\[.*?\]', repl='', regex=True)

In [23]:
df

Unnamed: 0,Company_name,Symbol,Sector,Date_added
0,Adani Enterprises,ADANIENT,Metals & Mining,30 September 2022
1,Adani Ports & SEZ,ADANIPORTS,Services,28 September 2015
2,Apollo Hospitals,APOLLOHOSP,Healthcare,31 March 2022
3,Asian Paints,ASIANPAINT,Consumer Durables,27 April 2012
4,Axis Bank,AXISBANK,Financial Services,27 March 2009
5,Bajaj Auto,BAJAJ-AUTO,Automobile and Auto Components,1 October 2010
6,Bajaj Finance,BAJFINANCE,Financial Services,29 September 2017
7,Bajaj Finserv,BAJAJFINSV,Financial Services,2 April 2018
8,Bharat Electronics,BEL,Capital Goods,30 September 2024
9,Bharti Airtel,BHARTIARTL,Telecommunication,1 March 2004


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company_name  50 non-null     object
 1   Symbol        50 non-null     object
 2   Sector        50 non-null     object
 3   Date_added    50 non-null     object
dtypes: object(4)
memory usage: 1.7+ KB


In [25]:
df.dtypes

Unnamed: 0,0
Company_name,object
Symbol,object
Sector,object
Date_added,object


In [26]:
# Updating the datatype of 'Date_added' column from 'Object' to 'Datetime'
# The scraped values look like "30 September 2022" (day, full month name, year).
# Naming the format explicitly means a value in any other layout raises an
# error instead of being parsed by guesswork.

df['Date_added'] = pd.to_datetime(df['Date_added'], format='%d %B %Y')

In [27]:
df.dtypes

Unnamed: 0,0
Company_name,object
Symbol,object
Sector,object
Date_added,datetime64[ns]


In [28]:
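# Saving this DataFrame as a CSV file (left disabled; uncomment the line below to write the file).
# index=False keeps the row index from being written as an extra unnamed column.
# df.to_csv('Nifty50_WebScraping_Project.csv', index=False)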