In [1]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


### Screener

In [62]:
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the paginated table behind every link listed in IndexFundLink.xlsx
# and save the combined rows to IndexFundHoldingData.xlsx.

page_limit = 25        # value sent as the `limit` query parameter
request_delay = 3      # seconds to pause before each request
request_timeout = 30   # seconds before a stalled request is abandoned


def _parse_table_rows(html, index_symbol):
    """Extract the data rows of the table contained in ``html``.

    Args:
        html: Raw page content as returned by ``response.content``.
        index_symbol: Value appended as the last element of every row.

    Returns:
        A list of rows, each a list of stripped ``<td>`` texts followed by
        ``index_symbol``. The first ``<tr>`` is always dropped (it is
        treated as the header), and any later ``<tr>`` without ``<td>``
        cells is skipped. Returns ``None`` when the expected container
        ``<div>`` or ``<table>`` is not present in the page.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Find the section containing the table
    holder = soup.find("div", class_="responsive-holder fill-card-width")
    if holder is None:
        return None

    # Find the table within the section
    table = holder.find("table", class_='data-table text-nowrap striped mark-visited')
    if table is None:
        return None

    parsed_rows = []
    # Drop the first row as it contains headers
    for table_row in table.find_all("tr")[1:]:
        cells = table_row.find_all("td")
        if not cells:
            # A row without <td> cells would otherwise become a one-element
            # row that pandas pads into a bogus record.
            continue
        row_data = [cell.get_text(strip=True) for cell in cells]
        # Append the value of indexSymbol to the row_data
        row_data.append(index_symbol)
        parsed_rows.append(row_data)
    return parsed_rows


# Read the test data into a DataFrame
test_df = pd.read_excel('IndexFundLink.xlsx')

# Initialize an empty list to store table data
all_table_data = []

# Iterate over each row of test data
for _, index_row in test_df.iterrows():
    # Extract data from the current row
    index_symbol = index_row['indexSymbol']
    screener_link = index_row['screenerLink']

    # The link does not change between pages, so check it once per row
    # instead of once per page.
    if pd.isna(screener_link) or not screener_link:
        continue

    pages = index_row['pages']
    if pd.isna(pages):
        # int(NaN) would raise and abort the whole run.
        print(f"Skipping {index_symbol}: no page count given")
        continue
    num_pages = int(pages)

    # Iterate over each page
    for page_num in range(1, num_pages + 1):
        # Construct the URL with page number
        url = f'{screener_link}?limit={page_limit}&page={page_num}'
        print(url)
        time.sleep(request_delay)

        # Send an HTTP GET request to the URL. A timeout keeps one stalled
        # connection from hanging the run; a failed request is reported and
        # skipped so rows already collected are not lost.
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as exc:
            print(f"Request failed for page {page_num} of {index_symbol}: {exc}")
            continue

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve the webpage for {index_symbol}")
            continue

        table_data = _parse_table_rows(response.content, index_symbol)
        if table_data is None:
            print(f"No table found on page {page_num} for {index_symbol}")
            continue

        # Append the table data of this page to the list
        all_table_data.extend(table_data)

        print(f"Scraped data from page {page_num} for {index_symbol}")

# Convert all table data into a DataFrame
df = pd.DataFrame(all_table_data, columns=["Sr No", "Company", "CMP", "PE", "Market Cap",
                                           "Div Yld", "NP Qtr", "Qtr Profit Var", "Sales Qtr",
                                           "Qtr Sales Var", "ROCE", "Index Fund"])
filtered_df = df[["Sr No", "Company", "Market Cap", "Index Fund"]]

filtered_df.to_excel('IndexFundHoldingData.xlsx', index=False, sheet_name='Sheet1')

print('Done scraping, Excel is ready!')

https://www.screener.in/company/NIFTY/?limit=25&page=1
Scraped data from page 1 for NIFTY 50
https://www.screener.in/company/NIFTY/?limit=25&page=2
Scraped data from page 2 for NIFTY 50
https://www.screener.in/company/NIFTYJR/?limit=25&page=1
Scraped data from page 1 for NIFTY NEXT 50
https://www.screener.in/company/NIFTYJR/?limit=25&page=2
Scraped data from page 2 for NIFTY NEXT 50
https://www.screener.in/company/CNX100/?limit=25&page=1
Scraped data from page 1 for NIFTY 100
https://www.screener.in/company/CNX100/?limit=25&page=2
Scraped data from page 2 for NIFTY 100
https://www.screener.in/company/CNX100/?limit=25&page=3
Scraped data from page 3 for NIFTY 100
https://www.screener.in/company/CNX100/?limit=25&page=4
Scraped data from page 4 for NIFTY 100
https://www.screener.in/company/CNX100/?limit=25&page=5
Scraped data from page 5 for NIFTY 100
https://www.screener.in/company/CNX200INDE/?limit=25&page=1
Scraped data from page 1 for NIFTY 200
https://www.screener.in/company/CNX200IN

Scraped data from page 15 for NIFTY MIDSML 400
https://www.screener.in/company/id/1275154/?limit=25&page=16
Scraped data from page 16 for NIFTY MIDSML 400
https://www.screener.in/company/id/1275135/?limit=25&page=1
Scraped data from page 1 for NIFTY LARGEMID250
https://www.screener.in/company/id/1275135/?limit=25&page=2
Scraped data from page 2 for NIFTY LARGEMID250
https://www.screener.in/company/id/1275135/?limit=25&page=3
Scraped data from page 3 for NIFTY LARGEMID250
https://www.screener.in/company/id/1275135/?limit=25&page=4
Scraped data from page 4 for NIFTY LARGEMID250
https://www.screener.in/company/id/1275135/?limit=25&page=5
Scraped data from page 5 for NIFTY LARGEMID250
https://www.screener.in/company/id/1275135/?limit=25&page=6
Scraped data from page 6 for NIFTY LARGEMID250
https://www.screener.in/company/id/1275135/?limit=25&page=7
Scraped data from page 7 for NIFTY LARGEMID250
https://www.screener.in/company/id/1275135/?limit=25&page=8
Scraped data from page 8 for NIFTY L

Scraped data from page 1 for NIFTY100 ESG
https://www.screener.in/company/NIFT100ESG/?limit=25&page=2
Scraped data from page 2 for NIFTY100 ESG
https://www.screener.in/company/NIFT100ESG/?limit=25&page=3
Scraped data from page 3 for NIFTY100 ESG
https://www.screener.in/company/NIFT100ESG/?limit=25&page=4
Scraped data from page 4 for NIFTY100 ESG
Done scraping, Excel is ready!


### Groww

In [16]:
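# NOTE: this Groww scraper is disabled (the whole cell is commented out); the
# output shown below it presumably comes from an earlier run when it was active.
# import requests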
# from bs4 import BeautifulSoup
# import pandas as pd

# # Define the URL to scrape
# url = 'https://groww.in/indices/nifty-next'


# # Send an HTTP GET request to the URL
# response = requests.get(url)

# # Check if the request was successful
# if response.status_code == 200:
#     # Parse the HTML content
#     soup = BeautifulSoup(response.content, "html.parser")
    
#     # Find the section containing the table
#     index_data = soup.find("section", class_="indexCompanies_sectionWrapper__FJ0EU")
    
#     # Find the table within the section
#     index_data_table = index_data.find("table", class_='tb10Table')
    
#     # Initialize lists to store table data
#     table_data = []
    
#     # Find all rows in the table
#     rows = index_data_table.find_all("tr")
    
#     # Iterate over each row
#     for row in rows:
#         # Find all cells in the row
#         cells = row.find_all("td")
#         # Extract text from each cell and append to the list
#         row_data = [cell.get_text(strip=True) for cell in cells]
#         table_data.append(row_data)
    
#     # Drop the first row as it contains headers
#     table_data = table_data[1:]
    
#     # Convert table data into DataFrame
#     df = pd.DataFrame(table_data, columns=["Company", "Market Cap", "Market Price", "Sector"])
#     df['Index Fund'] = "Nifty Next 50"
    
#     print(df)
# else:
#     print("Failed to retrieve the webpage.")


                                    Company    Market Cap  Market Price  \
0  Life Insurance Corporation of India Ltd.  ₹6,19,976 Cr  ₹0.00(0.00%)   
1                         Avenue Supermarts  ₹2,91,743 Cr  ₹0.00(0.00%)   
2                        Adani Green Energy  ₹2,86,171 Cr  ₹0.00(0.00%)   
3                     Hindustan Aeronautics  ₹2,68,710 Cr  ₹0.00(0.00%)   
4                    Indian Oil Corporation  ₹2,49,663 Cr  ₹0.00(0.00%)   
5                    JIO Financial Services  ₹2,41,679 Cr  ₹0.00(0.00%)   
6                               Adani Power  ₹2,28,215 Cr  ₹0.00(0.00%)   
7                                       DLF  ₹2,24,684 Cr  ₹0.00(0.00%)   

                Sector     Index Fund  
0            Insurance  Nifty Next 50  
1      Consumer Retail  Nifty Next 50  
2               Energy  Nifty Next 50  
3  Aerospace & Defense  Nifty Next 50  
4     Oil, Gas & Fuels  Nifty Next 50  
5   Financial Services  Nifty Next 50  
6               Energy  Nifty Next 50  
7   