In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
response = requests.get("https://www.4icu.org/in/gujarat/")

Extracting the list of college's page link from the home page

In [3]:
if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the table containing information about Gujarat universities
        table = soup.find('table')
        
        # Check if the table is found
        if table:
            # Initialize lists to store extracted information
            website_urls = []

            # Extract information from each row in the table
            for row in table.find_all('tr')[1:]:  # Skip the header row
                columns = row.find_all('td')

                # Check if there are enough columns in the row
                if len(columns) >= 3:
                    # Extract data from columns
                    website_url = columns[1].find('a')['href'].strip() if columns[1].find('a') else ""

                    # Append data to respective lists
                    website_urls.append(website_url)

In [4]:
import pandas as pd

# Assuming universities_info is your list of dictionaries
websites = pd.DataFrame(website_urls)

In [5]:
websites.columns=["paths"]

In [6]:
websites

Unnamed: 0,paths
0,/reviews/2064.htm
1,/reviews/2020.htm
2,/reviews/13227.htm
3,/reviews/13237.htm
4,/reviews/13122.htm
...,...
63,/reviews/14338.htm
64,/reviews/15975.htm
65,/reviews/18737.htm
66,/reviews/17807.htm


Above paths are accessed through hyperlink of the website so only that path get extracted
This url cannot be visited as there is no Protocol,subdomain and domain in the URL


Let update the URL to a proper format by appending domain of the website

In [7]:
#URL of home page to be added as prefix to the path
home = "https://www.4icu.org"

In [8]:
websites['paths'] = home + websites['paths'].astype(str)

In [9]:
websites

Unnamed: 0,paths
0,https://www.4icu.org/reviews/2064.htm
1,https://www.4icu.org/reviews/2020.htm
2,https://www.4icu.org/reviews/13227.htm
3,https://www.4icu.org/reviews/13237.htm
4,https://www.4icu.org/reviews/13122.htm
...,...
63,https://www.4icu.org/reviews/14338.htm
64,https://www.4icu.org/reviews/15975.htm
65,https://www.4icu.org/reviews/18737.htm
66,https://www.4icu.org/reviews/17807.htm


Now we got the links of all the universities present in the list

Create the dataframe which will store the extracted information of all the universities

In [69]:
columns = ['university_name', 'university_website','location','contact_number']
university_info = pd.DataFrame(columns=columns)

In [70]:
i=0

In [71]:
for college_url in websites['paths']:
    response = requests.get(college_url)
    
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all tables with the specified class
        tables = soup.find_all('table', class_="table borderless")

        if tables:
            # Access the first table
            table = tables[0]

            # Find all rows in the table
            rows = table.find_all('tr')

            # Check if there are rows in the table
            if rows:
                # Access the first row
                first_row = rows[0]

                # Find all columns in the first row
                columns = first_row.find_all('td')

                # Check if there are columns in the first row and if the first column exists
                if columns and len(columns) > 0:
                    # Access the text of the second column
                    university_info.at[i,"university_name"] = columns[0].text.strip()
                    university_info.at[i,"university_website"] = columns[0].find('a')['href'].strip()

                else:
                    print("Column not found")

            else:
                print("Row not found")

        else:
            print("Table not found")

        if tables:
            # Access the second table
            table = tables[1]

            rows = table.find_all('tr')

            # Check if there are rows in the table
            if rows:
                # Access the first row
                first_row = rows[0]

                # Find all columns in the first row
                columns = first_row.find_all('td')

                # Check if there are columns in the first row and if the first column exists
                if columns and len(columns) > 0:
                    # Access the text of the second column
                    university_info.at[i,"location"] = columns[0].text.strip()

                else:
                    print("Column not found")

                third_row = rows[2]

                # Find all columns in the third row
                columns = third_row.find_all('td')

                # Check if there are columns in the third row and if the first column exists
                if columns and len(columns) > 0:
                    # Access the text of the first column
                    university_info.at[i,"contact_number"] = columns[0].text.strip()

                else:
                    print("Column not found")

            else:
                print("Row not found")

        else:
            print("Table not found")
        
    i+=1

In [72]:
university_info

Unnamed: 0,university_name,university_website,location,contact_number
0,The Maharaja Sayajirao University of Baroda,https://msubaroda.ac.in,"Opposite Drug Laboratory, Fateh Gunj\nVadodara...",+91 (265) 279 5521
1,Gujarat University,https://www.gujaratuniversity.ac.in,PO Box 4010 Navrangpura\nAhmedabad \n380 009 G...,+91 (79) 630 1919
2,Indian Institute of Technology Gandhinagar,https://iitgn.ac.in,Visat-Gandhinagar Highway\nChandkheda \n382 42...,+91 (79) 2397 2583
3,National Institute of Design,https://www.nid.edu,Paldi\nAhmedabad \n380 007 Gujarat\r\nIndia,+91 (79) 2662 3692
4,Gujarat Technological University,https://www.gtu.ac.in,"JACPC Building, L.D.College of Engineering Cam...",+91 (79) 2630 0499
...,...,...,...,...
63,Kadi Sarva Vishwavidyalaya,http://ksv.ac.in,"Sector 15, Near KH - 5\nGandhinagar \n382 015 ...",+91 (79) 2324 4690
64,C.U. Shah University,http://www.cushahuniversity.ac.in,"Kothariya Village, Surendranagar - Ahmedabad H...",+91 (2752) 247 711
65,Atmiya University,https://atmiyauni.ac.in,Kalawad Road\nRajkot \n360 005 Gujarat\r\nIndia,+91 (281) 256 3445
66,Plastindia International University,https://www.plastindia.edu.in,"Dungra Colony Road, Chanod Colony, Dungra\nVap...",+91


In [82]:
university_info['location'] = university_info['location'].str.replace('\n', ' ')

In [83]:
university_info

Unnamed: 0,university_name,university_website,location,contact_number
0,The Maharaja Sayajirao University of Baroda,https://msubaroda.ac.in,"Opposite Drug Laboratory, Fateh Gunj Vadodara ...",+91 (265) 279 5521
1,Gujarat University,https://www.gujaratuniversity.ac.in,PO Box 4010 Navrangpura Ahmedabad 380 009 Guj...,+91 (79) 630 1919
2,Indian Institute of Technology Gandhinagar,https://iitgn.ac.in,Visat-Gandhinagar Highway Chandkheda 382 424 ...,+91 (79) 2397 2583
3,National Institute of Design,https://www.nid.edu,Paldi Ahmedabad 380 007 Gujarat\r India,+91 (79) 2662 3692
4,Gujarat Technological University,https://www.gtu.ac.in,"JACPC Building, L.D.College of Engineering Cam...",+91 (79) 2630 0499
...,...,...,...,...
63,Kadi Sarva Vishwavidyalaya,http://ksv.ac.in,"Sector 15, Near KH - 5 Gandhinagar 382 015 Gu...",+91 (79) 2324 4690
64,C.U. Shah University,http://www.cushahuniversity.ac.in,"Kothariya Village, Surendranagar - Ahmedabad H...",+91 (2752) 247 711
65,Atmiya University,https://atmiyauni.ac.in,Kalawad Road Rajkot 360 005 Gujarat\r India,+91 (281) 256 3445
66,Plastindia International University,https://www.plastindia.edu.in,"Dungra Colony Road, Chanod Colony, Dungra Vapi...",+91


In [84]:
university_info.to_excel('University_details.xlsx',index=False)