In [1]:
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
# Root address of the Vikaspedia site; hrefs found while scraping are combined
# with it to form the absolute URL of the next page to visit.
BASE_URL = "https://vikaspedia.in"

In [3]:
def clean_name(folder_name):
    """Return ``folder_name`` with characters unsafe in file names replaced by '-'.

    The replaced characters are: backslash, forward slash, colon, asterisk,
    question mark, double quote, less-than, greater-than and pipe. This is
    needed for category titles such as
    "Guidelines for Release/Notification, Provisional Notification and
    De-notification of Cultivars", where the '/' would otherwise be taken as a
    path separator.

    Args:
        folder_name (str): Raw category or page title.

    Returns:
        str: The title with every listed character replaced by a single '-'.
        All other characters (including whitespace) are left untouched.
    """
    # `\\` matches a literal backslash. The earlier pattern started with `\/`,
    # which is only an escaped forward slash, so backslashes slipped through.
    return re.sub(r'[\\/:*?"<>|]', '-', folder_name)

In [4]:
def fetchData(url):
    """Download a page and return the text of its main content column.

    The page is parsed with BeautifulSoup's ``html.parser`` and the content is
    taken from ``<div id="MiddleColumn_internal">``. Every ``<a>`` element
    inside that div is removed, together with its text, before extraction.

    Args:
        url (str): Address of the page to download.

    Returns:
        str: For each direct child of the content div: the text of ``h3``,
        ``h4`` and ``p`` elements followed by a newline; any other child is
        serialised with ``str()``. Anything matching ``<...>`` is then
        stripped from the combined string.

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            answer within the timeout.
        ValueError: If the page contains no ``MiddleColumn_internal`` div.
    """
    # NOTE(security): verify=False disables TLS certificate validation. It is
    # kept because the existing scraper relies on it; enable verification if
    # the site's certificate chain can be trusted.
    # The timeout stops a stalled connection from blocking the run forever.
    dat = requests.get(url, verify=False, timeout=30)
    soup = BeautifulSoup(dat.text, "html.parser")
    MiddleColumn = soup.find('div', id='MiddleColumn_internal')
    if MiddleColumn is None:
        # Fail with a message that names the page instead of an opaque
        # "'NoneType' object has no attribute 'find_all'".
        raise ValueError(f"No 'MiddleColumn_internal' div found at {url}")

    # Remove all <a> tags from MiddleColumn
    for a_tag in MiddleColumn.find_all('a'):
        a_tag.decompose()

    # Collect the pieces in a list and join once instead of repeated `+=`.
    pieces = []
    for element in MiddleColumn.contents:
        if element.name in ('h3', 'h4', 'p'):
            pieces.append(str(element.text) + '\n')
        else:
            pieces.append(str(element))

    # Strip leftover markup. '.' does not match a line break here, so a tag
    # that spans several lines is left in place.
    return re.sub(r'<.*?>', '', ''.join(pieces))

In [5]:
def webScrapeSubCategory(url, folder_path='.'):
    """Recursively scrape a category page and everything linked beneath it.

    A page that contains ``<a class="folderfile_name">`` links is treated as a
    listing: one sub-directory per link is created under ``folder_path`` and
    each linked page is scraped recursively. A page without such links is
    treated as a content page: its title and the text returned by
    ``fetchData`` are written to ``<folder_path>/<cleaned title>.txt``.

    Args:
        url (str): Page to scrape.
        folder_path (str): Directory that receives the output of this page.

    Returns:
        list[dict] | str: For a listing page, a list of
        ``{category_name: result_of_sub_scrape}`` dicts; for a content page,
        the text that was written to disk; ``""`` if any exception occurred
        while processing this page.
    """
    try:
        # Fixed 10-second pause before every request.
        time.sleep(10)
        # NOTE(security): verify=False skips TLS certificate validation.
        # The timeout keeps one stalled connection from hanging the crawl.
        data = requests.get(url, verify=False, timeout=30)
        soup = BeautifulSoup(data.text, "html.parser")
        a_tags = soup.find_all('a', class_='folderfile_name')

        if not a_tags:
            # Content page: store title + body text as a .txt file.
            title = soup.find('h3', class_='card-title title')
            if title is None:
                raise ValueError("page has neither category links nor a title heading")
            scrapped_data = title.text + "\n\n" + fetchData(url)
            with open(os.path.join(folder_path, clean_name(title.text)+'.txt'), 'w', encoding='utf-8') as file:
                file.write(scrapped_data)
            return scrapped_data

        result = []
        for tag in a_tags:
            category_name = tag.text.strip()
            href = tag.get('href')
            if not href:
                # Without a target there is nothing to follow; urljoin would
                # also fall back to BASE_URL itself for an empty href.
                print(f"Skipping category without a link: {category_name} {url}")
                continue
            category_folder_path = os.path.join(folder_path, clean_name(category_name))
            os.makedirs(category_folder_path, exist_ok=True)
            print(f"Scraping data for category: {category_name} {url}")
            # urljoin handles root-relative, relative and absolute hrefs;
            # plain string concatenation only worked for root-relative ones.
            category_url = urljoin(BASE_URL, href)
            subcategories = webScrapeSubCategory(category_url, category_folder_path)
            result.append({category_name: subcategories})
        return result
    except Exception as e:
        # Best-effort crawl: report and carry on. The exception type and URL
        # are included because str(e) alone can be empty.
        print("EXCEPTION OCCURRED", f"{type(e).__name__}: {e}", f"(while scraping {url})")
        return ""


In [8]:
def webScrape(url, start,end,folder_path='.'):
    """Scrape a slice of the categories listed on ``url``.

    Only the ``<a class="folderfile_name">`` links at positions ``start``
    through ``end`` (both inclusive, 0-based) are followed; each one is handed
    to ``webScrapeSubCategory`` after a sub-directory for it has been created
    under ``folder_path``. If the page has no such links it is treated as a
    content page and its title and text are written to
    ``<folder_path>/<cleaned title>.txt``.

    Args:
        url (str): Listing page to start from.
        start (int): Index of the first category to scrape.
        end (int): Index of the last category to scrape (inclusive).
        folder_path (str): Directory that receives the scraped output.

    Returns:
        list[dict] | str: For a listing page, a list of
        ``{category_name: result_of_sub_scrape}`` dicts; for a content page,
        the text that was written to disk; ``""`` if any exception occurred
        while processing this page (same as ``webScrapeSubCategory``).
    """
    try:
        # Fixed 10-second pause before the request.
        time.sleep(10)
        # NOTE(security): verify=False skips TLS certificate validation.
        # The timeout keeps a stalled connection from hanging the run.
        data = requests.get(url, verify=False, timeout=30)
        soup = BeautifulSoup(data.text, "html.parser")
        a_tags = soup.find_all('a', class_='folderfile_name')

        if not a_tags:
            # Content page: store title + body text as a .txt file.
            title = soup.find('h3', class_='card-title title')
            if title is None:
                raise ValueError("page has neither category links nor a title heading")
            scrapped_data = title.text + "\n\n" + fetchData(url)
            with open(os.path.join(folder_path, clean_name(title.text)+'.txt'), 'w', encoding='utf-8') as file:
                file.write(scrapped_data)
            return scrapped_data

        result = []
        # end + 1 because `end` is inclusive while slice bounds are exclusive.
        for tag in a_tags[start:end+1]:
            category_name = tag.text.strip()
            href = tag.get('href')
            if not href:
                # Without a target there is nothing to follow; urljoin would
                # also fall back to BASE_URL itself for an empty href.
                print(f"Skipping category without a link: {category_name} {url}")
                continue
            category_folder_path = os.path.join(folder_path, clean_name(category_name))
            os.makedirs(category_folder_path, exist_ok=True)
            print(f"Scraping data for category: {category_name} {url}")
            # urljoin handles root-relative, relative and absolute hrefs;
            # plain string concatenation only worked for root-relative ones.
            category_url = urljoin(BASE_URL, href)
            subcategories = webScrapeSubCategory(category_url, category_folder_path)
            result.append({category_name: subcategories})
        return result
    except Exception as e:
        # Best-effort crawl: report and carry on. The exception type and URL
        # are included because str(e) alone can be empty.
        print("EXCEPTION OCCURRED", f"{type(e).__name__}: {e}", f"(while scraping {url})")
        # Explicit "" keeps the failure value consistent with
        # webScrapeSubCategory instead of an implicit None.
        return ""


In [13]:
# Print every top-level agriculture category next to its position, so the
# start/end indices for webScrape can be picked from the output.
data = requests.get(f"{BASE_URL}/agriculture", verify=False)  # SSL certificate verification is skipped
soup = BeautifulSoup(data.text, "html.parser")  # parse the listing page
a_tags = soup.find_all('a', class_='folderfile_name')  # one link per category
result = []
if not a_tags:
    print("No a tags found")
else:
    for i, tag in enumerate(a_tags):
        result += [[tag.text, i]]  # [category title, index]
    print(result)



[['Agri Credit', 0], ['Agri Directory', 1], ['Agri Exports', 2], ['Agri Inputs', 3], ['Agri Insurance', 4], ['Agro enterprises', 5], ['Best Practices', 6], ['Crop Production', 7], ['Fisheries', 8], ['Forestry', 9], ['ICT applications in Agriculture', 10], ['Livestock', 11], ['Market information', 12], ['National Schemes for Farmers', 13], ['Policies and Schemes', 14], ['Post Harvest Technologies', 15], ['Poultry', 16], ['State-specific schemes for farmers', 17], ['Women and agriculture', 18]]


In [None]:
# start and end are both inclusive, 0-based positions in the category listing
allSectors = webScrape(
    BASE_URL + "/agriculture",
    start=16,
    end=18,
    folder_path="VIKASPEDIA_DATA",
)



Scraping data for category: Poultry https://vikaspedia.in/agriculture




EXCEPTION OCCURRED
Scraping data for category: State-specific schemes for farmers https://vikaspedia.in/agriculture




Scraping data for category: Andaman and Nicobar https://vikaspedia.in/agriculture/state-specific-schemes-for-farmers




Scraping data for category: Agricultural Credit https://vikaspedia.in/agriculture/state-specific-schemes-for-farmers/andaman-and-nicobar




Scraping data for category: Agricultural Marketing https://vikaspedia.in/agriculture/state-specific-schemes-for-farmers/andaman-and-nicobar


