<a href="https://colab.research.google.com/github/Jeru-John/ML-case-studies/blob/main/webscraping_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Website to be scraped:** [IEX India](https://www.iexindia.com/marketdata/market_snapshot.aspx)

**Importing needed modules:**

In [60]:
!pip install selenium

import csv #writes the scraped rows out as a CSV file
import json #allows you to work with JSON data
from time import sleep #sleep function- introduce delay btn making requests

import requests
from bs4 import BeautifulSoup #library to retrieve a webpage, parse its content, and return a BeautifulSoup object
from google.colab import drive #used to mount your Google Drive into the Colab environment
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



# **Scraping the title of the page:**

**`cook_soup_from_url()` function: accessing the title of the page**

In [61]:
def cook_soup_from_url(url, parser='lxml', sleep_time=0):
    """Fetch a webpage with requests and return it parsed as a BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    parser : str, optional
        Name of the BeautifulSoup parser to use (default 'lxml').
    sleep_time : int or float, optional
        Seconds to wait before sending the request (default 0).

    Returns
    -------
    BeautifulSoup
        Soup built from the raw response body.

    Raises
    ------
    Exception
        If the response status code is not 200.
    """
    # Optional delay before the request is sent.
    sleep(sleep_time)
    response = requests.get(url)

    # Anything other than a plain 200 is treated as a failure.
    if response.status_code != 200:
        raise Exception(f'Error: Status_code !=200.\n status_code={response.status_code}')

    # Honour the caller's parser choice (it used to be hard-coded to 'lxml',
    # which silently ignored the `parser` argument).
    return BeautifulSoup(response.content, parser)

In [62]:
# Page to scrape: the IEX India market snapshot.
url = 'https://www.iexindia.com/marketdata/market_snapshot.aspx'

# Download and parse the page, waiting one second before the request is sent.
soup = cook_soup_from_url(url=url, sleep_time=1)

In [63]:
# Take the first child node of the <title> tag inside <head> ...
title_list = soup.head.title.contents[0]
# ... and read its text, which is the page title.
title = title_list.text
print(title)


	Market Snapshot | Indian Energy Exchange Ltd



# **Linking Google Drive to Google Colab and setting up the WebDriver:**

Selenium's WebDriver provides a way to automate interactions with websites and web applications. In web scraping, this is particularly useful when a website builds its content dynamically.

In [64]:
# Mount Google Drive
drive.mount('/content/drive')

# Create ChromeDriver options
chrome_options = webdriver.ChromeOptions()

# Set the ChromeDriver binary location (this will be different in Colab)
chrome_options.binary_location = '/usr/bin/google-chrome'

# Use the --headless option to run Chrome in headless mode (no GUI)
chrome_options.add_argument('--headless')

# Add arguments to disable extensions and set window size
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--window-size=1920x1080')

# Create a webdriver instance with ChromeDriver options
driver = webdriver.Chrome(options=chrome_options)

# Open the webpage
url = 'https://www.iexindia.com/marketdata/market_snapshot.aspx'
driver.get(url)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
# Scroll down the page by 200 pixels
script = "window.scrollBy(0, 200);"
driver.execute_script(script)

# Wait for the hidden div element to be present
hidden_div = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#VisibleReportContentctl00_InnerContent_reportViewer_ctl09'))
)

# Execute JavaScript to capture the content of the hidden div
hidden_div_content = hidden_div.get_attribute('innerHTML')
# print(hidden_div_content)
# Now you can parse the hidden_div_content to extract the table's data using BeautifulSoup or other methods

# Clean up and close the driver
driver.quit()



# **Parsing the html using BeautifulSoup**

BeautifulSoup() - used to parse the html

In [66]:
# Parse the hidden_div_content using BeautifulSoup
soup = BeautifulSoup(hidden_div_content, 'html.parser')
# print(soup)

# **Accessing the data we need:**

In [67]:
#getting the taget tag
first_table = soup.find('table')
first_tr = first_table.tr.td.table.tbody
tr_in_tbody = first_tr.find_all('tr')
td_in_tbody = tr_in_tbody[7].table.table
target_div = td_in_tbody.find_all('tr')
target_table = target_div[1].table
target_trs = target_table.find_all('tr')
table_head_row = target_trs[1].find_all('td')

In [68]:
# Column headings: the text of the <div> inside each heading cell.
# Cells without a <div> are skipped.
col_heads = [cell.div.text for cell in table_head_row if cell.div]

print(col_heads)

['Date | Hour | Time Block', 'Purchase Bid (MW)', 'Sell Bid (MW)', 'MCV (MW)', 'Final Scheduled Volume (MW)', 'MCP (Rs/MWh) *', 'Weighted MCP (Rs/MWh)']


In [69]:
#setting the data array
data_array = [];
for data in target_trs[2:]:
  data_row = data.find_all('td')
  for row in data_row:
    if (row.div):
      data_array.append(row.div.text)

In [70]:
#getting the date
date = '';
for i in range(1, len(data_array)):
  date = data_array[0]

print(date)

31-08-2023


In [71]:
#setting a temp array with needed datas
temp_array = []
for value in data_array[1:]:
   temp_array.append(value)
print(temp_array)
print(len(temp_array))
temp_arr_len = len(temp_array)

['1', '00:00 - 00:15', '24120.30', '2541.50', '2541.50', '2541.50', '10000.00', '10000.00', '00:15 - 00:30', '22261.80', '2501.90', '2501.90', '2501.90', '10000.00', '10000.00', '00:30 - 00:45', '21599.80', '2499.20', '2499.20', '2499.20', '10000.00', '10000.00', '00:45 - 01:00', '21134.60', '2562.30', '2562.30', '2562.30', '10000.00', '10000.00', '2', '01:00 - 01:15', '20259.50', '2839.00', '2839.00', '2839.00', '10000.00', '10000.00', '01:15 - 01:30', '19601.60', '2945.00', '2945.00', '2945.00', '10000.00', '10000.00', '01:30 - 01:45', '18978.00', '3133.90', '3133.90', '3133.90', '10000.00', '10000.00', '01:45 - 02:00', '18522.50', '3550.80', '3550.80', '3550.80', '10000.00', '10000.00', '3', '02:00 - 02:15', '18556.50', '4577.10', '4577.10', '4577.10', '10000.00', '10000.00', '02:15 - 02:30', '17878.50', '4698.00', '4698.00', '4698.00', '10000.00', '10000.00', '02:30 - 02:45', '17532.90', '4709.60', '4702.30', '4702.30', '10000.00', '10000.00', '02:45 - 03:00', '17203.20', '4797.50'

In [72]:
# Layout of the flat list: one hour label, followed by four time blocks,
# each made of one time-range label and six values.
VALUES_PER_SLOT = 6
SLOT_WIDTH = 1 + VALUES_PER_SLOT                      # 7 cells per time block
SLOTS_PER_HOUR = 4
HOUR_GROUP_WIDTH = 1 + SLOTS_PER_HOUR * SLOT_WIDTH    # 29 cells per hour

result = {}  # hour label -> {time range -> [list of six values]}

for hour_start in range(0, len(temp_array), HOUR_GROUP_WIDTH):
    hour_label = temp_array[hour_start]
    slots = {}
    result[hour_label] = slots
    for slot_no in range(SLOTS_PER_HOUR):
        # Position of this block's time-range label inside the flat list.
        label_pos = hour_start + 1 + slot_no * SLOT_WIDTH
        slot_label = temp_array[label_pos]
        # The values are kept wrapped in an outer list, one entry per block.
        slots[slot_label] = [temp_array[label_pos + 1 : label_pos + SLOT_WIDTH]]

# Print the result dictionary
print(json.dumps(result, indent=4))
data_scraped = result

{
    "1": {
        "00:00 - 00:15": [
            [
                "24120.30",
                "2541.50",
                "2541.50",
                "2541.50",
                "10000.00",
                "10000.00"
            ]
        ],
        "00:15 - 00:30": [
            [
                "22261.80",
                "2501.90",
                "2501.90",
                "2501.90",
                "10000.00",
                "10000.00"
            ]
        ],
        "00:30 - 00:45": [
            [
                "21599.80",
                "2499.20",
                "2499.20",
                "2499.20",
                "10000.00",
                "10000.00"
            ]
        ],
        "00:45 - 01:00": [
            [
                "21134.60",
                "2562.30",
                "2562.30",
                "2562.30",
                "10000.00",
                "10000.00"
            ]
        ]
    },
    "2": {
        "01:00 - 01:15": [
            [
         

# **Storing the scraped data:**

In [73]:
# Define the CSV file path
csv_file_path = "/content/drive/MyDrive/data.csv"

# Open the CSV file in write mode
with open(csv_file_path, mode="w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)

    # Write the header row
    csv_writer.writerow(["Date", "Hour", "Time Range", "Purchase Bid (MW)", "Sell Bid (MW)", "MCV (MW)", "Final Scheduled Volume (MW)", "MCP (Rs/MWh)", "Weighted MCP (Rs/MWh)"])

    # Iterate through the data and write rows
    for key, time_data in data_scraped.items():
        for time_range, values_list in time_data.items():
            for values in values_list:
                csv_writer.writerow([date, key, time_range] + values)
