<a href="https://colab.research.google.com/github/Jeru-John/ML-case-studies/blob/main/webscraping_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Website link to be webscrapped:** [IEX India](https://www.iexindia.com/marketdata/market_snapshot.aspx)

**Importing needed modules:**

In [20]:
import requests
from time import sleep #sleep function- introduce delay btn making requests
from bs4 import BeautifulSoup #library to retrieve a webpage, parse its content, and return a BeautifulSoup object
from google.colab import drive #used to mount your Google Drive into the Colab environment
import json #allows you to work with JSON data

!pip install selenium
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



# **Scraping the title of the page:**

**`cook_soup_from_url()` function: accessing the title of the page**

In [21]:
def cook_soup_from_url(url, parser='lxml',sleep_time=0):
    """Uses requests to retreive webpage and returns a BeautifulSoup made using lxml parser."""

    sleep(sleep_time)
    response = requests.get(url)

    # check status of request
    if response.status_code != 200:
        raise Exception(f'Error: Status_code !=200.\n status_code={response.status_code}')

    c = response.content
    # feed content into a beautiful soup using lxml
    soup = BeautifulSoup(c,'lxml')
    return soup

In [22]:
url = 'https://www.iexindia.com/marketdata/market_snapshot.aspx'
soup = cook_soup_from_url(url,sleep_time=1)

In [23]:
# getting the title
title_list = soup.head.title.contents[0];
title = title_list.text;
print(title);


	Market Snapshot | Indian Energy Exchange Ltd



# **Linking Drive to google Collab to use webdriver:**

Selenium's WebDriver, in web scraping provides a way to automate interactions with websites and web applications. This is particularly useful when websites are built with dynamic content

In [50]:
# Mount Google Drive
drive.mount('/content/drive')

# Create ChromeDriver options
chrome_options = webdriver.ChromeOptions()

# Set the ChromeDriver binary location (this will be different in Colab)
chrome_options.binary_location = '/usr/bin/google-chrome'

# Use the --headless option to run Chrome in headless mode (no GUI)
chrome_options.add_argument('--headless')

# Add arguments to disable extensions and set window size
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--window-size=1920x1080')

# Create a webdriver instance with ChromeDriver options
driver = webdriver.Chrome(options=chrome_options)

# Open the webpage
url = 'https://www.iexindia.com/marketdata/market_snapshot.aspx'
driver.get(url)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
# Scroll down the page by 200 pixels
script = "window.scrollBy(0, 200);"
driver.execute_script(script)

# Wait for the hidden div element to be present
hidden_div = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#VisibleReportContentctl00_InnerContent_reportViewer_ctl09'))
)

# Execute JavaScript to capture the content of the hidden div
hidden_div_content = hidden_div.get_attribute('innerHTML')
# print(hidden_div_content)
# Now you can parse the hidden_div_content to extract the table's data using BeautifulSoup or other methods

# Clean up and close the driver
driver.quit()



# **Parsing the html using BeautifulSoup**

BeautifulSoup() - used to parse the html

In [48]:
# Parse the hidden_div_content using BeautifulSoup
soup = BeautifulSoup(hidden_div_content, 'html.parser')
# print(soup)

# **Accessing the data we need:**

In [29]:
#getting the taget tag
first_table = soup.find('table')
first_tr = first_table.tr.td.table.tbody
tr_in_tbody = first_tr.find_all('tr')
td_in_tbody = tr_in_tbody[7].table.table
target_div = td_in_tbody.find_all('tr')
target_table = target_div[1].table
target_trs = target_table.find_all('tr')
table_head_row = target_trs[1].find_all('td')

In [30]:
col_heads = [];
#getting headings of the columns of the table
for head in table_head_row:
  if (head.div):
    table_heading = head.div.text
    col_heads.append(table_heading)

print(col_heads)

['Date | Hour | Time Block', 'Purchase Bid (MW)', 'Sell Bid (MW)', 'MCV (MW)', 'Final Scheduled Volume (MW)', 'MCP (Rs/MWh) *', 'Weighted MCP (Rs/MWh)']


In [31]:
#setting the data array
data_array = [];
for data in target_trs[2:]:
  data_row = data.find_all('td')
  for row in data_row:
    if (row.div):
      data_array.append(row.div.text)

In [32]:
#getting the date
date = '';
for i in range(1, len(data_array)):
  date = data_array[0]

print(date)

21-08-2023


In [33]:
#setting a temp array with needed datas
temp_array = []
for value in data_array[1:]:
   temp_array.append(value)
print(temp_array)

['1', '00:00 - 00:15', '14017.20', '2995.90', '2595.90', '2595.90', '10000.00', '10000.00', '00:15 - 00:30', '13894.60', '2890.70', '2490.70', '2490.70', '10000.00', '10000.00', '00:30 - 00:45', '13577.20', '2944.60', '2544.57', '2544.57', '10000.00', '10000.00', '00:45 - 01:00', '12852.00', '2827.10', '2427.10', '2427.10', '10000.00', '10000.00', '2', '01:00 - 01:15', '12200.90', '2908.70', '2508.70', '2508.70', '10000.00', '10000.00', '01:15 - 01:30', '11601.30', '2927.10', '2527.10', '2527.10', '10000.00', '10000.00', '01:30 - 01:45', '11973.30', '3087.90', '2587.90', '2587.90', '10000.00', '10000.00', '01:45 - 02:00', '11776.40', '3203.10', '2703.10', '2703.10', '10000.00', '10000.00', '3', '02:00 - 02:15', '10667.30', '4240.10', '3740.10', '3740.10', '10000.00', '10000.00', '02:15 - 02:30', '10661.90', '4362.50', '3862.49', '3862.49', '10000.00', '10000.00', '02:30 - 02:45', '10409.20', '4426.50', '3826.50', '3826.50', '10000.00', '10000.00', '02:45 - 03:00', '10011.60', '4513.80'

In [42]:
result = {}  # Initialize the result dictionary
subset_arr = [1,8,15,22]
for i in range(1, 29):
    key = temp_array[0]
    if key not in result:
        result[key] = {}
    array_i = result[key]

    sub_key = temp_array[subset_arr[0]]

    for j in range(subset_arr[0]-2, subset_arr[0]+5):
        array_i[sub_key] = []
        subarray = temp_array[j-3:j+3]
        array_i[sub_key].append(subarray)

    sub_key = temp_array[subset_arr[1]]

    for j in range(subset_arr[1]-2, subset_arr[1]+5):
        array_i[sub_key] = []
        subarray = temp_array[j-3:j+3]
        array_i[sub_key].append(subarray)

    sub_key = temp_array[subset_arr[2]]

    for j in range(subset_arr[2]-2, subset_arr[2]+5):
        array_i[sub_key] = []
        subarray = temp_array[j-3:j+3]
        array_i[sub_key].append(subarray)

    sub_key = temp_array[subset_arr[3]]

    for j in range(subset_arr[3]-2, subset_arr[3]+5):
        array_i[sub_key] = []
        subarray = temp_array[j-3:j+3]
        array_i[sub_key].append(subarray)

# Print the result dictionary
print(json.dumps(result, indent=4))
data_scraped = result;

{
    "1": {
        "00:00 - 00:15": [
            [
                "14017.20",
                "2995.90",
                "2595.90",
                "2595.90",
                "10000.00",
                "10000.00"
            ]
        ],
        "00:15 - 00:30": [
            [
                "13894.60",
                "2890.70",
                "2490.70",
                "2490.70",
                "10000.00",
                "10000.00"
            ]
        ],
        "00:30 - 00:45": [
            [
                "13577.20",
                "2944.60",
                "2544.57",
                "2544.57",
                "10000.00",
                "10000.00"
            ]
        ],
        "00:45 - 01:00": [
            [
                "12852.00",
                "2827.10",
                "2427.10",
                "2427.10",
                "10000.00",
                "10000.00"
            ]
        ]
    }
}


# **Storing the data being webscraped:**

In [47]:
import csv
data = []
data.append(col_heads)
data.append(data_scraped)
# print(data)

csv_file_path = '/content/drive/MyDrive/data.csv'
  # Your scraped data as a list of dictionaries or lists
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Col1','Col2'])  # Write header
    for row in data_array:
        writer.writerow(row)
        print(row)

21-08-2023
1
00:00 - 00:15
14017.20
2995.90
2595.90
2595.90
10000.00
10000.00
00:15 - 00:30
13894.60
2890.70
2490.70
2490.70
10000.00
10000.00
00:30 - 00:45
13577.20
2944.60
2544.57
2544.57
10000.00
10000.00
00:45 - 01:00
12852.00
2827.10
2427.10
2427.10
10000.00
10000.00
2
01:00 - 01:15
12200.90
2908.70
2508.70
2508.70
10000.00
10000.00
01:15 - 01:30
11601.30
2927.10
2527.10
2527.10
10000.00
10000.00
01:30 - 01:45
11973.30
3087.90
2587.90
2587.90
10000.00
10000.00
01:45 - 02:00
11776.40
3203.10
2703.10
2703.10
10000.00
10000.00
3
02:00 - 02:15
10667.30
4240.10
3740.10
3740.10
10000.00
10000.00
02:15 - 02:30
10661.90
4362.50
3862.49
3862.49
10000.00
10000.00
02:30 - 02:45
10409.20
4426.50
3826.50
3826.50
10000.00
10000.00
02:45 - 03:00
10011.60
4513.80
3913.80
3913.80
10000.00
10000.00
4
03:00 - 03:15
10121.80
4998.50
4328.70
4328.70
10000.00
10000.00
03:15 - 03:30
10129.50
5194.90
4525.10
4525.10
10000.00
10000.00
03:30 - 03:45
9823.60
5374.20
4292.30
4292.30
7529.08
7529.08
03:45 - 0