# Web Scraping Assignment-4

In [1]:
# import necessary libraries
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
import requests
import pandas as pd
import numpy as np
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException,TimeoutException,StaleElementReferenceException,ElementClickInterceptedException
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup as bs

import warnings
warnings.filterwarnings('ignore')

# 1. Scrape the details of most viewed videos on YouTube from Wikipedia. 
Url = https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos 

You need to find following details:   
A) Rank
B) Name
C) Artist
D) Upload date
E) Views

In [2]:
# open chrome and load wikipedia webpage
driver=webdriver.Chrome()
driver.get('https://en.wikipedia.org/wiki/List_of_most-viewed_YouTube_videos')

In [63]:
# scrape the table from webpage

try:
    table=driver.find_element(By.XPATH,'//table[@class="sortable wikitable sticky-header static-row-numbers sort-under col3center col4right jquery-tablesorter"]')

    data={'Rank':[],'Video_name':[],'Artist':[],'Upload_date':[],'Views':[]}
    rows = table.find_elements(By.TAG_NAME, 'tr')

    rank=0
    for row in rows:
        records=row.find_elements(By.TAG_NAME,'td')

        vid=[]
        art=[]
        date=[]
        view=[]

        for index, record in enumerate(records):

            if index==0:
                text=record.text
                extracted_text=re.findall(r'"([^"]+)"', text)
                vid.append(extracted_text)
            elif index==1:
                art.append(record.text)
            elif index==2:
                view.append(record.text)
            elif index==3:
                date.append(record.text)

        data['Rank'].extend([rank] * len(vid))
        data['Video_name'].extend(vid)
        data['Artist'].extend(art)
        data['Upload_date'].extend(date)
        data['Views'].extend(view)

        rank+= 1

    if all(len(i) == len(data['Video_name']) for i in data.values()):
        wiki_data = pd.DataFrame(data)
        display(wiki_data)
    else:
        print('Arrays are not of same length')
        
except NoSuchElementException:
    print('Table not found in the webpage!')
    
driver.quit()

Unnamed: 0,Rank,Video_name,Artist,Upload_date,Views
0,1,[Baby Shark Dance],Pinkfong Baby Shark - Kids' Songs & Stories,"June 17, 2016",14.32
1,2,[Despacito],Luis Fonsi,"January 12, 2017",8.41
2,3,[Johny Johny Yes Papa],LooLoo Kids - Nursery Rhymes and Children's Songs,"October 8, 2016",6.89
3,4,[Bath Song],Cocomelon - Nursery Rhymes,"May 2, 2018",6.66
4,5,[Shape of You],Ed Sheeran,"January 30, 2017",6.23
5,6,[See You Again],Wiz Khalifa,"April 6, 2015",6.22
6,7,[Wheels on the Bus],Cocomelon - Nursery Rhymes,"May 24, 2018",6.01
7,8,[Phonics Song with Two Words],ChuChu TV Nursery Rhymes & Kids Songs,"March 6, 2014",5.75
8,9,[Uptown Funk],Mark Ronson,"November 19, 2014",5.18
9,10,[Gangnam Style],Psy,"July 15, 2012",5.1


# 2. Scrape the details team India’s international fixtures from bcci.tv.
Url = https://www.bcci.tv/.

You need to find following details:  
A) Series
B) Place
C) Date
D) Time

Note: - From bcci.tv home page you have reach to the international fixture page through code.

In [64]:
# open chrome and load bcci webpage

driver=webdriver.Chrome()
driver.get('https://www.bcci.tv/')

In [93]:
# scrape details from international fixture page

try:
    fixtures=driver.find_element(By.XPATH,'/html/body/header/div[3]/div[2]/ul/div[1]/a[2]').click()
    time.sleep(3)
except TimeoutException:
    print('Timeout occured while loading the page. Refreshing again.')
    driver.refresh()
except NoSuchElementException:
    print('Fixture page not found. Check if the XPath is correct.')
    
else:
    ser=[]
    plc=[]
    dt=[]
    tm=[]

    for s in driver.find_elements(By.XPATH,'//div[@id="match-card"]/div/div/div/h5'):
        ser.append(s.text)
    for p in driver.find_elements(By.XPATH,'//div[@class="match-place ng-scope"]'):
        combined=p.find_element(By.XPATH,'./span[1]').text + ' '+ p.find_element(By.XPATH,'./span[2]').text
        plc.append(combined)

    for d in driver.find_elements(By.XPATH,'//div[@class="match-dates ng-binding"]'):
        dt.append(d.text)
    for t in driver.find_elements(By.XPATH,'//div[@ class="match-time no-margin ng-binding"]'):
        tm.append(t.text)

    if len(ser)==len(plc)==len(dt)==len(tm): 
        bcci_data=pd.DataFrame({'Series':ser,'Place/Venue':plc,'Date':dt,'Time':tm})
        display(bcci_data)
        
driver.quit()

Unnamed: 0,Series,Place/Venue,Date,Time
0,INDIA WOMEN TOUR OF BANGLADESH T20 SERIES 2024,"Sylhet International Cricket Stadium, Sylhet","2 MAY, 2024",3:30 PM IST
1,INDIA WOMEN TOUR OF BANGLADESH T20 SERIES 2024,"Sylhet International Cricket Stadium, Sylhet","6 MAY, 2024",3:30 PM IST
2,INDIA WOMEN TOUR OF BANGLADESH T20 SERIES 2024,"Sylhet International Cricket Stadium, Sylhet","9 MAY, 2024",3:30 PM IST
3,ICC MENS T20 WORLD CUP 2024,"Nassau County International Cricket Stadium, N...","5 JUNE, 2024",8:00 PM IST
4,ICC MENS T20 WORLD CUP 2024,"Nassau County International Cricket Stadium, N...","9 JUNE, 2024",8:00 PM IST
5,ICC MENS T20 WORLD CUP 2024,"Nassau County International Cricket Stadium, N...","12 JUNE, 2024",8:00 PM IST
6,ICC MENS T20 WORLD CUP 2024,"Central Broward Park & Broward County Stadium,...","15 JUNE, 2024",8:00 PM IST
7,INDIA TOUR OF ZIMBABWE 2024,"Harare Sports Club, Harare","6 JULY, 2024",8:00 PM IST
8,INDIA TOUR OF ZIMBABWE 2024,"Harare Sports Club, Harare","7 JULY, 2024",8:00 PM IST


# 3. Scrape the details of State-wise GDP of India from statisticstime.com.
Url = http://statisticstimes.com/

You have to find following details:  
A) Rank
B) State
C) GSDP(18-19)- at current prices
D) GSDP(19-20)- at current prices
E) Share(18-19)
F) GDP($ billion)

Note: - From statisticstimes home page you have to reach to economy page through code.

In [109]:
# open chrome and load statisticstimes webpage

driver=webdriver.Chrome()
driver.get('http://statisticstimes.com/')

In [111]:
# navigate to required page
try:
    econ=driver.find_element(By.XPATH,'/html/body/div[2]/div[1]/div[2]/div[2]/button').click()
    india=driver.find_element(By.XPATH,'//div[@class="dropdown-content"]/a[3]').click()
    time.sleep(2)
    gdp=driver.find_element(By.XPATH,'/html/body/div[2]/div[2]/div[2]/ul/li[1]/a').click()
    time.sleep(2)
except TimeoutException:
    print('Timeout occured while loading the page')

except NoSuchElementException:
    print('Page not found. Check if the XPath is correct.')

In [120]:
# scrape data

rank=[]
state=[]
g1=[]
g2=[]
s1=[]
gdp=[]

try:

    for r in driver.find_elements(By.XPATH,'//table[@id="table_id"]/tbody/tr/td[1]'):
        rank.append(r.text)

    for s in driver.find_elements(By.XPATH,'//table[@id="table_id"]/tbody/tr/td[2]'):
        state.append(s.text)

    for gsdp1 in driver.find_elements(By.XPATH,'//table[@id="table_id"]/tbody/tr/td[4]'):
            g1.append(gsdp1.text)

    for gsdp2 in driver.find_elements(By.XPATH,'//table[@id="table_id"]/tbody/tr/td[5]'):
        g2.append(gsdp2.text)

    for share in driver.find_elements(By.XPATH,'//table[@id="table_id"]/tbody/tr/td[6]'):
        s1.append(share.text)

    for g in driver.find_elements(By.XPATH,'//table[@id="table_id"]/tbody/tr/td[7]'):
        gdp.append(g.text)
        
except NoSuchElementException:
    rank.append('-')
    state.append('-')
    g1.append('-')
    g2.append('-')
    s1.append('-')
    gdp.append('-')
    
if len(rank)==len(state)==len(g1)==len(g2)==len(s1)==len(gdp):
    india_stats=pd.DataFrame({'Rank':rank,'State':state,'GSDP(22-23)': g1, 'GSDP(21-22)':g2,'Share(21-22)':s1,'GDP($ billion)':gdp})
    display(india_stats)
else:
    print('Arrays are not of same length')
    print('Lengths of arrays are: \n Rank: ',len(rank),'\n State: ',len(state),'\n GSDP1: ',len(g1),'\n GSDP2: ',len(g2),'\n Share: ',len(s1),'\n GDP: ',len(gdp))
    
driver.quit()

Unnamed: 0,Rank,State,GSDP(22-23),GSDP(21-22),Share(21-22),GDP($ billion)
0,1,Maharashtra,-,3108022,13.17%,414.928
1,2,Tamil Nadu,2364514,2071286,8.78%,276.522
2,3,Karnataka,2269995,1978094,8.38%,264.08
3,4,Uttar Pradesh,2258040,1975595,8.37%,263.747
4,5,Gujarat,2230609,1928683,8.17%,257.484
5,6,West Bengal,1531758,1329238,5.63%,177.456
6,7,Rajasthan,1365849,1193489,5.06%,159.334
7,8,Andhra Pradesh,1303524,1148471,4.87%,153.324
8,9,Telangana,1308034,1124204,4.76%,150.084
9,10,Madhya Pradesh,1246471,1092964,4.63%,145.913


# 4. Scrape the details of trending repositories on Github.com.
Url = https://github.com/

You have to find the following details:

A) Repository title
B) Repository description
C) Contributors count
D) Language used

In [2]:
# open chrome and load github webpage

driver=webdriver.Chrome()
driver.get('https://github.com/')

In [3]:
# signin to access

signin=driver.find_element(By.XPATH,'/html/body/div[1]/div[1]/header/div/div[2]/div/div/div/a').click()
time.sleep(2)
enter_uid=driver.find_element(By.XPATH,'/html/body/div[1]/div[3]/main/div/div[4]/form/input[2]').send_keys('dummysample098@gmail.com')
enter_pw=driver.find_element(By.XPATH,'/html/body/div[1]/div[3]/main/div/div[4]/form/div/input[1]').send_keys('dummysample852#')
search=driver.find_element(By.XPATH,'/html/body/div[1]/div[3]/main/div/div[4]/form/div/input[13]').click()
time.sleep(4)

In [6]:
# go to trendings section
explore=driver.find_element(By.XPATH,'/html/body/div[1]/div[6]/div/div/div/div/div/aside/div[3]/div/a').click()
time.sleep(3)
trending=driver.find_element(By.XPATH,'/html/body/div[1]/div[6]/main/div[1]/nav/div/a[3]').click()
time.sleep(3)

In [7]:
# scrape links of each repositories

links=[]
try:
    for link in driver.find_elements(By.XPATH,'//a[@class="Link"]'):
        links.append(link.get_attribute('href'))
    print('Length of list:', len(links), '\n',links)
        
except NoSuchElementException:
    print("Specified element not found")
    
except StaleElementReferenceException:
    print("Referenced element is no longer attached to the DOM") 


Length of list: 25 
 ['https://github.com/dnhkng/GlaDOS', 'https://github.com/TheOfficialFloW/PPPwn', 'https://github.com/freeCodeCamp/freeCodeCamp', 'https://github.com/fastfetch-cli/fastfetch', 'https://github.com/hydralauncher/hydra', 'https://github.com/TracecatHQ/tracecat', 'https://github.com/coollabsio/coolify', 'https://github.com/dylanaraps/neofetch', 'https://github.com/ItzCrazyKns/Perplexica', 'https://github.com/Orange-OpenSource/hurl', 'https://github.com/pagefaultgames/pokerogue', 'https://github.com/penpot/penpot', 'https://github.com/yangshun/tech-interview-handbook', 'https://github.com/Dokploy/dokploy', 'https://github.com/codecrafters-io/build-your-own-x', 'https://github.com/xlang-ai/OSWorld', 'https://github.com/NaiboWang/EasySpider', 'https://github.com/trimstray/the-book-of-secret-knowledge', 'https://github.com/pytorch/executorch', 'https://github.com/kelseyhightower/kubernetes-the-hard-way', 'https://github.com/MagicMirrorOrg/MagicMirror', 'https://github.com/a

In [8]:
# scrape data

titles=[]
desc=[]
count=[]
lang=[]

for i in links:
    driver.get(i)
    time.sleep(3)
       
    for title in driver.find_elements(By.XPATH,'//strong[@class="mr-2 flex-self-stretch d-none d-md-block"]'):
        if title:
            titles.append(title.text)
        else:
            titles.append('-')
        
    data_present=False
    for de in driver.find_elements(By.XPATH,'//div[@class="BorderGrid-cell"]/div/p'):
        desc.append(de.text)
        data_present=True
    if not data_present:
        desc.append('-')
    
    data_present=False
    for ct in driver.find_elements(By.XPATH,'//*[contains(text(), "Contributors")]/span'):
        count.append(ct.text)
        data_present = True
    
    if not data_present:
        count.append('-')
        
    data_present=False    
    for la in driver.find_elements(By.XPATH,'//div[contains(., "Languages")]/ul'):
        text=la.text
        replaced=text.replace('\n',' ')
        pattern=re.compile(r'\d+.\d+\%')
        edited_text=pattern.sub(',',replaced)
        lang.append(edited_text)
        data_present=True
    if not data_present:
        lang.append('-')

if len(titles)==len(desc)==len(count)==len(lang):
    github_data=pd.DataFrame({'Trending repository': titles,'Description':desc,'Contributors count':count,'Language used':lang})
    display(github_data)
else:
    print('Arrays are not of same length\n')
    print('Lengths of arrays are: \n Titles: ',len(titles),'\n Description: ',len(desc),'\n Count: ',len(count),'\n Language: ',len(lang))
 

Unnamed: 0,Trending repository,Description,Contributors count,Language used
0,GlaDOS,Personality Core,4,"Python , Jupyter Notebook ,"
1,PPPwn,PPPwn - PlayStation 4 PPPoE RCE,8,"Python , C , Makefile , Assembly , Other ,"
2,freeCodeCamp,freeCodeCamp.org's open-source codebase and cu...,"5,000+","TypeScript , JavaScript , CSS , Dockerfile , E..."
3,fastfetch,"Like neofetch, but much faster because written...",84,"C , C++ , Objective-C , CMake , Shell ,"
4,hydra,Hydra is a game launcher with its own embedded...,13,"TypeScript , Python , Other ,"
5,tracecat,😼 The open source alternative to Tines / Splun...,3,"TypeScript , Python , Other ,"
6,coolify,An open-source & self-hostable Heroku / Netlif...,114,"PHP , Blade , Vue , Shell , Dockerfile , CSS ,..."
7,neofetch,🖼️ A command-line system information tool writ...,207,"Shell , Roff , Makefile ,"
8,Perplexica,Perplexica is an AI-powered search engine. It ...,2,"TypeScript , Other ,"
9,hurl,"Hurl, run and test HTTP requests with plain text.",54,"Rust , Python , Shell , PowerShell , HTML , Ro..."


In [9]:
driver.quit()

# 5. Scrape the details of top 100 songs on billiboard.com. 
Url = https:/www.billboard.com/ 

You have to find the following details:  
A) Song name
B) Artist name
C) Last week rank
D) Peak rank
E) Weeks on board

Note: - From the home page you have to click on the charts option then hot 100-page link through code.

In [17]:
# open chrome and load billboard webpage

driver=webdriver.Chrome()
driver.get('http://www.billboard.com/')

# navigate to hot 100-page link
charts=driver.find_element(By.XPATH,'/html/body/div[3]/header/div/div[2]/div/div/div[2]/div[2]/div/div/nav/ul/li[1]/a').click()
time.sleep(3)
hot=driver.find_element(By.XPATH,'/html/body/div[3]/header/div/div[3]/div/nav/ul/li[1]/a').click()
time.sleep(3)

# scrape data
data=[]
try:
    for d in driver.find_elements(By.XPATH,'//li[@class="lrv-u-width-100p"]'):
        data.append(d.text.split('\n'))
    
    # split data to separate lists and make a dataframe
    song=[item[0] for item in data]
    artist=[item[1] for item in data]
    lwr=[item[2] for item in data]
    pr=[item[3] for item in data]
    wob=[item[4] for item in data]
    top100_music=pd.DataFrame({'Song':song,'Artist':artist,'Last week Rank':lwr,'Peak Rank':pr,'Week on Chart':wob})
    display(top100_music)
    
except Exception as e:
    print(f'Error occured {e}')
    
finally:
    driver.quit()

Unnamed: 0,Song,Artist,Last week Rank,Peak Rank,Week on Chart
0,Fortnight,Taylor Swift Featuring Post Malone,-,1,1
1,Down Bad,Taylor Swift,-,2,1
2,I Can Do It With A Broken Heart,Taylor Swift,-,3,1
3,The Tortured Poets Department,Taylor Swift,-,4,1
4,"So Long, London",Taylor Swift,-,5,1
...,...,...,...,...,...
95,Us Vs. Them,$uicideBoy$,-,96,1
96,Wine Into Whiskey,Tucker Wetmore,84,77,5
97,Spin You Around (1/24),Morgan Wallen,89,24,13
98,Soak City,310babii,82,53,19


# 6. Scrape the details of Highest selling novels.

A) Book name
B) Author name
C) Volumes sold
D) Publisher
E) Genre

Url - https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare

In [38]:
# open chrome and load theguardian webpage

driver=webdriver.Chrome()
driver.get('https://www.theguardian.com/news/datablog/2012/aug/09/best-selling-books-all-time-fifty-shades-grey-compare')
time.sleep(2)

# scrape data

data=[]
try:
    table=driver.find_element(By.XPATH,'//table[@class="in-article sortable"]/tbody') 
    rows=table.find_elements(By.TAG_NAME,'tr')
    for row in rows:
        records=row.find_elements(By.TAG_NAME,'td')
        data.append([record.text for record in records])
            
    novels_data=pd.DataFrame(data,columns=['Rank','Book','Author','Volumes Sold','Publisher','Genre'])
    display(novels_data.drop(columns=['Rank']))
    
except NoSuchElementException:
    print('Table not found / Element not found in the table')
except Exception as e:
    print(f'Error occured: {e}')
    
driver.quit()

Unnamed: 0,Book,Author,Volumes Sold,Publisher,Genre
0,"Da Vinci Code,The","Brown, Dan",5094805,Transworld,"Crime, Thriller & Adventure"
1,Harry Potter and the Deathly Hallows,"Rowling, J.K.",4475152,Bloomsbury,Children's Fiction
2,Harry Potter and the Philosopher's Stone,"Rowling, J.K.",4200654,Bloomsbury,Children's Fiction
3,Harry Potter and the Order of the Phoenix,"Rowling, J.K.",4179479,Bloomsbury,Children's Fiction
4,Fifty Shades of Grey,"James, E. L.",3758936,Random House,Romance & Sagas
...,...,...,...,...,...
95,"Ghost,The","Harris, Robert",807311,Random House,General & Literary Fiction
96,Happy Days with the Naked Chef,"Oliver, Jamie",794201,Penguin,Food & Drink: General
97,"Hunger Games,The:Hunger Games Trilogy","Collins, Suzanne",792187,Scholastic Ltd.,Young Adult Fiction
98,"Lost Boy,The:A Foster Child's Search for the L...","Pelzer, Dave",791507,Orion,Biography: General


# 7. Scrape the details most watched tv series of all time from imdb.com.
Url = https://www.imdb.com/list/ls512407256/ 

You have to find the following details:

A) Name
B) Year span
C) Genre
D) Run time
E) Ratings
F) Votes

In [47]:
# open chrome and load imdb webpage

driver=webdriver.Chrome()
driver.get('https://www.imdb.com/list/ls512407256/')
time.sleep(2)

# scrape data

name=[]
yr=[]
gnr=[]
runtm=[]
rate=[]
vt=[]

try:
    for n in driver.find_elements(By.XPATH,'//h3[@class="lister-item-header"]/a'):
        name.append(n.text)
    for y in driver.find_elements(By.XPATH,'//h3[@class="lister-item-header"]/span[2]'):
        yr.append(y.text)
    for g in driver.find_elements(By.XPATH,'//span[@class="genre"]'):
        gnr.append(g.text)    
    for rt in driver.find_elements(By.XPATH,'//span[@class="runtime"]'):
        runtm.append(rt.text)
    for r in driver.find_elements(By.XPATH,'//div[@class="ipl-rating-widget"]/div/span[2]'):
        rate.append(r.text)
    for v in driver.find_elements(By.XPATH,'//div[@class="lister-item-content"]/p[4]/span[2]'):
        vt.append(v.text)

    if len(name)==len(yr)==len(gnr)==len(runtm)==len(rate)==len(vt):
        imdb_data=pd.DataFrame({'Series Name':name,'Year Span':yr,'Genre':gnr,'Run Time':runtm,'Rating':rate,'Votes':vt})
        display(imdb_data)
    else:
    print('Arrays are not of same length\n')
    print('Lengths of arrays are: \n Series Name: ',len(name),'\n Year Span: ',len(yr),'\n Genre: ',len(gnr),'\n Run Time: ',len(runtm),'\n Rating: ',len(rate),'\n Votes: ',len(vt))
        
except Exceptions as e:
    print(f'Error occured: ', e)
    
driver.quit()    
    

Unnamed: 0,Series Name,Year Span,Genre,Run Time,Rating,Votes
0,Game of Thrones,(2011–2019),"Action, Adventure, Drama",60 min,9.2,2285808
1,Stranger Things,(2016–2025),"Drama, Fantasy, Horror",60 min,8.7,1337079
2,The Walking Dead,(2010–2022),"Drama, Horror, Thriller",45 min,8.1,1082551
3,13 Reasons Why,(2017–2020),"Drama, Mystery, Thriller",60 min,7.5,315629
4,The 100,(2014–2020),"Drama, Mystery, Sci-Fi",43 min,7.6,276243
...,...,...,...,...,...,...
95,True Detective,(2014– ),"Crime, Drama, Mystery",60 min,8.9,656832
96,Teen Wolf,(2011–2017),"Action, Drama, Fantasy",41 min,7.7,163314
97,The OA,(2016–2019),"Drama, Fantasy, Mystery",60 min,7.8,115914
98,The Simpsons,(1989– ),"Animation, Comedy",22 min,8.7,436059


# 8. Details of Datasets from UCI machine learning repositories.
Url = https://archive.ics.uci.edu/  
You have to find the following details:  

A) Dataset name
B) Data type
C) Task
D) Attribute type
E) No of instances
F) No of attribute 
G) Year

Note: - from the home page you have to go to the Show All Dataset page through code.


In [73]:
# open chrome and load uci webpage

driver=webdriver.Chrome()
driver.get('https://archive.ics.uci.edu/')
time.sleep(2)

In [79]:
# navigate to show all datasets

try:
    showall = driver.find_element(By.XPATH,'//a[@class="btn-ghost btn mb-2 text-center text-primary hover:underline"]').click()
except ElementClickInterceptedException:    
    action = ActionChains(driver)
    showall = driver.find_element(By.XPATH,'//a[@ class="btn-ghost btn mb-2 text-center text-primary hover:underline"]')
    action.move_to_element(showall).perform()
    showall.click()
time.sleep(3)


In [81]:
# scrape individual links

links=[]
try:
    for link in driver.find_elements(By.XPATH,'//div[@class="relative col-span-8 sm:col-span-7"]/h2/a'):
        links.append(link.get_attribute('href'))
    print('Length of list:', len(links), '\n',links[:5])
        
except NoSuchElementException:
    print("Specified element not found")
    
except StaleElementReferenceException:
    print("Referenced element is no longer attached to the DOM") 


Length of list: 10 
 ['https://archive.ics.uci.edu/dataset/53/iris', 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'https://archive.ics.uci.edu/dataset/602/dry+bean+dataset', 'https://archive.ics.uci.edu/dataset/545/rice+cammeo+and+osmancik', 'https://archive.ics.uci.edu/dataset/2/adult']


In [71]:
# scrape data

dtst=[]
dtype=[]
task=[]
attp=[]
inst=[]
atno=[]
year=[]

try:
    for i in links:
        driver.get(i)
        time.sleep(2)

        for ds in driver.find_elements(By.XPATH,'//div[@class="relative flex flex w-full items-center gap-4 bg-primary p-2"]/div[2]/div/h1'):
            dtst.append(ds.text)
        for dt in driver.find_elements(By.XPATH,'//div[@class="relative flex flex-col gap-4 bg-base-100 p-4 shadow"]/div[2]/div[1]/p'):
            dtype.append(dt.text)
        for t in driver.find_elements(By.XPATH,'//div[@class="relative flex flex-col gap-4 bg-base-100 p-4 shadow"]/div[2]/div[3]/p'):
            task.append(t.text)
        for at in driver.find_elements(By.XPATH,'//div[@class="relative flex flex-col gap-4 bg-base-100 p-4 shadow"]/div[2]/div[4]/p'):
            attp.append(at.text)
        for ins in  driver.find_elements(By.XPATH,'//div[@class="relative flex flex-col gap-4 bg-base-100 p-4 shadow"]/div[2]/div[5]/p'):
            inst.append(ins.text)
        for an in driver.find_elements(By.XPATH,'//div[@class="relative flex flex-col gap-4 bg-base-100 p-4 shadow"]/div[2]/div[6]/p'):
            atno.append(an.text)
        for yr in driver.find_elements(By.XPATH,'//div[@class="relative flex flex w-full items-center gap-4 bg-primary p-2"]/div[2]/h2'):
            yr_text=yr.text
            extracted_year=re.findall(r'\d{4}',yr_text)
            year.append(extracted_year)
            
    if len(dtst)==len(dtype)==len(task)==len(attp)==len(inst)==len(atno)==len(year):
        dataset_data=pd.DataFrame({'Dataset':dtst,'Datatype':dtype,'Task':task,'Attribute Type':attp,'No. of Instances':inst,'Attribute Number':atno,'Year':year})
        display(dataset_data)
    else:
        print('Arrays are not of same length\n')
        print('Lengths of arrays are: \n Dataset: ',len(dtst),'\n Datatype: ',len(dtype),'\n Task: ',len(task))
            
except NoSuchElementException:
    dtst.append('-')
    dtype.append('-')
    task.append('-')
    attp.append('-')
    inst.append('-')
    atno.append('-')
    year.append('-')

Unnamed: 0,Dataset,Datatype,Task,Attribute Type,No. of Instances,Attribute Number,Year
0,PhiUSIIL Phishing URL (Website),Tabular,Classification,"Real, Categorical, Integer",235795,54,[2024]
1,RT-IoT2022,"Tabular, Sequential, Multivariate","Classification, Regression, Clustering","Real, Categorical",123117,83,[2024]
2,Regensburg Pediatric Appendicitis,"Tabular, Image",Classification,"Real, Categorical, Integer",782,53,[2023]
3,National Poll on Healthy Aging (NPHA),Tabular,Classification,Categorical,714,14,[2023]
4,Infrared Thermography Temperature,Tabular,Regression,"Real, Categorical",1020,33,[2023]
5,Jute Pest,Image,"Classification, Other",Categorical,7235,17,[2023]
6,Differentiated Thyroid Cancer Recurrence,Tabular,Classification,"Real, Categorical, Integer",383,16,[2023]
7,Forty Soybean Cultivars from Subsequent Harvests,Tabular,"Classification, Regression, Clustering, Other","Real, Categorical, Integer",320,11,[2023]
8,Recipe Reviews and User Feedback,"Tabular, Other","Classification, Other","Real, Categorical, Integer",18182,15,[2023]
9,RealWaste,Image,Classification,-,4752,-,[2023]


In [82]:
driver.quit()