# Web Scraping using BeautifulSoup 

# Scrape and display IMDB’s Top rated 100 Indian movies’ data https://www.imdb.com/list/ls056092300/ (i.e. name, rating, year of release) 

In [78]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

In [79]:
def movies(url):
    response=requests.get(url)
    
    if response.status_code==200:
        print('Successful response\n','*'*25)
        
        msoup=bs(response.content)
        name=[]
        for i in msoup.find_all('h3',class_='lister-item-header'):
            ii=i.text.split('\n')[2]
            name.append(ii)
        
        rating=[]
        for j in msoup.find_all('div',class_="ipl-rating-star small"):
            jj=j.text.strip('\n')
            rating.append(jj)
    
        year=[]
        for k in msoup.find_all('span',class_="lister-item-year text-muted unbold"):
                kk=k.text.strip()
                res=re.findall(r'\d{4}', kk)
                year.append(res)
        if len(name)==len(rating)==len(year):
            print('Data is equal in all columns\n','*'*25)
            names=pd.DataFrame(name,columns=['Indian movie title'])
            ratings=pd.DataFrame(rating,columns=['Rating'])
            years=pd.DataFrame(year,columns=['Year of Release'])
            
            Indian_movies=pd.concat([names,ratings,years], axis=1)
            print('IMDB’s Top rated 100 Indian movies: \n')
            display(Indian_movies)
        else:
            print('Data is NOT equal in all columns')
    
    
    else:
        print('html response not successful')
        

In [68]:
movies(r'https://www.imdb.com/list/ls056092300/')

Successful response
 *************************
Data is equal in all columns
 *************************
IMDB’s Top rated 100 Indian movies: 



Unnamed: 0,Indian movie title,Rating,Year of Release
0,Ship of Theseus,8,2012
1,Iruvar,8.4,1997
2,Kaagaz Ke Phool,7.8,1959
3,Lagaan: Once Upon a Time in India,8.1,2001
4,Pather Panchali,8.2,1955
...,...,...,...
95,Apur Sansar,8.4,1959
96,Kanchivaram,8.2,2008
97,Monsoon Wedding,7.3,2001
98,Black,8.1,2005


# Scrape house details from nobroker website. Enter three localities which are Indira Nagar, Jayanagar, Rajaji Nagar and the scraped data should include house title, location, area, EMI and price

In [158]:
def house(url):
    p=requests.get(url)
    
    if p.status_code==200:
        print('Successful response\n','*'*35)
        
        hsoup=bs(p.content,'html.parser')
        
        title=[]
        location=[]
        area=[]
        rent=[]
        
        for i in hsoup.find_all('a',{'class': 'overflow-hidden overflow-ellipsis whitespace-nowrap max-w-80pe po:max-w-full'}):
                ii=i.get_text(strip=True)
                if ii:
                    title.append(ii)
                else:
                    title.append('NA')
                                
        for i in hsoup.find_all('a',{'class': 'overflow-hidden overflow-ellipsis whitespace-nowrap max-w-80pe po:max-w-full'}):
                ii=i.get_text()
                loc=re.search('Jaya ?nagar|Rajaji ?nagar|Indira ?nagar',ii,re.IGNORECASE)
                if loc:
                    location.append(loc.group())
                else:
                    location.append('NA')
                        
                                              
        for i in hsoup.find_all('div',class_='mt-0.5p overflow-hidden overflow-ellipsis whitespace-nowrap max-w-70 text-gray-light leading-4 po:mb-0.1p po:max-w-95'):
                ii=i.get_text(strip=True)
                if ii:
                    area.append(ii)
                else:
                    area.append('NA')
                          
            
        for i in hsoup.find_all('div',{'id':'minimumRent'}):
            ii=i.get_text(strip=True)
            iii=re.search(r'₹ *([\d,]+)',ii)
            if iii:
                rent.append(iii.group(1))
            else:
                rent.append('NA')
               
        titles=pd.DataFrame(title,columns=['House title'])
        locations=pd.DataFrame(location,columns=['House location'])
        Area=pd.DataFrame(area,columns=['House area'])
        Rent=pd.DataFrame(rent,columns=['House rent'])
            
        No_broker=pd.concat([titles,locations,Area,Rent], axis=1)
        print('House details from NoBroker: \n')
        display(No_broker)
    
  
    else:
        print('html response not successful')
           
      

In [159]:
house('https://www.nobroker.in/property/rent/bangalore/multiple?searchParam=W3sibGF0IjoxMi45OTY2MDg3LCJsb24iOjc3LjU0NTQ2MiwicGxhY2VJZCI6IkNoSUpEOUE3MUpBOXJqc1JOeWdnZFU4cDdmWSIsInBsYWNlTmFtZSI6IkluZGlyYW5hZ2FyIiwic2hvd01hcCI6ZmFsc2V9LHsibGF0IjoxMi45MzA3NzM1LCJsb24iOjc3LjU4MzgzMDIsInBsYWNlSWQiOiJDaElKMmRkbFo1Z1ZyanNSaDFCT0FhZi1vcnMiLCJwbGFjZU5hbWUiOiJKYXlhbmFnYXIiLCJzaG93TWFwIjpmYWxzZX0seyJsYXQiOjEyLjk5ODE3MzIsImxvbiI6NzcuNTUzMDQ0NTk5OTk5OTksInBsYWNlSWQiOiJDaElKeGZXNERQTTlyanNSS3NOVEctNXBfUVEiLCJwbGFjZU5hbWUiOiJSYWphamluYWdhciIsInNob3dNYXAiOmZhbHNlfV0=&radius=2.0&sharedAccomodation=0&city=bangalore&locality=Indiranagar,Jayanagar,Rajajinagar&isMetro=false')

Successful response
 ***********************************
House details from NoBroker: 



Unnamed: 0,House title,House location,House area,House rent
0,1 BHK House for Rent In Rajajinagar,Rajajinagar,"Independent House, Mahakavi kuvempu metro station",9000.0
1,2 BHK Flat In Sas Residency Jayanagar for Rent...,Jayanagar,23 5th Cross Road,29500.0
2,1 BHK House for Rent In Rajajinagar,Rajajinagar,"Independent House, Mariyappanapalya Park, Mari...",15000.0
3,3 BHK House for Rent In Jayanagar,Jayanagar,"Independent House, 3rd Main Road",36000.0
4,3 BHK Flat In Ibbani Apartment for Rent In Ja...,Jayanagar,"Old No 321A, New no 31, 3rd floor, 5th cross, ...",53000.0
5,2 BHK House for Rent In Jayanagar 4th 't' Block,Jayanagar,"Independent House, near National Public School...",25000.0
6,1 RK House for Rent In 12th B Main Road,,"Independent House, #150, 12 B main road , near...",7500.0
7,3 BHK House for Rent In Jayanagar 5th Block,Jayanagar,"Independent House, DK Naik Nagar, Jayanagar, 9...",34000.0
8,3 BHK Flat In Standaloine Building for Rent I...,Jayanagar,LIC Colony Near City Education Society School,30000.0
9,1 BHK Flat In Sb for Rent In Rajajinagar,Rajajinagar,"Bhasyam Circle, near MedPlus Bashyam Circle",10000.0


# Scrape first 10 product details which include product name , price , image URL from https://www.bewakoof.com/bestseller?sort=popular .

In [308]:
def bwkf(url):
    k=requests.get(url)
    if k.status_code==200:
        print('HTML response successful!\n','*'*50)
        bsoup=bs(k.content,'html.parser')
        
        pdtname=[]
        pdtimg=[]
        pdtprice=[]
        
        for i in bsoup.find_all('div',class_='productNaming'):
            name=i.find('h2')
            if name:
                pdtname.append(name.text.strip())
                
        
        for i in bsoup.find_all('div',class_='discountedPriceText'):
            pdtprice.append(i.text)
        
        for i in bsoup.find_all('img',class_='productImgTag'):
            pdtimg.append(i['src'])
        
        pdtnames=pd.DataFrame(pdtname,columns=['Product name'])
        pdtprices=pd.DataFrame(pdtprice,columns=['Price'])
        pdtimgs=pd.DataFrame(pdtimg,columns=['Image url'])
        
        bewakoof=pd.concat([pdtnames,pdtprices,pdtimgs],axis=1)
        print('Product details of popular clothing from Bewakoof website: \n')
        return bewakoof
        
    else:
        print('HTML response not successful')



In [309]:
bwkf('https://www.bewakoof.com/bestseller?sort=popular')

HTML response successful!
 **************************************************
Product details of popular clothing from Bewakoof website: 



Unnamed: 0,Product name,Price,Image url
0,Men's White T-shirt,₹499,https://images.bewakoof.com/t640/men-s-white-t...
1,Women's Blue Round in Bugs Graphic Printed Ove...,₹487,https://images.bewakoof.com/t640/women-s-blue-...
2,Men's Black T-Shirts,₹439,https://images.bewakoof.com/t640/men-s-black-t...
3,Women's Brown Cargo Pants,₹1299,https://images.bewakoof.com/t640/women-s-strai...
4,Men's Black T-shirt,₹499,https://images.bewakoof.com/t640/men-s-black-t...
5,Men's Grey Eternity Graphic Printed Oversized ...,₹649,https://images.bewakoof.com/t640/men-s-grey-et...
6,Men's Black Eternity Graphic Printed T-shirt,₹379,https://images.bewakoof.com/t640/men-s-black-e...
7,Women's Black Straight Cargo Pants,₹1499,https://images.bewakoof.com/t640/women-s-black...
8,Women's Blue Straight Cargo Pants,₹1299,https://images.bewakoof.com/t640/women-s-blue-...
9,Men's Black Oversized T-shirt,₹649,https://images.bewakoof.com/t640/men-s-black-o...


# Scrape a) headings b) date c) News link from https://www.cnbc.com/world/?region=world 

In [306]:
def cnbc(url):
    c=requests.get(url)
    if c.status_code==200:
        print('HTML response successful\n','-'*40)
        csoup=bs(c.content)
        
        heading=[]
        link=[]
        date=[]
        
        for i in csoup.find_all('div', class_='RiverHeadline-headline'):
            
            heading.append(i.text.strip())
            
            lk=i.find('a')['href']
            link.append(lk)
            
            dt=re.search(r'(\d{4}/\d{2}/\d{2})',lk)
            if dt:
                date.append(dt.group())
                
            headings=pd.DataFrame(heading,columns=['News heading'])
            links=pd.DataFrame(link,columns=['News link'])
            dates=pd.DataFrame(date,columns=['News date'])
            cnbc_news=pd.concat([headings,links,dates],axis=1)
        print('News details published in CNBC website this week: \n')
        return cnbc_news 
    else:
        print('HTML response NOT successful\n')
        
                                

In [307]:
cnbc('https://www.cnbc.com/world/?region=world')

HTML response successful
 ----------------------------------------
News details published in CNBC website this week: 



Unnamed: 0,News heading,News link,News date
0,Russia says U.S. defended Kyiv too quickly aft...,https://www.cnbc.com/2024/03/27/ukraine-war-li...,2024/03/27
1,"Want to invest $10,000? Here are 5 ETFs to buy...",/pro/,2024/03/27
2,China's Xi meets U.S. executives as businesses...,https://www.cnbc.com/2024/03/27/chinas-xi-meet...,2024/03/27
3,Mumbai overtakes Beijing to become Asia's bill...,https://www.cnbc.com/2024/03/27/mumbai-overtak...,2024/03/27
4,India's love for gold has done little for its ...,https://www.cnbc.com/2024/03/27/larry-fink-say...,2024/03/27
5,U.S. or Europe – where's best to invest? CNBC ...,/pro/,2024/03/26
6,Nvidia's partners and more: BofA picks key glo...,/pro/,2024/03/27
7,This small Southeast Asian country has 'engine...,https://www.cnbc.com/2024/03/27/singapore-has-...,2024/03/27
8,Trump is selling $60 Bibles with 'God Bless th...,https://www.cnbc.com/2024/03/26/trump-selling-...,2024/03/27
9,"Class bullies may earn more in middle age, stu...",https://www.cnbc.com/2024/03/27/class-bullies-...,2024/03/26


# Scrape a) Paper title b) date c) Author from https://www.keaipublishing.com/en/journals/artificial-intelligence-in-agriculture/most-downloaded- articles/

In [294]:
def keai(url):
    j=requests.get(url)
    if j.status_code==200:
        print('HTML response successful\n','-'*40)
        jsoup=bs(j.content)
        
        title=[]
        author=[]
        date=[]
        
        for i in jsoup.find_all('h2',class_='h5 article-title'):
            ii=i.get_text(strip=True)
            title.append(ii)
            
        for j in jsoup.find_all('p',class_='article-date'):
            date.append(j.text)
            
        for k in jsoup.find_all('p',class_='article-authors'):
            author.append(k.text.strip())
            
        if len(author)==len(title)==len(date):
                titles=pd.DataFrame(title,columns=['Article title'])
                authors=pd.DataFrame(author,columns=['Article authors'])
                dates=pd.DataFrame(date,columns=['Pulication date'])
                keai_journals=pd.concat([titles,authors,dates],axis=1)
                print('Most downloaded articles in "Artificial Intelligence in Agriculture" category: \n')
               
                return keai_journals
    
        else:
               print('Columns not equal in size')
    else:
               print('HTML response NOT successful\n')
               

In [295]:
keai('https://www.keaipublishing.com/en/journals/artificial-intelligence-in-agriculture/most-downloaded-articles/')

HTML response successful
 ----------------------------------------
Most downloaded articles in "Artificial Intelligence in Agriculture" category: 



Unnamed: 0,Article title,Article authors,Pulication date
0,Implementation of artificial intelligence in a...,Tanha Talaviya | Dhara Shah | Nivedita Patel...,2020
1,A comprehensive review on automation in agricu...,Kirtan Jha | Aalap Doshi | Poojan Patel | M...,June 2019
2,Review of agricultural IoT technology,Jinyuan Xu | Baoxing Gu | Guangzhao Tian,2022
3,Automation and digitization of agriculture usi...,A. Subeesh | C.R. Mehta,2021
4,Real-time hyperspectral imaging for the in-fie...,Zongmei Gao | Yuanyuan Shao | Guantao Xuan |...,2020
5,Applications of electronic nose (e-nose) and e...,Juzhong Tan | Jie Xu,2020
6,Fruit ripeness classification: A survey,Matteo Rizzo | Matteo Marcuzzo | Alessandro ...,March 2023
7,A review of imaging techniques for plant disea...,Vijai Singh | Namita Sharma | Shikha Singh,2020
8,Deep learning based computer vision approaches...,V.G. Dhanya | A. Subeesh | N.L. Kushwaha | ...,2022
9,Comparison of CNN-based deep learning architec...,Md Taimur Ahad | Yan Li | Bo Song | Touhid ...,September 2023
