<a href="https://colab.research.google.com/github/Mayreeobi/2023-Data-Science-Salary-Analysis/blob/main/Web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# What is Web Scraping?  
Web Scraping is the process of using automation tools to retrieve data from websites.  
It allows you to gather vast amounts of data from websites that would be tedious or impossible to collect manually.

Business use cases:
- Competitor pricing analysis
- Industry trends
- Customer review analysis
- Product price monitoring
- News aggregation

## Python Libraries for Web Scraping
1. Requests
2. Beautiful Soup

In [None]:
pip install requests bs4



In [None]:
pip install pandas



In [None]:
#Importing
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Downloading HTML content from a web page with Requests

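HTTP methods and requests (this notebook only uses GET):
   - GET: retrieve a resource
   - POST: submit data to the server
   - PUT: create or replace a resource
   - DELETE: remove a resource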

In [None]:
# Fetch the listing page. The timeout keeps the cell from hanging forever if the
# site never answers.
r = requests.get('https://books.toscrape.com/index.html', timeout=10)

In [None]:
# HTTP status of the response; 200 means the request succeeded.
r.status_code

200

In [None]:
# Preview only the start of the HTML; the full page is far too long to display.
r.text[:500]



#### Parsing downloaded HTML content with Beautiful Soup

In [None]:
# Pass the raw bytes so Beautiful Soup works out the encoding from the document
# itself, and name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(r.content, 'html.parser')

In [None]:
# Preview only the start of the page's visible text instead of dumping all of it.
soup.text[:500]



In [None]:
# soup.find('article', class_='product_pod') #find, #find_all

In [None]:
# Collect every <article class="product_pod"> element on the listing page.
products = soup.find_all('article', class_='product_pod')

In [None]:
# Take the href of the first <a> found inside each product element.
links = [card.find('a').get('href') for card in products]

In [None]:
links

['catalogue/a-light-in-the-attic_1000/index.html',
 'catalogue/tipping-the-velvet_999/index.html',
 'catalogue/soumission_998/index.html',
 'catalogue/sharp-objects_997/index.html',
 'catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
 'catalogue/the-requiem-red_995/index.html',
 'catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html',
 'catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html',
 'catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html',
 'catalogue/the-black-maria_991/index.html',
 'catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html',
 'catalogue/shakespeares-sonnets_989/index.html',
 'catalogue/set-me-free_988/index.html',
 'catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html',
 'catalogue/rip-it-up-and-start-again_986/index.html',
 'catalogue/our-band-could-be-your-life-scene

In [None]:
# The scraped hrefs are relative, so resolve each one against the site root.
# urljoin handles slashes and relative segments that plain concatenation would not.
base_url = 'https://books.toscrape.com/'
full_links = [urljoin(base_url, link) for link in links]

In [None]:
full_links

['https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
 'https://books.toscrape.com/catalogue/soumission_998/index.html',
 'https://books.toscrape.com/catalogue/sharp-objects_997/index.html',
 'https://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
 'https://books.toscrape.com/catalogue/the-requiem-red_995/index.html',
 'https://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html',
 'https://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html',
 'https://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html',
 'https://books.toscrape.com/catalogue/the-black-maria_991/index.html',
 'https://books.toscrape.com/catalogue/starving-hearts-triangular-trade-tr

In [None]:
# Fetch and parse the first product page inline: download_data() is only defined
# in a later cell, so calling it here raises NameError on a fresh kernel
# (Restart & Run All).
product_response = requests.get(full_links[0], timeout=10)
# Raw bytes + explicit parser: lets Beautiful Soup detect the page encoding itself.
soup = BeautifulSoup(product_response.content, 'html.parser')

In [None]:
# The page's <h1> text is the book title.
soup.find('h1').text

'A Light in the Attic'

In [None]:
# Price as shown on the page, located through its price_color CSS class.
soup.find(class_='price_color').text

'Â£51.77'

In [None]:
# Stock information; the raw text still carries the markup's newlines and indentation.
soup.find(class_='availability').text

'\n\n    \n        In stock (22 available)\n    \n'

In [None]:
# get('class') returns the tag's class list; its second entry spells out the
# rating (e.g. 'Three').
soup.find('p', class_='star-rating').get('class')[1]

'Three'

In [None]:
# Locate the text node with this exact label, then read the element parsed right
# after it (here, the cell holding the value).
soup.find(string='Price (excl. tax)').next_element.text

'Â£51.77'

In [None]:
# Same label-then-next-element lookup, for the tax-inclusive price.
soup.find(string='Price (incl. tax)').next_element.text

'Â£51.77'

In [None]:
# Same label-then-next-element lookup, for the tax amount.
soup.find(string='Tax').next_element.text

'Â£0.00'

In [None]:
# next_siblings yields every node that follows the <th> inside its parent,
# including whitespace-only text nodes -- hence the blank lines printed around
# the <td>.
for ele in soup.find('th', string='Number of reviews').next_siblings:
  print(ele)



<td>0</td>




In [None]:
# The third breadcrumb item holds the book's category (here 'Poetry'), still
# wrapped in newlines.
soup.find(class_='breadcrumb').find_all('li')[2].text

'\nPoetry\n'

In [None]:
# Loop form of the same traversal, kept for reference:
# for row in soup.find('table').find_all('tr'):
#   print(row.find('th').text)
#   print(row.find('td').text)

# Map each table row's header cell (<th>) text to its value cell (<td>) text.
{row.find('th').text:row.find('td').text for row in soup.find('table').find_all('tr')}

{'UPC': 'a897fe39b1053632',
 'Product Type': 'Books',
 'Price (excl. tax)': 'Â£51.77',
 'Price (incl. tax)': 'Â£51.77',
 'Tax': 'Â£0.00',
 'Availability': 'In stock (22 available)',
 'Number of reviews': '0'}

In [None]:
full_links[0]

'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'

In [None]:
def download_data(url):
  """Download a page and return it parsed as a BeautifulSoup document.

  Args:
    url: Address of the page to fetch.

  Returns:
    BeautifulSoup: The parsed HTML of the response body.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.Timeout: If the server does not respond within the timeout.
  """
  r = requests.get(url, timeout=10)

  # Fail loudly on an error status instead of silently parsing an error page.
  r.raise_for_status()

  # Hand over the raw bytes rather than r.text: r.text is decoded with an encoding
  # guessed from the HTTP headers, which can garble non-ASCII characters such as
  # the pound sign. Beautiful Soup detects the encoding from the document itself.
  # The parser is named explicitly so results do not depend on installed extras.
  return BeautifulSoup(r.content, 'html.parser')

In [None]:
def get_data(soup):
  """Extract one book's details from its parsed product page.

  Args:
    soup: Parsed product page (for example the value returned by download_data).

  Returns:
    dict: The keys 'title', 'price', 'genre', 'availability' and 'ratings',
    followed by one entry per row of the page's first table, keyed by the row's
    header-cell text. Text values have surrounding whitespace removed.
  """
  def clean_text(tag):
    # The markup's newlines and indentation leak into .text; trim them here so
    # every record is clean at the source.
    return tag.text.strip()

  title = clean_text(soup.find('h1'))
  price = clean_text(soup.find(class_='price_color'))
  availability = clean_text(soup.find(class_='availability'))
  # The third breadcrumb item is used as the genre.
  genre = clean_text(soup.find(class_='breadcrumb').find_all('li')[2])
  # The rating is the second entry of the star-rating tag's class list.
  ratings = soup.find('p', class_='star-rating').get('class')[1]
  more_details = {
      clean_text(row.find('th')): clean_text(row.find('td'))
      for row in soup.find('table').find_all('tr')
  }

  # NOTE: on books.toscrape.com the table includes an 'Availability' row, whose key
  # differs from 'availability' below only by case; both entries are kept.
  result = {
      'title': title,
      'price': price,
      'genre': genre,
      'availability': availability,
      'ratings': ratings,
  }

  result.update(more_details)

  return result





In [None]:
# Try the two helpers on the first product page. The intermediate value is a parsed
# page, not a URL, so it is named accordingly.
first_page_soup = download_data(full_links[0])
get_data(first_page_soup)

{'title': 'A Light in the Attic',
 'price': 'Â£51.77',
 'genre': '\nPoetry\n',
 'availability': '\n\n    \n        In stock (22 available)\n    \n',
 'ratings': 'Three',
 'UPC': 'a897fe39b1053632',
 'Product Type': 'Books',
 'Price (excl. tax)': 'Â£51.77',
 'Price (incl. tax)': 'Â£51.77',
 'Tax': 'Â£0.00',
 'Availability': 'In stock (22 available)',
 'Number of reviews': '0'}

In [None]:
# Scrape every product page linked from the listing and keep one record (dict)
# per book.
data = []
for link in full_links:
  page_soup = download_data(link)
  data.append(get_data(page_soup))

In [None]:
# Build the table from the list of per-book dicts; the dict keys become the columns.
df = pd.DataFrame(data)

In [None]:
df.columns

Index(['title', 'price', 'genre', 'availability', 'ratings', 'UPC',
       'Product Type', 'Price (excl. tax)', 'Price (incl. tax)', 'Tax',
       'Availability', 'Number of reviews'],
      dtype='object')

## Data Cleaning

### Rename columns to lowercase


In [None]:
# Map every column name to its lowercase form.
# NOTE: 'availability' and 'Availability' both map to 'availability', so the
# renamed frame will contain two columns with that name.
new_cols = {name: name.lower() for name in df.columns}

In [None]:
# Reassign rather than using inplace=True: the resulting frame is the same, but
# the cell reads as an explicit transformation.
df = df.rename(columns=new_cols)

In [None]:
df.head()

Unnamed: 0,title,price,genre,availability,ratings,upc,product type,price (excl. tax),price (incl. tax),tax,availability.1,number of reviews
0,A Light in the Attic,Â£51.77,\nPoetry\n,\n\n \n In stock (22 available)\n \n,Three,a897fe39b1053632,Books,Â£51.77,Â£51.77,Â£0.00,In stock (22 available),0
1,Tipping the Velvet,Â£53.74,\nHistorical Fiction\n,\n\n \n In stock (20 available)\n \n,One,90fa61229261140a,Books,Â£53.74,Â£53.74,Â£0.00,In stock (20 available),0
2,Soumission,Â£50.10,\nFiction\n,\n\n \n In stock (20 available)\n \n,One,6957f44c3847a760,Books,Â£50.10,Â£50.10,Â£0.00,In stock (20 available),0
3,Sharp Objects,Â£47.82,\nMystery\n,\n\n \n In stock (20 available)\n \n,Four,e00eb4fd7b871a48,Books,Â£47.82,Â£47.82,Â£0.00,In stock (20 available),0
4,Sapiens: A Brief History of Humankind,Â£54.23,\nHistory\n,\n\n \n In stock (20 available)\n \n,Five,4165285e1663650f,Books,Â£54.23,Â£54.23,Â£0.00,In stock (20 available),0
