In [19]:
### Basic Web Scraper in Python
# This notebook demonstrates how to set up and run a simple web scraper using Python's 'requests' and 'BeautifulSoup' libraries.

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [5]:
# Send an HTTP request to the webpage
url = 'https://en.wikipedia.org/wiki/Cloud-computing_comparison'

# Stored under its own name so it cannot be confused with the table column
# `headers` list that a later cell builds.
# NOTE(review): presumably the explicit User-Agent is needed because the
# default client User-Agent was rejected by the site - confirm.
request_headers = {'User-Agent': 'Mozilla/5.0'}

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=request_headers, timeout=10)

print("Status code:", response.status_code)

# Only an exact 200 is treated as success; anything else is reported.
if response.status_code == 200:
    print("Request was successful")
else:
    print("failed to retrieve webpage")

Status code: 200
Request was successful


In [6]:
# Build the parse tree from the raw response bytes with the built-in parser
soup = BeautifulSoup(markup=response.content, features='html.parser')
# Sanity check: show the page's <title> text to confirm the right page loaded
print(soup.title.get_text())

Cloud-computing comparison - Wikipedia


In [7]:
# Find the table containing the data (selecting the first table by default)
table = soup.find('table')

# find() returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError when the table is used in a later cell.
if table is None:
    raise ValueError("No <table> element found on the page")

In [8]:
# Collect every <tr> element of the selected table, in document order
rows = table.find_all(name='tr')

In [9]:
# Column names: whitespace-trimmed text of each <th> cell in the first row
headers = [th_cell.get_text().strip() for th_cell in rows[0].find_all(name='th')]

In [10]:
# Build one list of trimmed cell strings per table row, skipping the first
# row (it holds the column headers).
data = []
for row in rows[1:]:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # Rows without any <td> (e.g. rows made only of <th> cells) yield an
    # empty list; skip them so they do not end up as empty records.
    if cells:
        data.append(cells)

In [12]:
# Assemble the scraped rows into a DataFrame; the header texts extracted
# from the first table row become the column labels
df = pd.DataFrame(
    data=data,
    columns=headers,
)

In [13]:
# Print a plain-text preview of the first five rows as a quick sanity check
print(df.head(n=5))

                      Provider Launched Block storage Assignable IPs  \
0        Google Cloud Platform     2013           Yes             No   
1  Oracle Cloud Infrastructure     2014           Yes            Yes   
2          Amazon Web Services     2006           Yes            Yes   
3                    IBM Cloud     2005           Yes            Yes   
4              Microsoft Azure     2010           Yes            Yes   

  SMTP support IOPS Guaranteed minimum Security  \
0        No[1]                     Yes   Yes[2]   
1          Yes                     Yes   Yes[5]   
2   Partial[6]                     Yes   Yes[7]   
3        No[9]                     Yes  Yes[10]   
4      Yes[11]                     Yes  Yes[12]   

                                           Locations             Notes  
0  br, ca, cl, us, be, ch, de, es, fi, it, po, nl...  SMTP blocked.[4]  
1  us, ca, br, de, uk, nl, ch, in, aus, jp, kr, saud                    
2  us, ca, br, ie, de, uk, cn, sg, au, jp

In [14]:
# Persist the table to disk as CSV, leaving out the integer row index
df.to_csv(
    path_or_buf='scraped_data.csv',
    index=False,
)