<a href="https://colab.research.google.com/github/fmhirwa/ML-Pipeline_Task1/blob/main/APIs_and_Web_scraping_Peer_Learning_Activity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Task 0

In [1]:
#!/usr/bin/env python3
"""Code to return the list of ships"""

import requests

def availableShips(passengerCount):
    """ List of ships

    Arguments:
        passengerCount (int): number of passengers
    """
    res = requests.get('https://swapi-api.alx-tools.com/api/starships')

    output = []
    while res.status_code == 200:
        res = res.json()
        for ship in res['results']:
            passengers = ship['passengers'].replace(',', '')
            try:
                if int(passengers) >= passengerCount:
                    output.append(ship['name'])
            except ValueError:
                pass
        if res['next']:
            res = requests.get(res['next'])
        else:
            break
    return output

# Testing; For colab only
# Get ships with at least 10 passengers
ships = availableShips(10)
print(ships)

['CR90 corvette', 'Sentinel-class landing craft', 'Death Star', 'Executor', 'Rebel transport', 'Imperial shuttle', 'EF76 Nebulon-B escort frigate', 'Calamari Cruiser', 'Republic Cruiser', 'Droid control ship', 'J-type diplomatic barge', 'AA-9 Coruscant freighter', 'Republic Assault ship', 'Solar Sailer', 'Trade Federation cruiser', 'Theta-class T-2c shuttle', 'Republic attack cruiser']


Scrapping

In [18]:
!pip install requests beautifulsoup4 pandas




In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL to scrape data from
url = 'https://www.scrapethissite.com/pages/forms/'

# Send a GET request to fetch the page content
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table containing data
table = soup.find('table')

# Extract headers
headers = [header.text.strip() for header in table.find_all('th')]

# Extract rows
rows = []
for row in table.find_all('tr')[1:]:
    cols = row.find_all('td')
    data = [col.text.strip() for col in cols]
    rows.append(data)

# Create a DataFrame using the headers and rows
df = pd.DataFrame(rows, columns=headers)

# Save the DataFrame to a CSV file
df.to_csv('scraped_data.csv', index=False)

# Display the DataFrame in the output
print("Scraped Data:")
print(df)


Scraped Data:
                Team Name  Year Wins Losses OT Losses  Win % Goals For (GF)  \
0           Boston Bruins  1990   44     24             0.55            299   
1          Buffalo Sabres  1990   31     30            0.388            292   
2          Calgary Flames  1990   46     26            0.575            344   
3      Chicago Blackhawks  1990   49     23            0.613            284   
4       Detroit Red Wings  1990   34     38            0.425            273   
5         Edmonton Oilers  1990   37     37            0.463            272   
6        Hartford Whalers  1990   31     38            0.388            238   
7       Los Angeles Kings  1990   46     24            0.575            340   
8   Minnesota North Stars  1990   27     39            0.338            256   
9      Montreal Canadiens  1990   39     30            0.487            273   
10      New Jersey Devils  1990   32     33              0.4            272   
11     New York Islanders  1990   25  

Amazon Scrapping

In [None]:
# Florent: Amazon scrapping; Amazon blocks after repeated use

In [36]:
import requests
from bs4 import BeautifulSoup
import os

# Create a directory to save images
os.makedirs('product_images', exist_ok=True)

# Function to scrape products
def scrape_products(search_query):
    # Replace spaces with '+' for the search query
    search_query = search_query.replace(' ', '+')
    url = f'https://www.amazon.com/s?k={search_query}'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept-Language': 'en-US, en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Referer': 'https://www.amazon.com/',
    }

    # Send a GET request to fetch the page content
    response = requests.get(url, headers=headers)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    products = []
    count = 0

    # Loop through search results and extract product data
    for product in soup.find_all('div', {'data-component-type': 's-search-result'}):
        # Extract product name
        name = product.h2.text.strip()

        # Extract image URL
        image_tag = product.find('img', {'class': 's-image'})
        image_url = image_tag['src'] if image_tag else None

        if name and image_url and count < 5:
            # Save image locally
            image_response = requests.get(image_url)
            image_path = f'product_images/{name[:30].replace(" ", "_")}.jpg'
            with open(image_path, 'wb') as f:
                f.write(image_response.content)

            products.append((name, image_path))
            count += 1

    return products

# Searching for "cookies"
products = scrape_products("laptops")

# Display product names and image paths
print("Scraped Products:")
for product_name, image_path in products:
    print(f'Product: {product_name}, Image saved at: {image_path}')


Scraped Products:
Product: Acer Aspire 3 A315-24P-R7VH Slim Laptop | 15.6" Full HD IPS Display | AMD Ryzen 3 7320U Quad-Core Processor | AMD Radeon Graphics | 8GB LPDDR5 | 128GB NVMe SSD | Wi-Fi 6 | Windows 11 Home in S Mode, Image saved at: product_images/Acer_Aspire_3_A315-24P-R7VH_Sl.jpg
Product: HP Newest 255 G10 Laptop for Home or Work, 16GB RAM, 1TB SSD, 15.6" Full HD, Ryzen 3 7330U (Beat Intel i5-1135G7), Ethernet Port, HDMI, USB-C, Windows 11 Pro, Business and Fun Ready (2024), Image saved at: product_images/HP_Newest_255_G10_Laptop_for_H.jpg
Product: HP Newest 14" Ultral Light Laptop for Students and Business, Intel Quad-Core N4120, 8GB RAM, 192GB Storage(64GB eMMC+128GB Micro SD), 1 Year Office 365, Webcam, HDMI, WiFi, USB-A&C, Win 11 S, Image saved at: product_images/HP_Newest_14"_Ultral_Light_Lap.jpg
Product: Acer Aspire Go 15 Slim Laptop | 15.6" Full HD IPS 1080P Display | Intel Core i3-N305| Intel UHD Graphics | 8GB LPDDR5 | 128GB HD | Wi-Fi 6 | AI PC | Windows 11 Home in