In [13]:
pip install requests beautifulsoup4 pandas selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
# Importing necessary libraries
import requests  # For fetching the webpage
from bs4 import BeautifulSoup  # For parsing the webpage
import pandas as pd

In [4]:
# URL of the webpage to scrape
url = 'https://starbucksmenuprices.com/'

# Send a request to the server
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    print("Successfully fetched the webpage!")
else:
    print(f"Failed to fetch webpage. Status code: {response.status_code}")

Successfully fetched the webpage!


In [6]:
# Parse the webpage content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Print the HTML content to understand its structure
#print(soup.prettify())

In [7]:
# Find all <ul> elements containing the country links
sections = soup.find_all('ul')  # Locate all <ul> elements

# Lists in Python: The result is a list, which can store multiple items.
print(sections[1])

<ul class="sub-menu">
<li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-32" id="menu-item-32"><a href="https://starbucksmenuprices.com/starbucks-au-prices/">Australia</a></li>
<li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-42" id="menu-item-42"><a href="https://starbucksmenuprices.com/starbucks-brasil-precos/">Brasil</a></li>
<li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-70" id="menu-item-70"><a href="https://starbucksmenuprices.com/starbucks-%d1%86%d0%b5%d0%bd%d0%b8/">Bulgaria</a></li>
<li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-58" id="menu-item-58"><a href="https://starbucksmenuprices.com/starbucks-canada-menu/">Canada</a></li>
</ul>


In [8]:
# Extract links from the <ul> sections
country_links = []  # Empty list to store results

for section in sections:
    links = section.find_all('a')  # Find all <a> tags in each section
    for link in links:
        country_name = link.text.strip()  # Get the visible text of the link
        country_url = link.get('href')  # Get the href attribute (URL)
        country_links.append({'Country': country_name, 'URL': country_url})

In [11]:
df = pd.DataFrame(country_links)

# Display the first few rows of the DataFrame
print(df.head())

     Country                                                URL
0        A-C                                                  #
1  Australia  https://starbucksmenuprices.com/starbucks-au-p...
2     Brasil  https://starbucksmenuprices.com/starbucks-bras...
3   Bulgaria  https://starbucksmenuprices.com/starbucks-%d1%...
4     Canada  https://starbucksmenuprices.com/starbucks-cana...


In [12]:
# Save the DataFrame to a CSV file
df.to_csv('starbucks_country_links.csv', index=False)
print("Saved country links to starbucks_country_links.csv")

Saved country links to starbucks_country_links.csv


In [1]:
import os
import pandas as pd
import ftfy
from langdetect import detect
from deep_translator import GoogleTranslator
from concurrent.futures import ThreadPoolExecutor

folder_path = 'project3/prices/'  # Directory where the files are stored
worldwide_latte_prices = {}
translator = GoogleTranslator(source='auto', target="en")
translation_cache = {}  # Dictionary to cache translations

def fix_text_mojibake(text):
    """Fix mojibake and return cleaned text."""
    fixed_text = ftfy.fix_text(text)  # Fix mojibake
    return fixed_text.strip() if fixed_text.strip() else text  # Ensure no empty strings

def detect_language(text_list):
    """Detect language based on multiple samples for accuracy."""
    combined_text = " ".join(text_list[:5])  # Use the first 5 items for detection
    try:
        return detect(combined_text)
    except Exception:
        return "unknown"

def translate_text(text, source_lang):
    """Translate text only if it's not English."""
    if text in translation_cache:
        return translation_cache[text]  # Return cached translation if available
    
    if source_lang != "en":
        try:
            translated_text = translator.translate(text)
            translation_cache[text] = translated_text  # Cache the translation
            return translated_text
        except Exception:
            return text  # Return original if translation fails
    return text  # If already English, return as is

def process_file(file_name):
    """Process a single CSV file: fix mojibake, translate, and extract latte price."""
    country_name = file_name.replace('starbucks_prices_', '').replace('.csv', '').capitalize()
    file_path = os.path.join(folder_path, file_name)
    
    df = pd.read_csv(file_path)

    # Step 1: Fix Mojibake on the 'Item' column
    df['Item'] = df['Item'].astype(str).apply(fix_text_mojibake)

    # Step 2: Detect language of the fixed column
    detected_lang = detect_language(df['Item'].astype(str).tolist())

    # Step 3: Translate if necessary
    if detected_lang != "en":
        df['Item'] = df['Item'].astype(str).apply(lambda x: translate_text(x, detected_lang))

    # Save the translated data back to CSV
    translated_file_path = os.path.join(folder_path, f"translated_{file_name}")
    df.to_csv(translated_file_path, index=False, encoding='utf-8-sig')  # Use UTF-8 for full compatibility
    
    print(f"Translated data saved to {translated_file_path}")

    # Step 4: Search for 'Latte'
    latte_row = df[df['Item'].str.lower().str.contains('latte', na=False)]
    if not latte_row.empty:
        latte_price = latte_row.iloc[0]['Price']
        worldwide_latte_prices[country_name] = latte_price

# Use ThreadPoolExecutor for parallel processing
file_list = [f for f in os.listdir(folder_path) if f.startswith('starbucks_prices_') and f.endswith('.csv')]
with ThreadPoolExecutor() as executor:
    executor.map(process_file, file_list)

# Save the consolidated data to a new CSV file
output_file = 'project3/worldwide_latte_prices.csv'
df_worldwide = pd.DataFrame(list(worldwide_latte_prices.items()), columns=['Country', 'Latte Price'])
df_worldwide.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Worldwide latte prices saved to {output_file}")
print(df_worldwide)


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'project3/prices/'