In [2]:
# Make the cell self-contained: `os` is needed here and no earlier import is
# visible in this export.
import os

# Switch to the coursework directory so the relative 'project3/...' paths used
# later resolve. In a raw string a backslash is already literal, so use single
# separators throughout (the previous literal mixed `\\` and `\`, producing
# doubled separators in the path).
os.chdir(r"c:\phd\coursework\applied_data")
print(os.getcwd())

c:\phd\coursework\applied_data


In [4]:
import os
import pandas as pd
import ftfy
from langdetect import detect
from deep_translator import GoogleTranslator
from concurrent.futures import ThreadPoolExecutor

# Relative directory that is scanned for 'starbucks_prices_*.csv' files; the
# same files are read and then overwritten by process_file.
folder_path = 'project3/prices/'  # Directory where the files are stored
# Country name -> price of the first row whose 'Item' contains "latte".
# Filled in by process_file, which runs on worker threads.
worldwide_latte_prices = {}
# One translator instance shared by every translate_text call; the source
# language is left to the service ('auto') and the target is English.
translator = GoogleTranslator(source='auto', target="en")
# Original text -> translated text, read and written by translate_text.
# NOTE(review): both dicts above are mutated from ThreadPoolExecutor workers
# without a lock - confirm this is acceptable for the interpreter in use.
translation_cache = {}  # Dictionary to cache translations

def fix_text_mojibake(text):
    """Repair mis-decoded characters in *text* with ftfy.

    Returns the repaired text with surrounding whitespace removed. When the
    repaired text is empty or whitespace-only, the original *text* is handed
    back untouched so callers never receive an empty string in its place.
    """
    cleaned = ftfy.fix_text(text).strip()
    if cleaned:
        return cleaned
    return text

def detect_language(text_list):
    """Guess the language of a list of strings.

    Only the first five entries are used: they are joined with single spaces
    and passed to ``detect``. Returns whatever ``detect`` returns, or the
    string "unknown" if ``detect`` raises any exception.
    """
    sample = " ".join(text_list[:5])
    try:
        language = detect(sample)
    except Exception:
        return "unknown"
    return language

def translate_text(text, source_lang):
    """Return the translation of *text*, consulting the shared cache first.

    A cached translation is returned regardless of *source_lang*. Otherwise
    text whose *source_lang* is "en" is returned as is, and anything else is
    sent to the module-level ``translator``; a successful result is stored in
    ``translation_cache``. If translation raises, the original *text* is
    returned and nothing is cached.
    """
    # Cache hit wins over every other consideration.
    try:
        return translation_cache[text]
    except KeyError:
        pass

    # English input needs no work.
    if source_lang == "en":
        return text

    try:
        result = translator.translate(text)
        translation_cache[text] = result
    except Exception:
        # Best effort: keep the untranslated text when the service fails.
        return text
    return result

def process_file(file_name):
    """Clean, translate and rewrite one price CSV, then record its latte price.

    Args:
        file_name: Name of a CSV inside ``folder_path``. The country name is
            derived from it by removing 'starbucks_prices_' and '.csv' and
            capitalizing the remainder.

    Side effects:
        * Overwrites the CSV in place (UTF-8 with BOM) with the cleaned and,
          for non-English files, translated 'Item' column.
        * Prints a confirmation line.
        * Stores the 'Price' of the first row whose 'Item' contains "latte"
          (case-insensitive) in ``worldwide_latte_prices`` under the country
          name; nothing is stored when no row matches.

    Returns:
        None.
    """
    country_name = file_name.replace('starbucks_prices_', '').replace('.csv', '').capitalize()
    file_path = os.path.join(folder_path, file_name)

    df = pd.read_csv(file_path)

    # Work only on rows that actually have an item name. Casting the whole
    # column with astype(str) would turn missing cells into the literal text
    # "nan", which would then skew language detection, be sent to the
    # translator and be written back over the source file.
    has_item = df['Item'].notna()

    # Step 1: Fix mojibake in the item names that are present
    items = df.loc[has_item, 'Item'].astype(str).apply(fix_text_mojibake)

    # Step 2: Detect the language from the cleaned names
    detected_lang = detect_language(items.tolist())

    # Step 3: Translate if necessary
    if detected_lang != "en":
        items = items.apply(lambda x: translate_text(x, detected_lang))

    # Put the processed names back; rows without an item stay missing.
    df['Item'] = items.reindex(df.index)

    # Save the processed data back to the same CSV
    df.to_csv(file_path, index=False, encoding='utf-8-sig')  # Use UTF-8 for full compatibility

    print(f"Translated data saved to {file_path}")

    # Step 4: Search for 'Latte' and keep the first match's price
    latte_row = df[df['Item'].str.lower().str.contains('latte', na=False)]
    if not latte_row.empty:
        latte_price = latte_row.iloc[0]['Price']
        worldwide_latte_prices[country_name] = latte_price

# Use ThreadPoolExecutor for parallel processing
# Collect every per-country price file in the input directory.
file_list = [f for f in os.listdir(folder_path) if f.startswith('starbucks_prices_') and f.endswith('.csv')]

# Process the files in parallel. Futures are kept (rather than using
# executor.map and discarding its iterator) so that an exception raised inside
# process_file is reported instead of being silently lost.
with ThreadPoolExecutor() as executor:
    futures = {file_name: executor.submit(process_file, file_name) for file_name in file_list}

# Leaving the `with` block waits for all workers, so every future is done here.
for file_name, future in futures.items():
    error = future.exception()
    if error is not None:
        print(f"Failed to process {file_name}: {error!r}")

# Save the consolidated data to a new CSV file. Rows are sorted by country so
# the output does not depend on the order in which threads happened to finish.
output_file = 'project3/worldwide_latte_prices.csv'
df_worldwide = pd.DataFrame(sorted(worldwide_latte_prices.items()), columns=['Country', 'Latte Price'])
df_worldwide.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Worldwide latte prices saved to {output_file}")
print(df_worldwide)


Translated data saved to project3/prices/starbucks_prices_australia.csv
Translated data saved to project3/prices/starbucks_prices_bulgaria.csv
Translated data saved to project3/prices/starbucks_prices_canada.csv
Translated data saved to project3/prices/starbucks_prices_colombia.csv
Translated data saved to project3/prices/starbucks_prices_france.csv
Translated data saved to project3/prices/starbucks_prices_czech-republic.csv
Translated data saved to project3/prices/starbucks_prices_finland.csv
Translated data saved to project3/prices/starbucks_prices_greece.csv
Translated data saved to project3/prices/starbucks_prices_hungary.csv
Translated data saved to project3/prices/starbucks_prices_portugal.csv
Translated data saved to project3/prices/starbucks_prices_italy.csv
Translated data saved to project3/prices/starbucks_prices_switzerland.csv
Translated data saved to project3/prices/starbucks_prices_united-kingdom.csv
Translated data saved to project3/prices/starbucks_prices_denmark.csv
Tr