In [2]:
from geopy.geocoders import Nominatim
from geopy.exc import GeopyError



In [12]:
def validate_city(city_name, country_name):
    """
    Validate a city and return its standardized English "City, Country" name.

    The city and the country are sent together to OpenStreetMap's Nominatim API
    (through geopy) so that the search is more precise. When nothing is found,
    the user is asked on stdin for a new city and country and the lookup is retried.

    Args:
        city_name (str): Name of the city to validate.
        country_name (str): Name of the country in which to look for the city.

    Returns:
        str | None: The first and last comma-separated parts of the English
        address returned by Nominatim, joined as "first, last", or None if the
        API call raised a GeopyError.
    """
    # A single geolocator is reused for every retry
    geolocator = Nominatim(user_agent="city_validation_test")

    # Retry in a loop rather than by recursion, so that repeated misses cannot
    # grow the call stack (and eventually hit the recursion limit)
    while True:
        # Format the query with the city and the country
        query = f"{city_name}, {country_name}"

        try:
            # Look the city up in the country, asking for results in English
            location = geolocator.geocode(query, addressdetails=True, exactly_one=True, language='en')
        except GeopyError as e:
            print(f"Error querying city API: {e}")
            return None

        if location:
            # Keep the first part (city) and the last part (country) of the address
            address_parts = location.address.split(", ")
            city_and_country = f"{address_parts[0]}, {address_parts[-1]}"
            print(f"City found: {city_and_country}")
            return city_and_country

        print(f"City '{city_name}' in '{country_name}' not found.")
        city_name = input("Enter the city: ")
        country_name = input("Enter the country: ")

def get_choice(prompt, options_dict):
    """
    Ask the user to pick one option from a dictionary of choices.

    The options are printed, then the user is prompted for a number. On an
    invalid entry (not an integer, or not a key of ``options_dict``) an error
    message is printed and the options and prompt are shown again.

    Args:
        prompt (str): Message displayed when asking for the choice.
        options_dict (dict): Valid options, keyed by the integer the user types.

    Returns:
        The value of ``options_dict`` associated with the chosen key.
    """
    # Loop instead of recursing: a long run of invalid answers must not
    # exhaust the call stack
    while True:
        # Show the available options
        print("Available options:")
        for key, value in options_dict.items():
            print(f"{key}. {value}")

        try:
            choice = int(input(prompt))
        except ValueError:
            # The entry was not a number
            print("Please enter a valid number.")
            continue

        if choice in options_dict:
            return options_dict[choice]

        print(f"Invalid choice. Choose from {', '.join(str(i) for i in options_dict.keys())}.")
    
    
# Menus: the integer key is what the user types, the value is what get_choice returns
COSTS = {1: 'Affordable', 2: 'Mid-Range', 3: 'Premium', 4: 'Luxury'}
SPORTIVITY_LEVELS = {1: 'Low', 2: 'Moderate', 3: 'High', 4: 'Extreme'}
LANGUAGES = {1: 'English', 2: 'French', 3: 'German', 4: 'Spanish', 5: 'Italian', 6: 'Portuguese'}

# validate_city returns None when the geocoding API call fails
city = validate_city(input("Enter the city: "),input("Enter the country: "))
language = get_choice("Enter the number corresponding to your language: ", LANGUAGES)

# Ask for the number of activities; keep asking until a positive integer is
# entered instead of letting a typo raise ValueError and abort the cell
while True:
    try:
        num_activities = int(input("Enter the number of activities: "))
    except ValueError:
        print("Please enter a valid number.")
        continue
    if num_activities > 0:
        break
    print("Please enter a number greater than zero.")

# Ask the user to choose the sportivity level
sportivity = get_choice(f"Enter the number corresponding to your sportivity level: ({', '.join(str(i) for i in SPORTIVITY_LEVELS.keys())}): ", SPORTIVITY_LEVELS)

# Ask the user to choose the price category
price_category = get_choice(f"Enter the number corresponding to your price category: ({', '.join(str(i) for i in COSTS.keys())}): ", COSTS)



City found: Rome, Italy
Available options:
1. English
2. French
3. German
4. Spanish
5. Italian
6. Portuguese
Please enter a valid number.
Available options:
1. English
2. French
3. German
4. Spanish
5. Italian
6. Portuguese
Available options:
1. Low
2. Moderate
3. High
4. Extreme
Invalid choice. Choose from 1, 2, 3, 4.
Available options:
1. Low
2. Moderate
3. High
4. Extreme
Available options:
1. Affordable
2. Mid-Range
3. Premium
4. Luxury
Please enter a valid number.
Available options:
1. Affordable
2. Mid-Range
3. Premium
4. Luxury


In [16]:

def _params_to_csv_name(params):
    """Build "df_<v1>_<v2>_....csv" from the values of a parameter dict.

    Each value is stringified, commas become underscores, spaces and single
    quotes are removed, and the result is lower-cased.
    """
    parts = (
        str(value).replace(",", "_").replace(" ", "").replace("'", "").lower()
        for value in params.values()
    )
    return "df_" + "_".join(parts) + ".csv"


def main(city_standard, language, num_activities, sportivity, price_category):
    """
    Print the CSV file name associated with each parameter set.

    Four parameter sets are built, each one extending the previous one:
    raw (city), processed (+ language), cluster (+ sportivity, price category)
    and itinerary (+ number of activities). One file name per set is printed,
    in that order.

    Args:
        city_standard: Standardized city name (e.g. "Rome, Italy").
        language: Selected language label.
        num_activities: Number of activities requested.
        sportivity: Selected sportivity level label.
        price_category: Selected price category label.
    """
    # Parameter sets, from the least to the most specific
    raw_params = {"city": city_standard}
    processed_params = {"city": city_standard, "language": language}
    cluster_params = {"city": city_standard, "language": language, "sportivity": sportivity, "price_category": price_category}
    itinerary_params = {**cluster_params, "num_activities": num_activities}

    for params in (raw_params, processed_params, cluster_params, itinerary_params):
        print(_params_to_csv_name(params))

# Print the file names for the answers collected from the user above
main(
    city_standard=city,
    language=language,
    num_activities=num_activities,
    sportivity=sportivity,
    price_category=price_category,
)

df_rome_italy.csv
df_rome_italy_german.csv
df_rome_italy_german_extreme_affordable.csv
df_rome_italy_german_extreme_affordable_4.csv


## Test website extractions

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from googletrans import Translator
import unicodedata
import pycountry
import pycountry_convert as pc
from difflib import get_close_matches


def fetch_and_parse_page(url, timeout=10):
    """
    Download a web page and parse its HTML.

    Args:
        url (str): Address of the page to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, requests may wait indefinitely.

    Returns:
        BeautifulSoup | None: The parsed page, or None if the request failed
        (network error, timeout or HTTP error status); the error is printed.
    """
    # Browser-like User-Agent sent with the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # Fetch the page
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raises an exception on HTTP error statuses

        # Parse the content with BeautifulSoup
        return BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.RequestException as e:
        print(f"Erreur lors de la récupération de l'URL {url}: {e}")
        return None


def save_html_to_txt(soup, filename,dir_path='C:/Users/glenn/OneDrive/Bureau/VScode saves/WebScrapping/Projet'):
    """
    Save the pretty-printed HTML of a parsed page to "<dir_path>/<filename>.txt".

    Args:
        soup: Parsed page exposing a ``prettify()`` method.
        filename (str): File name without extension.
        dir_path (str): Directory in which the file is written.
            NOTE(review): the default is a machine-specific absolute path;
            pass an explicit directory on any other machine.

    Errors (missing directory, ``soup`` being None, ...) are printed, not raised.
    """
    path=f'{dir_path}/{filename}.txt'
    try:
        # Write to the computed path, so that dir_path and the .txt suffix are honoured
        with open(path, "w", encoding="utf-8") as file:
            file.write(soup.prettify())
        print(f"HTML sauvegardé dans le fichier : {path}")
    except Exception as e:
        print(f"Erreur lors de la sauvegarde du fichier : {e}")


def extract_text_by_class(soup, balise,class_name):
    """
    Collect the stripped text of every ``balise`` tag that has the given class.

    Args:
        soup: Parsed page (or element) to search.
        balise (str): Tag name to look for (e.g. "span").
        class_name (str): CSS class the tags must carry.

    Returns:
        list[str]: One stripped text per matching tag, in document order.
    """
    texts = []
    for tag in soup.find_all(balise, class_=class_name):
        texts.append(tag.get_text(strip=True))
    return texts


# Lonely planet extract
def LonelyPlanet_attractions(soup):
    """
    Build the Lonely Planet attractions table from a parsed page.

    Titles are the texts of the <span> tags with class
    "heading-05 font-semibold".

    Returns:
        pandas.DataFrame: Columns 'site' (always 'LonelyPlanet'), 'Title' and
        'rank' (0-based position of the title on the page).
    """
    titles = extract_text_by_class(soup,"span", "heading-05 font-semibold")
    attractions = pd.DataFrame({'site': 'LonelyPlanet', 'Title': titles})
    attractions['rank'] = range(len(titles))
    return attractions


# Bucket List extract
def BucketList_attractions(soup):
    """
    Build the Bucket List Travels attractions table from a parsed page.

    Each <article class="listing-card bg-white shadow-listing"> yields one row.
    The title comes from its <h2>; every "<label>: <value>" found in the second
    <span> of the detail <p> tags becomes a column named after the label.

    Returns:
        pandas.DataFrame: Columns 'site' (always 'BucketList'), 'Title', one
        column per label found, and 'rank' (0-based position on the page).
    """
    # Collect one dict per attraction and build the frame once, instead of
    # concatenating a one-row frame per article
    records = []

    for article in soup.find_all('article', class_='listing-card bg-white shadow-listing'):
        # Title from the <h2> tag (name of the attraction)
        title_tag = article.find('h2', class_='text-2xl md:text-3xl font-bold')
        title = title_tag.get_text(strip=True) if title_tag else 'Titre non trouvé'
        attraction_info = {'Title': title}

        # <p> tags holding details such as duration, age, etc.
        for p in article.find_all('p', class_='flex items-center space-x-1 text-lg'):
            spans = p.find_all('span')
            if len(spans) < 2:
                # No second <span>: nothing to read for this line
                continue
            # Split on the first ':' only, so values that contain a colon
            # themselves (e.g. a time) are kept rather than dropped
            label, separator, value = spans[1].get_text(strip=True).partition(':')
            label = label.strip()
            if separator and label:
                attraction_info[label] = value.strip()

        records.append(attraction_info)

    df_BucketList = pd.DataFrame(records)
    df_BucketList.insert(0, 'site', 'BucketList')
    df_BucketList['rank'] = range(len(df_BucketList))
    return df_BucketList


# WorldTravelGuide extract
def _next_after_label(article, label, tag_name):
    """Return the first ``tag_name`` element after the <b> whose text is exactly ``label``, or None."""
    label_tag = article.find('b', string=label)
    return label_tag.find_next(tag_name) if label_tag else None


def WorldTravelGuide_attractions(soup):
    """
    Build the World Travel Guide attractions table from a parsed page.

    Rows come from the <div class="high"> blocks followed by the
    <div class="medium"> blocks. For each block the <h3> title, the first <p>
    (description) and, when present, the "Opening times", "Website",
    "Admission Fees" and "Disabled Access" entries are extracted.

    Returns:
        pandas.DataFrame: Columns 'site' (always 'WorldTravelGuide'), 'Title',
        the optional fields that were found, and 'rank' (0-based row position).
    """
    # Collect one dict per attraction and build the frame once, instead of
    # concatenating a one-row frame per block
    records = []
    articles = list(soup.find_all('div', class_='high')) + list(soup.find_all('div', class_='medium'))

    for article in articles:
        # Title from the <h3> tag (name of the attraction)
        title_tag = article.find('h3')
        title = title_tag.get_text(strip=True) if title_tag else 'Titre non trouvé'
        attraction_info = {'Title': title}

        # Description
        description_tag = article.find('p')
        if description_tag:
            attraction_info['Description'] = description_tag.get_text(strip=True)

        # Opening times
        opening_times = _next_after_label(article, "Opening times: ", 'p')
        if opening_times is not None:
            attraction_info['Opening times'] = opening_times.get_text(strip=True)

        # Website
        website = _next_after_label(article, "Website: ", 'a')
        if website is not None and website.get('href'):
            attraction_info['Website'] = website.get('href')

        # Admission fees
        admission_fees = _next_after_label(article, "Admission Fees: ", 'p')
        if admission_fees is not None:
            attraction_info['Admission Fees'] = admission_fees.get_text(strip=True)

        # Disabled access: the value is whatever node directly follows the label
        disabled_access_tag = article.find('b', string="Disabled Access: ")
        if disabled_access_tag:
            sibling = disabled_access_tag.next_sibling
            if not sibling:
                disabled_access_text = 'Non spécifié'
            elif isinstance(sibling, str):
                # Plain text node
                disabled_access_text = sibling.strip()
            else:
                # The sibling is an element, which has no usable .strip()
                disabled_access_text = sibling.get_text(strip=True)
            attraction_info['Disabled Access'] = disabled_access_text

        records.append(attraction_info)

    df_WorldTravelGuide = pd.DataFrame(records)
    df_WorldTravelGuide.insert(0, 'site', 'WorldTravelGuide')
    df_WorldTravelGuide['rank'] = range(len(df_WorldTravelGuide))
    return df_WorldTravelGuide


def CNTraveler_attractions(soup):
    """
    Build the Condé Nast Traveler attractions table from a parsed gallery page.

    Each <div class="GallerySlideFigCaption-dOeyTg gWbVWR"> yields one row: the
    title comes from its caption <span>, the description from its first <p>.

    Returns:
        pandas.DataFrame: Columns 'site' (always 'CNTraveler'), 'Title',
        'Description' (when found) and 'rank' (0-based position on the page).
    """
    # Collect one dict per slide and build the frame once, instead of
    # concatenating a one-row frame per slide
    records = []
    for article in soup.find_all('div', class_='GallerySlideFigCaption-dOeyTg gWbVWR'):
        # Title of the attraction
        title_tag = article.find('span',class_='GallerySlideCaptionHedText-iqjOmM jwPuvZ')
        title = title_tag.get_text(strip=True) if title_tag else 'Titre non trouvé'
        attraction_info = {'Title': title}

        description_tag = article.find('p')
        if description_tag:
            attraction_info['Description'] = description_tag.get_text(strip=True)
        records.append(attraction_info)

    df_CNTraveler = pd.DataFrame(records)
    df_CNTraveler.insert(0, 'site', 'CNTraveler')
    df_CNTraveler['rank'] = range(len(df_CNTraveler))
    return df_CNTraveler


def Routard_attractions(soup):
    """
    Build the Routard attractions table from a parsed "top" page.

    Each <div class="bg-rtd-grey-100 flex h-96 w-60 flex-col rounded-xl p-4">
    card yields one row: the title comes from its <h2>, the description from
    its <div class="rtd-wysiwyg line-clamp-3">.

    Returns:
        pandas.DataFrame: Columns 'site' (always 'Routard'), 'Title',
        'Description' (when found) and 'rank' (0-based position on the page).
    """
    # Collect one dict per card and build the frame once, instead of
    # concatenating a one-row frame per card
    records = []
    for article in soup.find_all('div', class_='bg-rtd-grey-100 flex h-96 w-60 flex-col rounded-xl p-4'):
        # Title of the attraction
        title_tag = article.find('h2',class_='group-hover:text-rtd-green my-2 font-semibold')
        title = title_tag.get_text(strip=True) if title_tag else 'Titre non trouvé'
        attraction_info = {'Title': title}

        description_tag = article.find('div', class_='rtd-wysiwyg line-clamp-3')
        if description_tag:
            attraction_info['Description'] = description_tag.get_text(strip=True)
        records.append(attraction_info)

    df_Routard = pd.DataFrame(records)
    df_Routard.insert(0, 'site', 'Routard')
    df_Routard['rank'] = range(len(df_Routard))
    return df_Routard


def _slugify_location(text):
    """Lower-case ``text``, turn apostrophes and spaces into hyphens and drop accents."""
    text = text.replace("'","-").replace(" ","-").lower()
    # Remove accents: decompose the characters, then drop the combining marks
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(char for char in decomposed if unicodedata.category(char) != 'Mn')


def translate_location(name, src_lang="en", dest_lang="fr"):
    """
    Translate a place name and turn it into a URL-friendly slug.

    Args:
        name (str): Place name to translate.
        src_lang (str): Language code of ``name``.
        dest_lang (str): Language code to translate into.

    Returns:
        str: The translated name, lower-cased, with apostrophes and spaces
        replaced by hyphens and accents removed. If the translation fails, the
        error is printed and the same normalisation is applied to ``name``
        itself, so the result can still be placed in a URL.
    """
    try:
        translation = Translator().translate(name, src=src_lang, dest=dest_lang).text
        return _slugify_location(translation)
    except Exception as e:
        print(f"Error during translation: {e}")
        # Fall back to the untranslated name, normalised like a translation would be
        return _slugify_location(name) if isinstance(name, str) else name

def country_to_continent(country_name):
    """
    Return the lower-cased continent name of a country, or None if unknown.

    The country is fuzzy-matched against the pycountry country names. The
    comparison ignores case, so "germany", "Germany" and "GERMANY" all match
    the same entry.

    Args:
        country_name (str): Country name, in any letter case.

    Returns:
        str | None: Continent name in lower case (e.g. "europe"), or None when
        no country name is close enough to ``country_name``.
    """
    # Case-folded name -> official name, so matching is case-insensitive
    official_names = {country.name.casefold(): country.name for country in pycountry.countries}
    matches = get_close_matches(country_name.casefold(), list(official_names), n=1)
    if not matches:
        return None

    country_alpha2 = pc.country_name_to_country_alpha2(official_names[matches[0]])
    country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
    country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
    return country_continent_name.lower()

def routard_city_to_region(city):
    """
    Return the Routard region slug of a French city.

    Args:
        city (str): City slug in lower case (e.g. "paris").

    Returns:
        str: Region slug used in Routard URLs.

    Raises:
        KeyError: If ``city`` is not one of the cities listed below.
    """
    region_by_city = dict(
        strasbourg="alsace",
        bordeaux="aquitaine-bordelais-landes",
        rennes="bretagne",
        nice="cote-d-azur",
        paris="ile-de-france",
        montpellier="languedoc-roussillon",
        toulouse="midi-toulousain-occitanie",
        lille="nord-pas-de-calais",
        nantes="pays-de-la-loire",
        marseille="provence",
    )
    return region_by_city[city]

def routard_continent(continent):
    """
    Return the French continent slug used in Routard URLs.

    Args:
        continent (str): English continent name in lower case (e.g. "asia").

    Returns:
        str: French slug; both Americas map to "ameriques".

    Raises:
        KeyError: If ``continent`` is not one of the names listed below.
    """
    english_names = ("europe", "africa", "north america", "south america", "asia", "oceania")
    french_slugs = ("europe", "afrique", "ameriques", "ameriques", "asie", "oceanie")
    return dict(zip(english_names, french_slugs))[continent]

def main(country='france',city='paris',websites_to_call=('Routard','WorldTravelGuide','BucketList','LonelyPlanet','CNTraveler')):
    """
    Scrape the attractions of a city from several travel websites.

    NOTE: this definition reuses the name ``main`` of the file-name helper
    defined in an earlier cell and replaces it once this cell has run.

    Args:
        country (str): Country name in lower-case English, as used in the site URLs.
        city (str): City name in lower-case English, as used in the site URLs.
        websites_to_call: Names of the sites to scrape, among 'Routard',
            'WorldTravelGuide', 'BucketList', 'LonelyPlanet' and 'CNTraveler'.

    Returns:
        pandas.DataFrame: The rows of every site that could be fetched, in the
        order of ``websites_to_call``; empty if nothing could be fetched.

    Raises:
        KeyError: If ``websites_to_call`` contains an unknown site name.
    """
    extractors={
        'LonelyPlanet': LonelyPlanet_attractions,
        'BucketList':BucketList_attractions,
        'WorldTravelGuide':WorldTravelGuide_attractions,
        'CNTraveler':CNTraveler_attractions,
        'Routard':Routard_attractions
    }
    for website in websites_to_call:
        if website not in extractors:
            raise KeyError(website)

    # URLs that only need the English country / city names
    URL_dict={
        'LonelyPlanet':f'https://www.lonelyplanet.com/{country}/{city}/attractions',
        'BucketList':f'https://www.bucketlisttravels.com/destination/{city}/best-things-to-see-and-do',
        'CNTraveler':f'https://www.cntraveler.com/gallery/best-things-to-do-in-{city}',
    }

    # World Travel Guide needs the continent; leave it out when it is unknown
    continent = country_to_continent(country)
    if continent is not None:
        URL_dict['WorldTravelGuide']=f'https://www.worldtravelguide.net/guides/{continent}/{country}/{city}/things-to-see/'

    # Routard uses French slugs; translate only when that site is requested
    if 'Routard' in websites_to_call:
        try:
            city_fr = translate_location(city)
            if country=='france':
                # French cities are filed under a region instead of a continent
                region = routard_city_to_region(city_fr)
                URL_dict['Routard']=f'https://www.routard.com/fr/guide/top/{country}/{region}/{city_fr}'
            else:
                continent_fr = routard_continent(continent)
                country_fr = translate_location(country)
                URL_dict['Routard']=f'https://www.routard.com/fr/guide/top/{continent_fr}/{country_fr}/{city_fr}'
        except KeyError as missing:
            # Unknown continent or unmapped French city: skip this site only
            print(f"Routard URL cannot be built (no mapping for {missing}); skipping Routard.")

    # Gather one frame per site and concatenate once at the end
    frames=[]
    for website in websites_to_call:
        url = URL_dict.get(website)
        if url is None:
            print(f"No URL available for {website}; skipping.")
            continue
        soup=fetch_and_parse_page(url)
        #save_html_to_txt(soup,f'{website}_{city}')
        if soup is not None:
            frames.append(extractors[website](soup))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames,ignore_index=True)
        

# Scrape every configured site for Berlin
df_main = main(country="germany", city="berlin")

In [3]:
# Display the combined attractions table returned by main()
df_main

Unnamed: 0,site,Title,Description,rank,Opening times,Website,Admission Fees,Disabled Access,Adult price,Good for age,Duration,When,Freq
0,Routard,Alexanderplatz,Place historique de Berlin marquée par l’esthé...,0,,,,,,,,,
1,Routard,Château de Charlottenburg (Schloss Charlottenb...,L’un des plus beaux exemples d’architecture ba...,1,,,,,,,,,
2,Routard,Checkpoint Charlie,"Lieu chargé de symboles, qui fut le seul point...",2,,,,,,,,,
3,Routard,East Side Gallery,L’East Side Gallery est un morceau du Mur de B...,3,,,,,,,,,
4,Routard,Église du Souvenir (Kaiser-Wilhelm-Gedächtnisk...,"Symbole des ravages de la guerre, surnommée « ...",4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,CNTraveler,Memorial to the Murdered Jews of Europe,"A short walk fromBrandenburg Gate, this sprawl...",12,,,,,,,,,
126,CNTraveler,East Side Gallery,"With more than 100 paintings, the East Side Ga...",13,,,,,,,,,
127,CNTraveler,Tempelhofer Feld,Built by Hitler’s henchmen and used as a lifel...,14,,,,,,,,,
128,CNTraveler,Tiergarten,"Berlin's signature park and ""green lung,"" Tier...",15,,,,,,,,,


In [1]:
from googletrans import Translator


In [2]:
# Quick manual check of googletrans: translate a French phrase into English
translator = Translator()
name = "la tour eiffel"
# translate() returns a result object; its .text attribute holds the translated string
translation = translator.translate(name, src="fr", dest="en").text
print(translation)

The Eiffel Tower
