# 1. Imports

In [1]:
import pandas as pd
import re
import numpy as np

# 2. Load data

In [2]:
file_id = "1lfH64MX8NHuxn7745leZ6LaXRVLAAer77J336ZFOTIk"
gid = "1900938527"  # onglet cible
url = f"https://docs.google.com/spreadsheets/d/{file_id}/export?format=csv&gid={gid}"

df = pd.read_csv(url)  # éventuellement: encoding="utf-8"
df_clean = pd.DataFrame()
print(df.shape)


(54, 30)


In [3]:
df_clean = df

In [4]:
df_clean.columns

Index(['Horodateur', 'Quel est votre nationalité?',
       '  Dans quel pays résidez-vous actuellement ?  ',
       'Quelle est votre tranche d’âge ?  ',
       'Quelle est votre situation familiale ? ',
       'Quelle est votre tranche de revenus mensuels nets du foyer ? ',
       'À quelle fréquence voyagez vous à l’étranger (hors Europe) ?  ',
       'Avez-vous déjà voyagé au Japon ?  ',
       'Quelle durée de séjour avez-vous prévue ?  ',
       'Quelles régions du Japon vous intéressent le plus ? (Choisissez 3 max.)  ',
       'À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Découverte de la culture et de l’histoire (temples, traditions, samouraïs, geishas, etc.)]',
       'À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Gastronomie japonaise (sushis, ramen, wagyu, street food, etc.)]',
       'À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Paysages naturels et randonnées (monta

# 3. Global cleaning helpers (functions)

In [5]:
df_clean = df_clean.rename(columns={
    "Quel est votre nationalité?": "nationality",
    "  Dans quel pays résidez-vous actuellement ?  ": "country",
    "Quelle est votre tranche d’âge ?  ": "age_group",
    "Quelle est votre situation familiale ? ": "family_situation",
    "Quelle est votre tranche de revenus mensuels nets du foyer ? ": "household_income_in_€",
    "À quelle fréquence voyagez vous à l’étranger (hors Europe) ?  ": "travel_frequency",
    "Avez-vous déjà voyagé au Japon ?  ": "been_to_Japan",
    "Quelle durée de séjour avez-vous prévue ?  ": "Japan_vac_duration",
    "Quelles régions du Japon vous intéressent le plus ? (Choisissez 3 max.)  ": "most_wanted_pref_to_visit",
    "À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Découverte de la culture et de l’histoire (temples, traditions, samouraïs, geishas, etc.)]": "rating_interest_culture_and_history",
    "À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Gastronomie japonaise (sushis, ramen, wagyu, street food, etc.)]": "rating_interest_food",
    "À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Paysages naturels et randonnées (montagnes, volcans, cerisiers en fleurs, etc.)]": "rating_interest_nature_hiking",
    "À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Technologie, innovation et shopping (Tokyo high-tech, Akihabara, mode, etc.)]": "rating_interest_shopping_and_techno",
    "À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Festivals et événements (matsuri, concerts, sport, sumo, etc.)]": "rating_interest_events_and_festivals",
    "À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Bien-être (onsen, ryokan, détente)]": "rating_interest_wellness",
    "À quel point ces motivations influencent elles votre envie de voyager au Japon ? [Parc d'attraction (Disneyland, Universal...)]": "rating_interest_theme_park",
    "Quel budget global prévoyez vous pour un voyage au Japon (par personne et par semaine , hors vol international ) ?  ": "Japan_budget_per_week",
    "Parmi les types d’hébergement suivants, lequel correspond le mieux à vos préférences principales pour un séjour au Japon ? ": "Japan_prefered_accomodation",
    "Quels sont les principaux freins ou difficultés que vous rencontrez (ou pourriez rencontrer) lors d’un voyage au Japon ? (Choisissez 3 max.)  ": "Japan_most_difficulties",
    "Si vous ne pouviez pas voyager au Japon, quelle destination alternative choisiriez-vous ?": "alternative_destination",
    "Quelle a été la principale raison pour laquelle vous auriez choisi cette destination plutôt que le Japon ? ": "alt_dest_main_reason",
    "Parmi les types d’hébergement suivants, lequel correspond le mieux à vos préférences principales lors de vos voyages dans d’autres pays (hors Japon)   ? ": "alt_dest_prefered_accomodation",
    "Lors de vos voyages dans d’autres pays (hors Japon), quel est votre budget moyen par semaine et par personne , hors vol international ?  ": "alt_dest_budget_per_week",
    "Lors de vos voyages dans d’autres pays (hors Japon), quel(s) mode(s) de transport utilisez-vous le plus souvent ?": "alt_dest_transportation",
    "Comment préparez-vous vos voyages en général ? (Multiple choix possible)": "trip_prep",
    "Quel canal utilisez-vous le plus pour réserver vos voyages ?  ": "booking_trip_channel",
    "Parmi les éléments suivants, lequel influence le plus votre choix de destination de vacances (hors Japon) ? ": "most_influencial_reason_to_choose_dest",
    "Lorsque vous voyagez en dehors du Japon, quelles sont les principales difficultés que vous rencontrez habituellement ?\n(Choisissez jusqu’à 3 réponses)": "alt_dest_most_difficulties",
    "Qu’est-ce qui rendrait le Japon plus attractif comme destination pour vous ?  ": "recomendation_to_improve_attractiveness"
})

In [6]:
def normalize_text(s):
    if pd.isna(s): return s
    return (str(s)
            .strip()
            .lower()
            .replace("é","e").replace("è", "e").replace("ê", "e")
            .replace("à","a").replace("ï", "i").replace("î", "i")
            .replace("$","").replace("€", ""))

mapping = {
    #France
    "france": "France",
    "française": "France",
    "français": "France",
    "francais": "France",
    "francaise": "France",
    "french": "France",
    "法国": "France",
    "fucking french": "France",

    #China
    "chine": "China",
    "chinoise": "China",
    "chinois": "China",
    "中国": "China",
    "chinese": "China",

    #Taiwan
    "taïwan ": "Taiwan",
    "taiwan": "Taiwan",
    "taiwanais": "Taiwan",
    "taiwanaise": "Taiwan",

    #Vietnam
    "vietnam": "Vietnam",
    "vietnamien": "Vietnam",
    "vietnamienne": "Vietnam",

    #Portugal
    "portuguese": "Portugal",

    #Israel
    "israel": "Israel",
    "israël": "Israel",
    "israelien": "Israel",
    "israelienne": "Israel",
    "israélien": "Israel",

    #Spain
    "espagne": "Spain",

    #Germany
    "allemagne": "Germany",

    #USA
    "united states of america my friend": "USA",

}

if "nationality" in df_clean.columns:
    df_clean["nationality"] = (df_clean["nationality"]
                               .map(normalize_text)
                               .map(mapping)
                               .fillna(df_clean["nationality"]))
    
if "country" in df_clean.columns:
    df_clean["country"] = (df_clean["country"]
                               .map(normalize_text)
                               .map(mapping)
                               .fillna(df_clean["country"]))

In [7]:
df_clean.columns

Index(['Horodateur', 'nationality', 'country', 'age_group', 'family_situation',
       'household_income_in_€', 'travel_frequency', 'been_to_Japan',
       'Japan_vac_duration', 'most_wanted_pref_to_visit',
       'rating_interest_culture_and_history', 'rating_interest_food',
       'rating_interest_nature_hiking', 'rating_interest_shopping_and_techno',
       'rating_interest_events_and_festivals', 'rating_interest_wellness',
       'rating_interest_theme_park', 'Japan_budget_per_week',
       'Japan_prefered_accomodation', 'Japan_most_difficulties',
       'alternative_destination', 'alt_dest_main_reason',
       'alt_dest_prefered_accomodation', 'alt_dest_budget_per_week',
       'alt_dest_transportation', 'trip_prep', 'booking_trip_channel',
       'most_influencial_reason_to_choose_dest', 'alt_dest_most_difficulties',
       'recomendation_to_improve_attractiveness'],
      dtype='object')

# 4. Column-by-column cleaning

In [8]:
print(df_clean["nationality"].value_counts())
print(df_clean["country"].value_counts())

nationality
France      37
China       10
Vietnam      1
Taiwan       1
Israel       1
Slovène      1
Japanese     1
Marocain     1
Portugal     1
Name: count, dtype: int64
country
France      47
Spain        2
Germany      1
Suisse       1
Japan        1
Portugal     1
USA          1
Name: count, dtype: int64


In [9]:
print(df_clean["age_group"].value_counts())

age_group
25 – 34 ans        19
35 – 44 ans        15
45 – 54 ans         9
18 – 24 ans         3
55 – 64 ans         2
65 ans et plus      2
35–44 years old     2
18–24 years old     2
Name: count, dtype: int64


In [10]:
def clean_age (age):
    if pd.isna(age):
        return None
    age = str(age).strip()
    if age.startswith("18"):
        return "18-24"
    elif age.startswith("25"):
        return "25-34"
    elif age.startswith("35"):
        return "35-44"
    elif age.startswith("45"):
        return "45-54"
    elif age.startswith("55"):
        return "55-64"
    elif age.startswith("65"):
        return "65 and over"
    else:
        return "18 and less"

df_clean["age_group"] = df_clean["age_group"].apply(clean_age)


In [11]:
print(df_clean["age_group"].value_counts())

age_group
25-34          19
35-44          17
45-54           9
18-24           5
55-64           2
65 and over     2
Name: count, dtype: int64


In [12]:
print(df_clean["family_situation"].value_counts())

family_situation
Célibataire                                        16
Marié(e)/Pacsé(e) avec enfant(s)                   10
En couple avec enfant(s)                            9
Marié(e)/Pacsé(e) sans enfant                       7
En couple sans enfant                               7
Préfère ne pas répondre                             1
Married / in a civil partnership, no children       1
Married / in a civil partnership, with children     1
In a relationship, no children                      1
Prefer not to say                                   1
Name: count, dtype: int64


In [13]:
family_situation_map = {
    
    "single": "Single",
    "celibataire": "Single",

    "en couple sans enfant": "Relationship_no_kids",
    "in a relationship, no children": "Relationship_no_kids",

    "en couple avec enfant(s)": "Relationship_with_kids",
    "in a relationship, with children": "Relationship_with_kids",

    "marie(e)/pacse(e) sans enfant": "Married_no_kids",
    "married / in a civil partnership, no children": "Married_no_kids",

    "marie(e)/pacse(e) avec enfant(s)": "Married_with_kids",
    "married / in a civil partnership, with children": "Married_with_kids",

    "prefere ne pas repondre": "Unknown",
    "prefer not to say": "Unknown"
}

df_clean["family_situation"] = (df_clean["family_situation"]
                            .map(normalize_text)
                            .map(family_situation_map)
                            .fillna(df_clean["family_situation"]))


In [14]:
print(df_clean["family_situation"].value_counts())

family_situation
Single                    16
Married_with_kids         11
Relationship_with_kids     9
Married_no_kids            8
Relationship_no_kids       8
Unknown                    2
Name: count, dtype: int64


In [15]:
print(df_clean["household_income_in_€"].value_counts())

household_income_in_€
1 500 – 1 999 €                       11
3 000 – 3 999 €                        7
2 500 – 2 999 €                        6
5000 – 5 999 €                         6
2 000 – 2 499 €                        5
Supérieur à 7 000 €                    4
Préfère ne pas répondre                4
4 000 – 4 999 €                        3
Moins de 1 500 €                       3
6 000 – 6 999 €                        1
$1,700 – $2,200 (~€1,500 – €1,999)     1
$3,400 – $4,400 (~€3,000 – €3,999)     1
Prefer not to say                      1
$5,600 – $6,600 (~€5,000 – €5,999)     1
Name: count, dtype: int64


In [16]:
clean_income = {
    
    "moins de 1 500 ": "1500 and less",
    "less than 1,700 (~1,500)": "1500 and less",

    "1 500 – 1 999 ": "1500-1999",
    "1,700 – 2,200 (~1,500 – 1,999)": "1500-1999",

    "2 000 – 2 499 ": "2000-2499",
    "2,200 – 2,700 (~2,000 – 2,499)": "2000-2499",

    "2 500 – 2 999 ": "2500-2999",
    "2,800 – 3,300 (~2,500 – 2,999)": "2500-2999",

    "3 000 – 3 999 ": "3000-3999",
    "3,400 – 4,400 (~3,000 – 3,999)": "3000-3999",

    "4 000 – 4 999 ": "4000–4999",
    "4,500 – 5,500 (~4,000 – 4,999)": "4000–4999",

    "5000 – 5 999 ": "5000–5999",
    "5,600 – 6,600 (~5,000 – 5,999)": "5000–5999",

    "6 000 – 6 999 ": "6000–6999",
    "6,700 – 7,700 (~6,000 – 6,999)": "6000–6999",

    "superieur a 7 000 ": "7000 and more",
    "more than 7,800 (~7,000+)": "7000 and more",

    "prefere ne pas repondre": "Unknown",
    "prefer not to say": "Unknown",
    }

df_clean["household_income_in_€"] = (df_clean["household_income_in_€"]
                            .map(normalize_text)
                            .map(clean_income)
                            .fillna(df_clean["household_income_in_€"]))


In [17]:
print(df_clean["household_income_in_€"].value_counts())

household_income_in_€
1500-1999        12
3000-3999         8
5000–5999         7
2500-2999         6
2000-2499         5
Unknown           5
7000 and more     4
4000–4999         3
1500 and less     3
6000–6999         1
Name: count, dtype: int64


In [18]:
print(df_clean["travel_frequency"].value_counts())

travel_frequency
Une fois par an                    17
Tous les 2–3 ans                   16
Plusieurs fois par an               8
Une fois tous les 5 ans ou plus     5
Jamais                              4
Every 2–3 years                     2
Once every 5 years or more          1
Several times a year                1
Name: count, dtype: int64


In [19]:
clean_travel_frequency = {
    
    "jamais": "Never",
    "une fois tous les 5 ans ou plus": "Once every 5 years or more",
    "tous les 2–3 ans": "Every 2–3 years",
    "une fois par an": "Once a year",
    "plusieurs fois par an": "Several times a year",
    }

df_clean["travel_frequency"] = (df_clean["travel_frequency"]
                            .map(normalize_text)
                            .map(clean_travel_frequency)
                            .fillna(df_clean["travel_frequency"]))


In [20]:
print(df_clean["travel_frequency"].value_counts())

travel_frequency
Every 2–3 years               18
Once a year                   17
Several times a year           9
Once every 5 years or more     6
Never                          4
Name: count, dtype: int64


In [21]:
print(df_clean["been_to_Japan"].value_counts())

been_to_Japan
Non, mais j’aimerais y aller        33
Non, et je ne suis pas intéressé     7
Oui, une fois                        6
Oui, plusieurs fois                  4
Yes, several times                   2
No, but I would like to go           2
Name: count, dtype: int64


In [22]:
clean_been_to_japan = {

    "oui, une fois": "Yes, once",
    "oui, plusieurs fois": "Yes, several times",
    "non, mais j’aimerais y aller": "No, but I would like to go",
    "non, et je ne suis pas interesse": "No, and I’m not interested",
    }

df_clean["been_to_Japan"] = (df_clean["been_to_Japan"]
                            .map(normalize_text)
                            .map(clean_been_to_japan)
                            .fillna(df_clean["been_to_Japan"]))

print(df_clean["been_to_Japan"].value_counts())

been_to_Japan
No, but I would like to go    35
No, and I’m not interested     7
Yes, several times             6
Yes, once                      6
Name: count, dtype: int64


In [23]:
print(df_clean["Japan_vac_duration"].value_counts())

Japan_vac_duration
2 semaines                              17
3 semaines                               8
Je ne sais pas / Pas assez renseigné     7
1 semaine                                6
4 semaines                               3
Plus de 4 semaines                       2
2 weeks                                  2
More than 4 weeks                        1
4 weeks                                  1
Name: count, dtype: int64


In [24]:
clean_Japan_vac_duration = {
    
    "1 semaine": "1 week",
    "2 semaines": "2 weeks",
    "3 semaines": "3 weeks",
    "4 semaines": "4 weeks",
    "plus de 4 semaines": "More than 4 weeks",
    "je ne sais pas / pas assez renseigne": "I don’t know yet / Not sure"
    }

df_clean["Japan_vac_duration"] = (df_clean["Japan_vac_duration"]
                            .map(normalize_text)
                            .map(clean_Japan_vac_duration)
                            .fillna(df_clean["Japan_vac_duration"]))

print(df_clean["Japan_vac_duration"].value_counts())

Japan_vac_duration
2 weeks                        19
3 weeks                         8
I don’t know yet / Not sure     7
1 week                          6
4 weeks                         4
More than 4 weeks               3
Name: count, dtype: int64


# 5. Multi-choice question processing

In [25]:
print(df_clean["most_wanted_pref_to_visit"].value_counts())

most_wanted_pref_to_visit
Je n’ai pas encore d’idée précise, j’ai besoin d’y réfléchir ou de me renseigner.                                                                                                                                                                                                                                                    13
Tokyo et sa région (Kanto), Kyoto / Osaka / Nara (Kansai)                                                                                                                                                                                                                                                                             5
Tokyo et sa région (Kanto), Kyoto / Osaka / Nara (Kansai), Chūbu (Nagoya, Alpes japonaises, Kanazawa, Takayama, Mont Fuji côté Yamanashi/Shizuoka)                                                                                                                                                                            

In [26]:
def smart_split(val):
    if pd.isna(val):
        return[]
    s = str(val)
    parts = re.split(r',(?![^()]*\))', s)
    parts = [p.strip() for p in parts if p.strip()]
    return parts

In [27]:
regions_list = df_clean["most_wanted_pref_to_visit"].apply(smart_split)

In [28]:
MAX_CHOICES = 5

def list_to_fixed_cols(lst, k=MAX_CHOICES):
    lst = (lst + [np.nan]*k) [:k]
    return pd.Series(lst, index=[f"most_wanted_pref_to_visit_{i+1}" for i in range(k)])

df_prefs = regions_list.apply(list_to_fixed_cols)
df_clean = pd.concat([df_clean, df_prefs], axis=1)


In [29]:
display(df_clean.head())

Unnamed: 0,Horodateur,nationality,country,age_group,family_situation,household_income_in_€,travel_frequency,been_to_Japan,Japan_vac_duration,most_wanted_pref_to_visit,...,trip_prep,booking_trip_channel,most_influencial_reason_to_choose_dest,alt_dest_most_difficulties,recomendation_to_improve_attractiveness,most_wanted_pref_to_visit_1,most_wanted_pref_to_visit_2,most_wanted_pref_to_visit_3,most_wanted_pref_to_visit_4,most_wanted_pref_to_visit_5
0,05/10/2025 13:09:39,France,France,35-44,Married_no_kids,1500-1999,Every 2–3 years,"Yes, several times",More than 4 weeks,"Kyoto / Osaka / Nara (Kansai), Région du Tohok...",...,Réseaux sociaux / influenceurs,Site officiel de compagnies aériennes ou hôtels,Explorer le patrimoine culturel et historique ...,"Barrière de la langue, Difficultés avec les tr...",Le Japon est parfait tel qu'il est,Kyoto / Osaka / Nara (Kansai),"Région du Tohoku (ex. Yamagata, Sendai)","Shikoku (île du pèlerinage des 88 temples, Mat...",,
1,06/10/2025 13:30:50,France,France,45-54,Relationship_with_kids,1500-1999,Every 2–3 years,"No, but I would like to go",2 weeks,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,"Guides papier (Lonely Planet, Routard…)","Agence en ligne (ex. Expedia, Booking.com)",Explorer le patrimoine culturel et historique ...,"Coût de la vie (hébergement, nourriture, activ...","Déjà très attractif pour moi, juste une questi...",Je n’ai pas encore d’idée précise,j’ai besoin d’y réfléchir ou de me renseigner.,,,
2,06/10/2025 17:20:05,France,France,35-44,Single,1500-1999,Once a year,"No, but I would like to go",3 weeks,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,"Sites spécialisés (Voyageurs du Monde, Comptoi...","Plateformes collaboratives (Airbnb, etc.)",Vivre une expérience unique ou dépaysante,"Barrière de la langue, Problèmes liés à la loc...",son prix,Je n’ai pas encore d’idée précise,j’ai besoin d’y réfléchir ou de me renseigner.,,,
3,06/10/2025 19:47:27,France,France,45-54,Single,2000-2499,Every 2–3 years,"No, but I would like to go",1 week,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,"Guides papier (Lonely Planet, Routard…)","Agence en ligne (ex. Expedia, Booking.com)",Vivre une expérience unique ou dépaysante,combiner toutes les activités avec les lieux d...,un guide chatgpt,Je n’ai pas encore d’idée précise,j’ai besoin d’y réfléchir ou de me renseigner.,,,
4,06/10/2025 20:56:00,France,France,45-54,Married_no_kids,2500-2999,Every 2–3 years,"No, but I would like to go",3 weeks,Kyoto / Osaka / Nara (Kansai),...,"Sites spécialisés (Voyageurs du Monde, Comptoi...","Agence en ligne (ex. Expedia, Booking.com)",Découvrir la nature et les paysages,Barrière de la langue,Moins cher,Kyoto / Osaka / Nara (Kansai),,,,


In [30]:
clean_most_wanted_pref_to_visit = {
    
    "tokyo et sa region (kanto)": "Kanto",
    "tokyo and its region (kanto)": "Kanto",

    "kyoto / osaka / nara (kansai)": "Kansai",

    "region du tohoku (ex. yamagata, sendai)": "Tohoku",
    "tohoku region (e.g. yamagata, sendai)": "Tohoku",

    "chūgoku (hiroshima, miyajima, okayama, matsue)": "Chūgoku",

    "shikoku (ile du pelerinage des 88 temples, matsuyama, iya valley)": "Shikoku",
    "shikoku (88 temple pilgrimage island, matsuyama, iya valley)": "Shikoku",

    "chūbu (nagoya, alpes japonaises, kanazawa, takayama, mont fuji côte yamanashi/shizuoka)": "Chūbu",
    "chubu (nagoya, japanese alps, kanazawa, takayama, mt. fuji – yamanashi/shizuoka side)": "Chūbu",

    "je n’ai pas encore d’idee precise, j’ai besoin d’y reflechir ou de me renseigner.": "Unknown",
    "j’ai besoin d’y reflechir ou de me renseigner.": None,
    "je n’ai pas encore d’idee precise": "Unknown",
    "i don’t have a clear idea yet / i need to think or find out more": "Unknown",
    }

cols = ["most_wanted_pref_to_visit_1", "most_wanted_pref_to_visit_2", "most_wanted_pref_to_visit_3",
        "most_wanted_pref_to_visit_4", "most_wanted_pref_to_visit_5"]
df_clean[cols] = (df_clean[cols]
                            .applymap(normalize_text)
                            .applymap(lambda x: clean_most_wanted_pref_to_visit.get(x, x)))



  .applymap(normalize_text)
  .applymap(lambda x: clean_most_wanted_pref_to_visit.get(x, x)))


In [31]:
print(df_clean["most_wanted_pref_to_visit_1"].value_counts())
print(df_clean["most_wanted_pref_to_visit_2"].value_counts())
print(df_clean["most_wanted_pref_to_visit_3"].value_counts())
print(df_clean["most_wanted_pref_to_visit_4"].value_counts())
print(df_clean["most_wanted_pref_to_visit_5"].value_counts())

most_wanted_pref_to_visit_1
Kanto       25
Unknown     14
Kansai       7
hokkaido     1
Name: count, dtype: int64
most_wanted_pref_to_visit_2
Kansai      23
Tohoku       4
hokkaido     1
Chūgoku      1
okinawa      1
Unknown      1
Name: count, dtype: int64
most_wanted_pref_to_visit_3
hokkaido    8
Shikoku     4
Chūbu       4
okinawa     3
Unknown     3
Chūgoku     1
Tohoku      1
Name: count, dtype: int64
most_wanted_pref_to_visit_4
okinawa    7
Name: count, dtype: int64
most_wanted_pref_to_visit_5
Tohoku     2
Chūgoku    2
Chūbu      1
Shikoku    1
Unknown    1
Name: count, dtype: int64


In [32]:
print(df_clean["rating_interest_culture_and_history"].value_counts())

rating_interest_culture_and_history
Très important          18
Essentiel               12
Assez important         10
Peu important            3
Essential                2
Moderately important     2
Name: count, dtype: int64


In [33]:
clean_rating_japan = {
    "Pas du tout important": "Not important at all",
    "Peu important": "Slightly important",
    "Assez important": "Moderately important",
    "Très important": "Very important",
    "Essentiel": "Essential",

}
rating_cols = ['rating_interest_culture_and_history', 'rating_interest_food',
       'rating_interest_nature_hiking', 'rating_interest_shopping_and_techno',
       'rating_interest_events_and_festivals', 'rating_interest_wellness',
       'rating_interest_theme_park']
df_clean[rating_cols] = (df_clean[rating_cols]
                            .applymap(lambda x: clean_rating_japan.get(x, x)))



  .applymap(lambda x: clean_rating_japan.get(x, x)))


In [34]:
print(df_clean["rating_interest_culture_and_history"].value_counts())
print(df_clean["rating_interest_food"].value_counts())
print(df_clean["rating_interest_nature_hiking"].value_counts())
print(df_clean["rating_interest_shopping_and_techno"].value_counts())
print(df_clean["rating_interest_events_and_festivals"].value_counts())
print(df_clean["rating_interest_wellness"].value_counts())
print(df_clean["rating_interest_theme_park"].value_counts())

rating_interest_culture_and_history
Very important          18
Essential               14
Moderately important    12
Slightly important       3
Name: count, dtype: int64
rating_interest_food
Essential               24
Very important          10
Moderately important    10
Slightly important       3
Name: count, dtype: int64
rating_interest_nature_hiking
Very important          17
Essential               16
Moderately important    11
Slightly important       2
Not important at all     1
Name: count, dtype: int64
rating_interest_shopping_and_techno
Slightly important      16
Moderately important    12
Not important at all     8
Very important           6
Essential                5
Name: count, dtype: int64
rating_interest_events_and_festivals
Moderately important    20
Slightly important      10
Very important           9
Not important at all     6
Essential                2
Name: count, dtype: int64
rating_interest_wellness
Moderately important    15
Very important          12
Slightly i

In [35]:
print(df_clean["Japan_budget_per_week"].value_counts())

Japan_budget_per_week
Je ne sais pas / Pas assez renseigné    10
500 – 1 000 €                            9
1 000 – 1 500 €                          8
1 500 – 2 500 €                          8
Plus de 2 500 €                          6
Moins de 500 €                           2
More than $2,750 (~€2,500+)              2
$550 – $1,100 (~€500 – €1,000)           1
Less than $550 (~€500)                   1
Name: count, dtype: int64


In [36]:
clean_budget_japan = {
    "Moins de 500 €": "Less than 500",
    "Less than $550 (~€500)": "Less than 500",

    "500 – 1 000 €": "500-1000",
    "$550 – $1,100 (~€500 – €1,000)": "500-1000",

    "1 000 – 1 500 €": "1000-1500",
    "$1,100 – $1,650 (~€1,000 – €1,500)": "1000-1500",

    "1 500 – 2 500 €": "1500-2500",
    "$1,650 – $2,750 (~€1,500 – €2,500)": "1500-2500",

    "Plus de 2 500 €": "More than 2500",
    "More than $2,750 (~€2,500+)": "More than 2500",

    "Je ne sais pas / Pas assez renseigné": "Unknown",
    "I don’t know / Not sure yet": "Unknown",

}
df_clean["Japan_budget_per_week"] = (df_clean["Japan_budget_per_week"]
                            .map(clean_budget_japan)
                            .fillna(df_clean["Japan_budget_per_week"]))


In [37]:
print(df_clean["Japan_budget_per_week"].value_counts())

Japan_budget_per_week
500-1000          10
Unknown           10
1000-1500          8
1500-2500          8
More than 2500     8
Less than 500      3
Name: count, dtype: int64


In [38]:
print(df_clean["Japan_prefered_accomodation"].value_counts())

Japan_prefered_accomodation
Hôtel classique (3–4 étoiles)             17
Ryokan (auberge traditionnelle)           12
Airbnb / logement chez l’habitant          7
Hôtel haut de gamme / luxe (5 étoiles)     5
Capsule hôtel                              1
hostel/ auberge de jeunesse                1
Airbnb / homestay                          1
Standard hotel (3–4 stars)                 1
Any                                        1
Luxury / high-end hotel (5 stars)          1
Name: count, dtype: int64


In [39]:
clean_japan_accomodation = {
    "Hôtel classique (3–4 étoiles)": "Standard hotel (3–4 stars)",
    "Hôtel haut de gamme / luxe (5 étoiles)": "Luxury / high-end hotel (5 stars)",
    "Ryokan (auberge traditionnelle)": "Ryokan (traditional Japanese inn)",
    "Capsule hôtel": "Capsule hotel",
    "Airbnb / logement chez l’habitant": "Airbnb / homestay",
    "hostel/ auberge de jeunesse": "Hostel"
}
df_clean["Japan_prefered_accomodation"] = (df_clean["Japan_prefered_accomodation"]
                                           .map(clean_japan_accomodation)
                                           .fillna(df_clean["Japan_prefered_accomodation"]))

print(df_clean["Japan_prefered_accomodation"].value_counts())

Japan_prefered_accomodation
Standard hotel (3–4 stars)           18
Ryokan (traditional Japanese inn)    12
Airbnb / homestay                     8
Luxury / high-end hotel (5 stars)     6
Capsule hotel                         1
Hostel                                1
Any                                   1
Name: count, dtype: int64


In [40]:
print(df_clean["Japan_most_difficulties"].value_counts())

Japan_most_difficulties
La barrière de la langue                                                                                                                                                                                                                           7
La barrière de la langue, Le coût de la vie sur place (hébergement, restauration, activités)                                                                                                                                                       6
La barrière de la langue, Le coût de la vie sur place (hébergement, restauration, activités), L’affluence touristique (lieux bondés, files d’attente)                                                                                              5
L’affluence touristique (lieux bondés, files d’attente)                                                                                                                                                                                          

In [41]:
def smart_split_diff(val):
    if pd.isna(val):
        return[]
    s = str(val)
    parts = re.split(r',(?![^()]*\))', s)
    parts = [p.strip() for p in parts if p.strip()]
    return parts

diffs_list = df_clean["Japan_most_difficulties"].apply(smart_split_diff)

MAX_CHOICES_DIFF = 5

def list_to_fixed_cols_diffs(lst, k=MAX_CHOICES_DIFF):
    lst = (lst + [np.nan]*k) [:k]
    return pd.Series(lst, index=[f"Japan_most_difficulties_{i+1}" for i in range(k)])

df_diffs = diffs_list.apply(list_to_fixed_cols_diffs)
df_clean = pd.concat([df_clean, df_diffs], axis=1)

display(df_clean.head())

Unnamed: 0,Horodateur,nationality,country,age_group,family_situation,household_income_in_€,travel_frequency,been_to_Japan,Japan_vac_duration,most_wanted_pref_to_visit,...,most_wanted_pref_to_visit_1,most_wanted_pref_to_visit_2,most_wanted_pref_to_visit_3,most_wanted_pref_to_visit_4,most_wanted_pref_to_visit_5,Japan_most_difficulties_1,Japan_most_difficulties_2,Japan_most_difficulties_3,Japan_most_difficulties_4,Japan_most_difficulties_5
0,05/10/2025 13:09:39,France,France,35-44,Married_no_kids,1500-1999,Every 2–3 years,"Yes, several times",More than 4 weeks,"Kyoto / Osaka / Nara (Kansai), Région du Tohok...",...,Kansai,Tohoku,Shikoku,,,La barrière de la langue,"L’affluence touristique (lieux bondés, files d...",Le manque d’information touristique en françai...,,
1,06/10/2025 13:30:50,France,France,45-54,Relationship_with_kids,1500-1999,Every 2–3 years,"No, but I would like to go",2 weeks,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,Unknown,,,,,La barrière de la langue,"Le coût de la vie sur place (hébergement, rest...",Le manque d’information touristique en françai...,,
2,06/10/2025 17:20:05,France,France,35-44,Single,1500-1999,Once a year,"No, but I would like to go",3 weeks,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,Unknown,,,,,La barrière de la langue,Les problèmes liés à la location de voiture (p...,"Le coût de la vie sur place (hébergement, rest...",,
3,06/10/2025 19:47:27,France,France,45-54,Single,2000-2499,Every 2–3 years,"No, but I would like to go",1 week,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,Unknown,,,,,Les problèmes liés à la location de voiture (p...,"Le coût de la vie sur place (hébergement, rest...",,,
4,06/10/2025 20:56:00,France,France,45-54,Married_no_kids,2500-2999,Every 2–3 years,"No, but I would like to go",3 weeks,Kyoto / Osaka / Nara (Kansai),...,Kansai,,,,,La barrière de la langue,"Le coût de la vie sur place (hébergement, rest...",,,


In [42]:
print(df_clean["Japan_most_difficulties_1"].value_counts())
#print(df_clean["Japan_most_difficulties_2"].value_counts())
#print(df_clean["Japan_most_difficulties_3"].value_counts())
#print(df_clean["Japan_most_difficulties_4"].value_counts())
#print(df_clean["Japan_most_difficulties_5"].value_counts())

Japan_most_difficulties_1
La barrière de la langue                                                                       32
Le coût de la vie sur place (hébergement, restauration, activités)                              6
Les problèmes liés à la location de voiture (permis international, conduite à gauche, etc.)     4
L’affluence touristique (lieux bondés, files d’attente)                                         4
Les difficultés liées aux transports (train, navigation, réservations)                          2
The language barrier                                                                            2
The cost of living (accommodation, food, activities)                                            2
没兴趣                                                                                             1
Le manque d’information touristique en français/anglais                                         1
Name: count, dtype: int64


In [43]:
clean_most_wanted_pref_to_visit = {
    
    "la barriere de la langue": "Language",
    "the language barrier": "Language",
    
    "les difficultes liees aux transports (train, navigation, reservations)": "Transportation",
    "difficulties with transportation (trains, navigation, reservations)": "Transportation",

    "les problemes lies a la location de voiture (permis international, conduite a gauche, etc.)": "Car rental",
    "problems with car rental (international license, driving on the left, etc.)": "Car rental",
    
    "le coût de la vie sur place (hebergement, restauration, activites)": "Expensive",
    "the cost of living (accommodation, food, activities)": "Expensive",
    "expensive": "Expensive",

    "l’affluence touristique (lieux bondes, files d’attente)": "Crowded/Popularity",
    "tourist crowds (busy places, long queues)": "Crowded/Popularity",

    "le manque d’information touristique en français/anglais": "Translation",
    "lack of tourist information in english or french": "Translation",

    "没兴趣": None,
    "catastrophe naturelle": "Disaster",
    "c'est la destination trop a la mode que tout le monde veut faire. je prefere les destinations qui sortent du lot.": "Crowded/Popularity",
    "les insupportables fans du japon": "Crowded/Popularity"

    }

cols = ["Japan_most_difficulties_1", "Japan_most_difficulties_2", "Japan_most_difficulties_3",
        "Japan_most_difficulties_4", "Japan_most_difficulties_5"]
df_clean[cols] = (df_clean[cols]
                            .applymap(normalize_text)
                            .applymap(lambda x: clean_most_wanted_pref_to_visit.get(x, x)))



  .applymap(normalize_text)
  .applymap(lambda x: clean_most_wanted_pref_to_visit.get(x, x)))


In [44]:
print(df_clean["Japan_most_difficulties_1"].value_counts())
#print(df_clean["Japan_most_difficulties_2"].value_counts())
#print(df_clean["Japan_most_difficulties_3"].value_counts())
#print(df_clean["Japan_most_difficulties_4"].value_counts())
#print(df_clean["Japan_most_difficulties_5"].value_counts())

Japan_most_difficulties_1
Language              34
Expensive              8
Car rental             4
Crowded/Popularity     4
Transportation         2
Translation            1
Name: count, dtype: int64


In [45]:
print(df_clean["alternative_destination"].value_counts())

alternative_destination
Corée du Sud          16
USA / Canada          11
Autres pays d’Asie     6
Thaïlande              5
Europe                 4
Chine                  3
Vietnam                2
Thailand               2
Amérique du Sud        1
Afrique australe       1
tout est possible      1
Taiwan                 1
China                  1
Name: count, dtype: int64


In [46]:
clean_alternative_destination = {
    
    "coree du sud": "South Korea",
    "south korea": "South Korea",

    "chine": "China",
    "china": "China",

    "thailande": "Thailand",

    "vietnam": "Vietnam",

    "autres pays d’asie": "Asia",
    "Other Asian countries": "Asia",
    "asia": "Asia",

    "usa / canada": "USA / Canada",
    "usa": "USA / Canada",
    "canada": "USA / Canada",

    "europe": "Europe",

    }

df_clean["alternative_destination"] = (df_clean["alternative_destination"]
                            .map(normalize_text)
                            .map(clean_alternative_destination)
                            .fillna(df_clean["alternative_destination"]))

print(df_clean["alternative_destination"].value_counts())

alternative_destination
South Korea          16
USA / Canada         11
Thailand              7
Asia                  6
Europe                4
China                 4
Vietnam               2
Amérique du Sud       1
Afrique australe      1
tout est possible     1
Taiwan                1
Name: count, dtype: int64


In [47]:
print(df_clean["alt_dest_main_reason"].value_counts())

alt_dest_main_reason
Influence des amis/de la famille                                                                                                     13
Moins cher                                                                                                                           11
Plus pratique                                                                                                                         8
Déjà familier                                                                                                                         7
Influence from friends or family                                                                                                      2
没兴趣                                                                                                                                   1
Cheaper                                                                                                                               1
Pas trop de raisons particu

In [48]:
clean_alt_dest_reason = {
    
    # Cost
    "moins cher": "Cost",
    "cheaper": "Cost",

    # Distance / Convenience
    "plus proche": "Convenience",
    "plus pratique": "Convenience",

    # Familiarity
    "deja familier": "Familiarity",
    "already familiar with the destination": "Familiarity",

    # Social Influence
    "influence des amis/de la famille": "Social",
    "influence from friends or family": "Social",

    # Nature / landscapes
    "l'asie en general m'attire enormement et j'ai vu dans des reportages des endroits de chine merveilleux que j'aimerais decouvrir !": "Nature",
    "pour l'histoire, les paysages,...": "Nature",
    "paysages": "Nature",
    "grands espaces": "Nature",

    # Cultural interest
    "drama et k-pop": "Cultural",
    "plus d'interet personnelle, moins touristique et plus singulier": "Cultural",

    # No specific reason
    "grand interet": "None",
    "没兴趣": "None",
    "aucune": "None",
    "pas trop de raisons particulieres si ce n'est qu'ils sont cousins français :)": "None",

    }

df_clean["alt_dest_main_reason"] = (df_clean["alt_dest_main_reason"]
                            .map(normalize_text)
                            .map(clean_alt_dest_reason)
                            .fillna(df_clean["alt_dest_main_reason"]))

print(df_clean["alt_dest_main_reason"].value_counts())

alt_dest_main_reason
Social         15
Cost           12
Convenience     9
Familiarity     8
None            4
Nature          4
Cultural        2
Name: count, dtype: int64


In [49]:
print(df_clean["alt_dest_prefered_accomodation"].value_counts())

alt_dest_prefered_accomodation
Hôtel classique (3–4 étoiles)             23
Location type Airbnb / appartement        18
Hôtel haut de gamme / luxe (5 étoiles)     8
Auberge de jeunesse                        1
Airbnb-style rental / apartment            1
Standard hotel (3–4 stars)                 1
Hostel                                     1
Luxury / high-end hotel (5 stars)          1
Name: count, dtype: int64


In [50]:
clean_alt_pref_accomodation = {
    
    "hôtel classique (3–4 etoiles)": "Standard hotel (3–4 stars)",
    "hôtel haut de gamme / luxe (5 etoiles)": "Luxury / high-end hotel (5 stars)",
    "location type airbnb / appartement": "Airbnb-style rental / apartment",
    "auberge de jeunesse": "Hostel",
    "resort / club vacances": "Resort / holiday club",

    }

df_clean["alt_dest_prefered_accomodation"] = (df_clean["alt_dest_prefered_accomodation"]
                            .map(normalize_text)
                            .map(clean_alt_pref_accomodation)
                            .fillna(df_clean["alt_dest_prefered_accomodation"]))

print(df_clean["alt_dest_prefered_accomodation"].value_counts())

alt_dest_prefered_accomodation
Standard hotel (3–4 stars)           24
Airbnb-style rental / apartment      19
Luxury / high-end hotel (5 stars)     9
Hostel                                2
Name: count, dtype: int64


In [51]:
print(df_clean["alt_dest_budget_per_week"].value_counts())

alt_dest_budget_per_week
500 – 1 000 €                         25
1 000 – 1 500 €                       12
1 500 – 2 500 €                        6
Moins de 500 €                         5
Plus de 2 500 €                        2
Less than $550 (~€500)                 2
$550 – $1,100 (~€500 – €1,000)         1
$1,650 – $2,750 (~€1,500 – €2,500)     1
Name: count, dtype: int64


In [52]:
df_clean["alt_dest_budget_per_week"] = (df_clean["alt_dest_budget_per_week"]
                            .map(clean_budget_japan)
                            .fillna(df_clean["alt_dest_budget_per_week"]))

print(df_clean["alt_dest_budget_per_week"].value_counts())

alt_dest_budget_per_week
500-1000          26
1000-1500         12
Less than 500      7
1500-2500          7
More than 2500     2
Name: count, dtype: int64


In [53]:
print(df_clean["alt_dest_transportation"].value_counts())

alt_dest_transportation
Transport en commun (bus, métro, train)           31
Voiture de location                               10
Taxi / VTC (Uber, Grab…)                           9
Public transportation (bus, subway, train)         3
Taxi / Ride-hailing service (Uber, Grab, etc.)     1
Name: count, dtype: int64


In [54]:
clean_alt_dest_transport = {
    "transport en commun (bus, metro, train)": "Public transportation",
    "public transportation (bus, subway, train)": "Public transportation",

    "voiture de location": "Rental",
    "rental car": "Rental",

    "taxi / vtc (uber, grab…)": "Taxi",
    "taxi / ride-hailing service (uber, grab, etc.)": "Taxi",

    "bus touristiques / circuits organises": "Organized tours",
    "tourist buses / organized tours": "Organized tours",
    
}

df_clean["alt_dest_transportation"] = (df_clean["alt_dest_transportation"]
                                       .map(normalize_text)
                                       .map(clean_alt_dest_transport)
                                       .fillna(df_clean["alt_dest_transportation"]))

print(df_clean["alt_dest_transportation"].value_counts())

alt_dest_transportation
Public transportation    34
Taxi                     10
Rental                   10
Name: count, dtype: int64


In [55]:
print(df_clean["trip_prep"].value_counts())

trip_prep
Guides papier (Lonely Planet, Routard…)                                                        11
Sites spécialisés (Voyageurs du Monde, Comptoir des Voyages…)                                  11
Réseaux sociaux / influenceurs                                                                  8
Blogs de voyage                                                                                 8
Bouche-à-oreille / amis                                                                         7
Agence de voyages                                                                               5
Social media / influencers                                                                      2
Specialized travel websites (e.g. Audley Travel, Intrepid Travel, Responsible Travel, etc.)     1
Printed travel guides (e.g. Lonely Planet, Routard, etc.)                                       1
Name: count, dtype: int64


In [56]:
clean_trip_prep = {
    
    "agence de voyages": "Agency",
    "travel agency": "Agency",

    "sites specialises (voyageurs du monde, comptoir des voyages…)": "Websites",
    "specialized travel websites (e.g. audley travel, intrepid travel, responsible travel, etc.)": "Websites",

    "reseaux sociaux / influenceurs": "Influencers",
    "social media / influencers": "Influencers",

    "blogs de voyage": "Blogs",
    "travel blogs": "Blogs",

    "guides papier (lonely planet, routard…)": "Books",
    "printed travel guides (e.g. lonely planet, routard, etc.)": "Books",

    "bouche-a-oreille / amis": "Social",
    "word of mouth / friends": "Social",

    }

df_clean["trip_prep"] = (df_clean["trip_prep"]
                            .map(normalize_text)
                            .map(clean_trip_prep)
                            .fillna(df_clean["trip_prep"]))

print(df_clean["trip_prep"].value_counts())

trip_prep
Books          12
Websites       12
Influencers    10
Blogs           8
Social          7
Agency          5
Name: count, dtype: int64


In [57]:
print(df_clean["booking_trip_channel"].value_counts())

booking_trip_channel
Agence en ligne (ex. Expedia, Booking.com)          28
Site officiel de compagnies aériennes ou hôtels     11
Plateformes collaboratives (Airbnb, etc.)            6
Agence de voyages physique                           5
Official airline or hotel website                    2
Online travel agency (e.g. Expedia, Booking.com)     2
Name: count, dtype: int64


In [58]:
clean_booking_trip_channel = {
    
    "agence en ligne (ex. expedia, booking.com)": "Online agency",
    "online travel agency (e.g. expedia, booking.com)": "Online agency",

    "site officiel de compagnies aeriennes ou hôtels": "Direct",
    "official airline or hotel website": "Direct",

    "agence de voyages physique": "Store",
    "physical travel agency": "Store",

    "plateformes collaboratives (airbnb, etc.)": "Platforms",
    "collaborative platforms (e.g. airbnb, etc.)": "Platforms",

    }

df_clean["booking_trip_channel"] = (df_clean["booking_trip_channel"]
                            .map(normalize_text)
                            .map(clean_booking_trip_channel)
                            .fillna(df_clean["booking_trip_channel"]))

print(df_clean["booking_trip_channel"].value_counts())

booking_trip_channel
Online agency    30
Direct           13
Platforms         6
Store             5
Name: count, dtype: int64


In [59]:
print(df_clean["most_influencial_reason_to_choose_dest"].value_counts())

most_influencial_reason_to_choose_dest
Explorer le patrimoine culturel et historique (monuments, musées…)       12
Vivre une expérience unique ou dépaysante                                12
Découvrir la nature et les paysages                                      11
Découvrir la gastronomie locale                                           6
Rejoindre des proches (famille, amis)                                     4
Se détendre / se ressourcer                                               3
Profiter de la plage et du climat                                         2
Exploring cultural and historical heritage (monuments, museums, etc.)     1
Discovering nature and landscapes                                         1
Discovering local gastronomy                                              1
Relaxing / recharging                                                     1
Name: count, dtype: int64


In [60]:
clean_most_influencial_reason_to_choose_dest = {
    
    "decouvrir la nature et les paysages": "Nature",
    "discovering nature and landscapes": "Nature",

    "profiter de la plage et du climat": "Beaches",
    "enjoying the beach and the climate": "Beaches",

    "explorer le patrimoine culturel et historique (monuments, musees…)": "Cultural",
    "exploring cultural and historical heritage (monuments, museums, etc.)": "Cultural",

    "decouvrir la gastronomie locale": "Food",
    "discovering local gastronomy": "Food",

    "vivre une experience unique ou depaysante": "Uniqueness",
    "experiencing something unique or different": "Uniqueness",

    "rejoindre des proches (famille, amis)": "Family",
    "visiting relatives or friends": "Family",

    "se detendre / se ressourcer": "Relaxing",
    "relaxing / recharging": "Relaxing",

    }

df_clean["most_influencial_reason_to_choose_dest"] = (df_clean["most_influencial_reason_to_choose_dest"]
                            .map(normalize_text)
                            .map(clean_most_influencial_reason_to_choose_dest)
                            .fillna(df_clean["most_influencial_reason_to_choose_dest"]))

print(df_clean["most_influencial_reason_to_choose_dest"].value_counts())

most_influencial_reason_to_choose_dest
Cultural      13
Uniqueness    12
Nature        12
Food           7
Family         4
Relaxing       4
Beaches        2
Name: count, dtype: int64


In [61]:
print(df_clean["alt_dest_most_difficulties"].value_counts())

alt_dest_most_difficulties
Coût de la vie (hébergement, nourriture, activités), Foule touristique (sites bondés, files d’attente)                                                                               8
Foule touristique (sites bondés, files d’attente)                                                                                                                                    5
Barrière de la langue                                                                                                                                                                4
Barrière de la langue, Coût de la vie (hébergement, nourriture, activités), Foule touristique (sites bondés, files d’attente)                                                        3
Coût de la vie (hébergement, nourriture, activités)                                                                                                                                  3
Barrière de la langue, Foule touristique (sites bondés, fi

In [62]:
alt_dest_diff_list = df_clean["alt_dest_most_difficulties"].apply(smart_split)
MAX_CHOICES = 5

def list_to_fixed_cols(lst, k=MAX_CHOICES):
    lst = (lst + [np.nan]*k) [:k]
    return pd.Series(lst, index=[f"alt_dest_most_difficulties_{i+1}" for i in range(k)])

df_alt_dest_diff_list = alt_dest_diff_list.apply(list_to_fixed_cols)
df_clean = pd.concat([df_clean, df_alt_dest_diff_list], axis=1)

In [63]:
df_clean.head()

Unnamed: 0,Horodateur,nationality,country,age_group,family_situation,household_income_in_€,travel_frequency,been_to_Japan,Japan_vac_duration,most_wanted_pref_to_visit,...,Japan_most_difficulties_1,Japan_most_difficulties_2,Japan_most_difficulties_3,Japan_most_difficulties_4,Japan_most_difficulties_5,alt_dest_most_difficulties_1,alt_dest_most_difficulties_2,alt_dest_most_difficulties_3,alt_dest_most_difficulties_4,alt_dest_most_difficulties_5
0,05/10/2025 13:09:39,France,France,35-44,Married_no_kids,1500-1999,Every 2–3 years,"Yes, several times",More than 4 weeks,"Kyoto / Osaka / Nara (Kansai), Région du Tohok...",...,Language,Crowded/Popularity,Translation,,,Barrière de la langue,"Difficultés avec les transports (train, naviga...",Problèmes liés à la location de voiture (permi...,,
1,06/10/2025 13:30:50,France,France,45-54,Relationship_with_kids,1500-1999,Every 2–3 years,"No, but I would like to go",2 weeks,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,Language,Expensive,Translation,,,"Coût de la vie (hébergement, nourriture, activ...","Foule touristique (sites bondés, files d’attente)",Manque d’informations touristiques en anglais,,
2,06/10/2025 17:20:05,France,France,35-44,Single,1500-1999,Once a year,"No, but I would like to go",3 weeks,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,Language,Car rental,Expensive,,,Barrière de la langue,Problèmes liés à la location de voiture (permi...,"Coût de la vie (hébergement, nourriture, activ...",,
3,06/10/2025 19:47:27,France,France,45-54,Single,2000-2499,Every 2–3 years,"No, but I would like to go",1 week,"Je n’ai pas encore d’idée précise, j’ai besoin...",...,Car rental,Expensive,,,,combiner toutes les activités avec les lieux d...,,,,
4,06/10/2025 20:56:00,France,France,45-54,Married_no_kids,2500-2999,Every 2–3 years,"No, but I would like to go",3 weeks,Kyoto / Osaka / Nara (Kansai),...,Language,Expensive,,,,Barrière de la langue,,,,


In [64]:
clean_alt_dest_most_difficulties = {
    
    "barriere de la langue": "Language",
    "language barrier": "Language",

    "difficultes avec les transports (train, navigation, reservations)": "Transportation",
    "transportation issues (train, navigation, reservations)": "Transportation",
    "combiner toutes les activites avec les lieux d hebergement": "Transportation",

    "problemes lies a la location de voiture (permis international, conduite a gauche, etc.)": "Car rental",
    "car rental problems (international license, driving on the left, etc.)": "Car rental",

    "coût de la vie (hebergement, nourriture, activites)": "Expensive",
    "cost of living (accommodation, food, activities)": "Expensive",

    "foule touristique (sites bondes, files d’attente)": "Crowded",
    "tourist crowds (busy sites, long queues)": "Crowded",
    "les autres voyageurs": "Crowded",

    "manque d’informations touristiques en anglais": "Translation",
    "lack of tourist information in english": "Translation",

    "没兴趣": "None",
    "la meme": "None",
    "adaptation a la nourriture locale": "Food",
    
    "": "",

    }

cols = ["alt_dest_most_difficulties_1", "alt_dest_most_difficulties_2", "alt_dest_most_difficulties_3",
        "alt_dest_most_difficulties_4", "alt_dest_most_difficulties_5"]
df_clean[cols] = (df_clean[cols]
                            .applymap(normalize_text)
                            .applymap(lambda x: clean_alt_dest_most_difficulties.get(x, x)))

print(df_clean["alt_dest_most_difficulties_1"].value_counts())

alt_dest_most_difficulties_1
Language          22
Expensive         16
Crowded            8
Transportation     5
Car rental         2
None               1
Name: count, dtype: int64


  .applymap(normalize_text)
  .applymap(lambda x: clean_alt_dest_most_difficulties.get(x, x)))


In [65]:
print(df_clean["recomendation_to_improve_attractiveness"].value_counts())

recomendation_to_improve_attractiveness
Je ne sais pas                                                                                                                                                    4
Le Japon est parfait tel qu'il est                                                                                                                                1
Culture japonaise                                                                                                                                                 1
Une plus grande ouverture vers la langue anglaise au niveau des services et commerces                                                                             1
manga                                                                                                                                                             1
Des vols décarbonés                                                                                                                         

In [66]:
df_clean.columns

Index(['Horodateur', 'nationality', 'country', 'age_group', 'family_situation',
       'household_income_in_€', 'travel_frequency', 'been_to_Japan',
       'Japan_vac_duration', 'most_wanted_pref_to_visit',
       'rating_interest_culture_and_history', 'rating_interest_food',
       'rating_interest_nature_hiking', 'rating_interest_shopping_and_techno',
       'rating_interest_events_and_festivals', 'rating_interest_wellness',
       'rating_interest_theme_park', 'Japan_budget_per_week',
       'Japan_prefered_accomodation', 'Japan_most_difficulties',
       'alternative_destination', 'alt_dest_main_reason',
       'alt_dest_prefered_accomodation', 'alt_dest_budget_per_week',
       'alt_dest_transportation', 'trip_prep', 'booking_trip_channel',
       'most_influencial_reason_to_choose_dest', 'alt_dest_most_difficulties',
       'recomendation_to_improve_attractiveness',
       'most_wanted_pref_to_visit_1', 'most_wanted_pref_to_visit_2',
       'most_wanted_pref_to_visit_3', 'most_wa

In [67]:
pd.set_option('display.max_columns', None)
display(df_clean.head())

Unnamed: 0,Horodateur,nationality,country,age_group,family_situation,household_income_in_€,travel_frequency,been_to_Japan,Japan_vac_duration,most_wanted_pref_to_visit,rating_interest_culture_and_history,rating_interest_food,rating_interest_nature_hiking,rating_interest_shopping_and_techno,rating_interest_events_and_festivals,rating_interest_wellness,rating_interest_theme_park,Japan_budget_per_week,Japan_prefered_accomodation,Japan_most_difficulties,alternative_destination,alt_dest_main_reason,alt_dest_prefered_accomodation,alt_dest_budget_per_week,alt_dest_transportation,trip_prep,booking_trip_channel,most_influencial_reason_to_choose_dest,alt_dest_most_difficulties,recomendation_to_improve_attractiveness,most_wanted_pref_to_visit_1,most_wanted_pref_to_visit_2,most_wanted_pref_to_visit_3,most_wanted_pref_to_visit_4,most_wanted_pref_to_visit_5,Japan_most_difficulties_1,Japan_most_difficulties_2,Japan_most_difficulties_3,Japan_most_difficulties_4,Japan_most_difficulties_5,alt_dest_most_difficulties_1,alt_dest_most_difficulties_2,alt_dest_most_difficulties_3,alt_dest_most_difficulties_4,alt_dest_most_difficulties_5
0,05/10/2025 13:09:39,France,France,35-44,Married_no_kids,1500-1999,Every 2–3 years,"Yes, several times",More than 4 weeks,"Kyoto / Osaka / Nara (Kansai), Région du Tohok...",Essential,Essential,Very important,Slightly important,Moderately important,Slightly important,Not important at all,500-1000,Airbnb / homestay,"La barrière de la langue, L’affluence touristi...",Thailand,Cost,Airbnb-style rental / apartment,500-1000,Taxi,Influencers,Direct,Cultural,"Barrière de la langue, Difficultés avec les tr...",Le Japon est parfait tel qu'il est,Kansai,Tohoku,Shikoku,,,Language,Crowded/Popularity,Translation,,,Language,Transportation,Car rental,,
1,06/10/2025 13:30:50,France,France,45-54,Relationship_with_kids,1500-1999,Every 2–3 years,"No, but I would like to go",2 weeks,"Je n’ai pas encore d’idée précise, j’ai besoin...",Essential,Essential,Essential,Not important at all,Slightly important,Moderately important,Not important at all,500-1000,Airbnb / homestay,"La barrière de la langue, Le coût de la vie su...",Asia,Cost,Standard hotel (3–4 stars),Less than 500,Public transportation,Books,Online agency,Cultural,"Coût de la vie (hébergement, nourriture, activ...","Déjà très attractif pour moi, juste une questi...",Unknown,,,,,Language,Expensive,Translation,,,Expensive,Crowded,Translation,,
2,06/10/2025 17:20:05,France,France,35-44,Single,1500-1999,Once a year,"No, but I would like to go",3 weeks,"Je n’ai pas encore d’idée précise, j’ai besoin...",Very important,Very important,Very important,Slightly important,Moderately important,Moderately important,Not important at all,Unknown,Ryokan (traditional Japanese inn),"La barrière de la langue, Les problèmes liés à...",South Korea,Cost,Airbnb-style rental / apartment,500-1000,Public transportation,Websites,Platforms,Uniqueness,"Barrière de la langue, Problèmes liés à la loc...",son prix,Unknown,,,,,Language,Car rental,Expensive,,,Language,Car rental,Expensive,,
3,06/10/2025 19:47:27,France,France,45-54,Single,2000-2499,Every 2–3 years,"No, but I would like to go",1 week,"Je n’ai pas encore d’idée précise, j’ai besoin...",Moderately important,Essential,Essential,Moderately important,Moderately important,Moderately important,Slightly important,1000-1500,Standard hotel (3–4 stars),Les problèmes liés à la location de voiture (p...,South Korea,,Standard hotel (3–4 stars),1000-1500,Public transportation,Books,Online agency,Uniqueness,combiner toutes les activités avec les lieux d...,un guide chatgpt,Unknown,,,,,Car rental,Expensive,,,,Transportation,,,,
4,06/10/2025 20:56:00,France,France,45-54,Married_no_kids,2500-2999,Every 2–3 years,"No, but I would like to go",3 weeks,Kyoto / Osaka / Nara (Kansai),Very important,Very important,Very important,Not important at all,Not important at all,Very important,Not important at all,Unknown,Ryokan (traditional Japanese inn),"La barrière de la langue, Le coût de la vie su...",Vietnam,Cost,Standard hotel (3–4 stars),500-1000,Public transportation,Websites,Online agency,Nature,Barrière de la langue,Moins cher,Kansai,,,,,Language,Expensive,,,,Language,,,,


In [68]:
df_clean = df_clean.drop(columns= ["most_wanted_pref_to_visit", "Japan_most_difficulties", "alt_dest_most_difficulties"])

In [69]:
df_clean.to_csv("df_clean.csv", index=False)
