In [43]:
import pandas as pd
import re

def extract_main_language(languages):
    # Buscar idioma oficial
    official_match = re.findall(r'([\w\s\-]+?)(?:\s|\,)\(official\)', languages, re.IGNORECASE)
    if official_match:
        language = official_match[0].strip()
        return re.sub(r'Castilian Spanish', 'Spanish', language)

    # Extraer idiomas con porcentajes y seleccionar el de mayor porcentaje si no hay oficial
    percentages = re.findall(r'([\w\s\-]+?)\s(\d+)%', languages)
    if percentages:
        percentages = sorted(percentages, key=lambda x: int(x[1]), reverse=True)
        language = percentages[0][0].strip()
        return re.sub(r'Castilian Spanish', 'Spanish', language)

    # Si no se encuentra oficial ni porcentaje, devolver el primer idioma listado
    language = languages.split(',')[0].strip()
    return re.sub(r'Castilian Spanish', 'Spanish', language)

def refine_language_mapping(language, country=None):
    # Remover paréntesis, porcentajes y ajustar el nombre de algunos idiomas
    language = re.sub(r'\s*\(.*\)|\s*\d+%.*', '', language).strip()
    
    # Mapeos manuales adicionales para asegurar coincidencias
    manual_mappings = {
        "Mandarin Chinese": "Mandarin Chinese",
        "Standard Arabic": "Arabic",
        "Egyptian Spoken Arabic": "Arabic",
        "Moroccan Spoken Arabic": "Arabic",
        "Algerian Spoken Arabic": "Arabic",
        "Tunisian Spoken Arabic": "Arabic",
        "Sanaani Spoken Arabic": "Arabic",
        "South Levantine Spoken Arabic": "Arabic",
        "Standard German": "German",
        "Bavarian": "German",
        "Persian": "Iranian Persian",  # Ajustes basados en Top 100 Languages.csv
        "Albanian": "Albanian",
        "Catalan": "Catalan",
        "Armenian": "Armenian",
        "Azerbaijani": "Azerbaijani Turkic",
        "Belarusian": "Belarusian",
        "Dzongkha": "Dzongkha",
        "Bosnian": "Bosnian",
        "Bulgarian": "Bulgarian",
        "Kirundi": "Kirundi",
        "Comorian": "Comorian",
        "Croatian": "Croatian",
        "Danish": "Danish",
        "Tigrinya": "Tigrinya",
        "Estonian": "Estonian",
        "Finnish": "Finnish",
        "Georgian": "Georgian",
        "Haitian Creole": "Haitian Creole",
        "Icelandic": "Icelandic",
        "Hebrew": "Hebrew",
        "Kyrgyz": "Kyrgyz",
        "Lao": "Lao",
        "Latvian": "Latvian",
        "Lithuanian": "Lithuanian",
        "Luxembourgish": "Luxembourgish",
        "Macedonian": "Macedonian",
        "Malagasy": "Malagasy",
        "Chichewa": "Chichewa",
        "Dhivehi": "Dhivehi",
        "Maltese": "Maltese",
        "Marshallese": "Marshallese",
        "Mongolian": "Mongolian",
        "Montenegrin": "Montenegrin",
        "Nauruan": "Nauruan",
        "Norwegian": "Norwegian",
        "Palauan": "Palauan",
        "Samoan": "Samoan",
        "Serbian": "Serbian",
        "Seychellois Creole": "Seychellois Creole",
        "Slovak": "Slovak",
        "Slovenian": "Slovenian",
        "Tajik": "Tajik",
        "Tongan": "Tongan",
        "Turkmen": "Turkmen",
        "Tuvaluan": "Tuvaluan",
        "Uzbek": "Northern Uzbek",  # Ajuste basado en Top 100 Languages.csv
        "Bislama": "Bislama"
        # Otros posibles mapeos manuales...
    }

    # Reglas especiales para ciertos países
    country_specific_mappings = {
        "Germany": "German",
        "Austria": "German",
        "Switzerland": "German",
        "Canada": "English",
        "Belgium": "Dutch",
        "Luxembourg": "Luxembourgish",
        "Morocco": "Arabic",
        "Algeria": "Arabic",
        "Lebanon": "Arabic",
        "Tunisia": "Arabic",
        "Libya": "Arabic",
        "Egypt": "Arabic",
        "Brazil":"Portuguese",
        # Otros países que necesitan reglas específicas
    }

    # Si hay un mapeo específico para el país, utilizarlo
    if country and country in country_specific_mappings:
        return country_specific_mappings[country]

    return manual_mappings.get(language, "")

# Diccionario para agregar idiomas faltantes
language_corrections = {
    # Mismos mapeos que antes
    # ...
}

# Cargar los datasets
data_top_languages = pd.read_csv('data/Top 100 Languages.csv')
data_top_languages['Second Languages'] = data_top_languages['Total Speakers'] - data_top_languages['Native Speakers']
data_countries = pd.read_csv('data/countries-languages.csv')

# Expandir los idiomas hablados y aplicar la función para obtener el idioma principal
data_countries['Main Language'] = data_countries['Languages Spoken'].apply(extract_main_language)

# Aplicar el refinamiento y mapeo manual al dataset de países
data_countries['Main Language'] = data_countries.apply(lambda row: refine_language_mapping(row['Main Language'], row['Country']), axis=1)

# Completar los idiomas faltantes
data_countries['Main Language'] = data_countries.apply(lambda row: language_corrections.get(row['Country'], row['Main Language']), axis=1)

# Filtrar para mantener solo un idioma por país
data_countries = data_countries.groupby('Country').agg({'Main Language': 'first'}).reset_index()

# Unir los datasets en base a los nombres de idiomas normalizados
merged_data = data_top_languages.merge(data_countries, left_on='Language', right_on='Main Language', how='right')

# Rellenar los datos faltantes con los valores agrupados si el idioma ya existe en otro país
for index, row in merged_data.iterrows():
    if pd.isna(row['Language']) and row['Main Language'] in data_top_languages['Language'].values:
        matched_row = data_top_languages[data_top_languages['Language'] == row['Main Language']].iloc[0]
        for col in ['Language', 'Total Speakers', 'Native Speakers', 'Origin', 'Second Languages']:
            merged_data.at[index, col] = matched_row[col]

# Guardar el dataset combinado y ajustado
merged_data.to_csv('data/Combined_Languages_Dataset_Final_Adjusted_Filled.csv', index=False)

print("Proceso completado. Archivo guardado como 'Combined_Languages_Dataset_Final_Adjusted_Filled.csv'")

Unnamed: 0,Country,Languages Spoken,Main Language,Language
0,Afghanistan,"Dari Persian, Pashtu (both official), other Tu...",,
1,Albania,"Albanian (Tosk is the official dialect), Greek",Albanian,Albanian
2,Algeria,"Arabic (official), French, Berber dialects",Arabic,Arabic
3,Andorra,"Catalán (official), French, Castilian, Portuguese",,
4,Angola,"Portuguese (official), Bantu and other African...",,


In [11]:
# Ejemplo de uso de la función con datos de prueba
test_cases = [
    "Dari Persian, Pashtu (both official), other Turkic and minor languages",
    "Albanian (Tosk is the official dialect), Greek",
    "Spanish 40%, French 60% (all official)",
    "Castilian Spanish 74% (official), Catalan, Galician",
    "English (official), local dialects"
]

for case in test_cases:
    print(f"Testing: {case}")
    print(f"Extracted Main Language: {extract_main_language(case)}")

Testing: Dari Persian, Pashtu (both official), other Turkic and minor languages
Extracted Main Language: Dari Persian
Testing: Albanian (Tosk is the official dialect), Greek
Extracted Main Language: Albanian (Tosk is the official dialect)
Testing: Spanish 40%, French 60% (all official)
Extracted Main Language: French
Testing: Castilian Spanish 74% (official), Catalan, Galician
Extracted Main Language: Spanish
Testing: English (official), local dialects
Extracted Main Language: English


In [12]:
def extract_main_language(languages):
    print(f"Processing: {languages}")  # Mostrar entrada actual
    official_match = re.search(r'(\w+[\w\s]*)(?:\s|\,)\(official\)', languages)
    if official_match:
        language = official_match.group(1).strip()
        language = re.sub(r'Castilian Spanish', 'Spanish', language)
        print(f"Official: {language}")  # Mostrar idioma oficial encontrado
        return language

    percentages = re.findall(r'(\w+[\w\s]*?)\s(\d+)%', languages)
    if percentages:
        percentages = sorted(percentages, key=lambda x: int(x[1]), reverse=True)
        language = percentages[0][0].strip()
        language = re.sub(r'Castilian Spanish', 'Spanish', language)
        print(f"By Percentage: {language}")  # Mostrar idioma seleccionado por porcentaje
        return language

    language = languages.split(',')[0].strip()
    language = re.sub(r'Castilian Spanish', 'Spanish', language)
    print(f"By Default: {language}")  # Mostrar idioma por defecto
    return language


In [13]:
import pandas as pd
final_data = pd.read_csv('data/Combined_Languages_Dataset.csv')
print(final_data.head())  # Ver primeras filas
print(final_data[final_data['Country'] == 'Brazil'])  # Ver datos de Brasil
print(final_data[final_data['Country'] == 'Spain'])  # Ver datos de España


  Language  Total Speakers  Native Speakers         Origin  Second Languages  \
0  English      1132366680      379007140.0  Indo-European       753359540.0   
1  English      1132366680      379007140.0  Indo-European       753359540.0   
2  English      1132366680      379007140.0  Indo-European       753359540.0   
3  English      1132366680      379007140.0  Indo-European       753359540.0   
4  English      1132366680      379007140.0  Indo-European       753359540.0   

                          Country  
0             Antigua and Barbuda  
1                          Panama  
2                      Seychelles  
3                           Samoa  
4  St. Vincent and the Grenadines  
   Language  Total Speakers  Native Speakers         Origin  Second Languages  \
44  English      1132366680      379007140.0  Indo-European       753359540.0   

   Country  
44  Brazil  
    Language  Total Speakers  Native Speakers         Origin  \
113  Spanish       534335730      460093030.0  Ind