In [30]:
# Import libraries for data manipulation
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [31]:
# Load the raw salary dataset; the path is relative to the notebook's working directory
df = pd.read_csv('data.csv')

# Preview the first 10 rows; as the cell's last expression the frame is rendered as rich output
print("First few rows of the dataset:")
df.head(10)

First few rows of the dataset:


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M
5,2023,SE,FT,Applied Scientist,222200,USD,222200,US,0,US,L
6,2023,SE,FT,Applied Scientist,136000,USD,136000,US,0,US,L
7,2023,SE,FT,Data Scientist,219000,USD,219000,CA,0,CA,M
8,2023,SE,FT,Data Scientist,141000,USD,141000,CA,0,CA,M
9,2023,SE,FT,Data Scientist,147100,USD,147100,US,0,US,M


In [32]:
# Print every distinct value found in each column of the dataset
print("\nUnique values in each column:")
for column in df:
    unique_values = df[column].unique()
    print(column, unique_values, sep=": ")


Unique values in each column:
work_year: [2023 2022 2020 2021]
experience_level: ['SE' 'MI' 'EN' 'EX']
employment_type: ['FT' 'CT' 'FL' 'PT']
job_title: ['Principal Data Scientist' 'ML Engineer' 'Data Scientist'
 'Applied Scientist' 'Data Analyst' 'Data Modeler' 'Research Engineer'
 'Analytics Engineer' 'Business Intelligence Engineer'
 'Machine Learning Engineer' 'Data Strategist' 'Data Engineer'
 'Computer Vision Engineer' 'Data Quality Analyst'
 'Compliance Data Analyst' 'Data Architect'
 'Applied Machine Learning Engineer' 'AI Developer' 'Research Scientist'
 'Data Analytics Manager' 'Business Data Analyst' 'Applied Data Scientist'
 'Staff Data Analyst' 'ETL Engineer' 'Data DevOps Engineer' 'Head of Data'
 'Data Science Manager' 'Data Manager' 'Machine Learning Researcher'
 'Big Data Engineer' 'Data Specialist' 'Lead Data Analyst'
 'BI Data Engineer' 'Director of Data Science'
 'Machine Learning Scientist' 'MLOps Engineer' 'AI Scientist'
 'Autonomous Vehicle Technician' 'Applied M

In [33]:
# Report the dataset dimensions as a (rows, columns) tuple
print("\nShape of the dataset:", df.shape, sep="\n")


Shape of the dataset:
(3755, 11)


In [34]:
# Print a summary of the dataset: column names, non-null counts, dtypes and memory usage
print("\nInformation of the dataset:")
df.info()


Information of the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [35]:
# ---------------------------------------------------
# FUNCIONES DE MAPEO Y CATEGORIZACIÓN
# ---------------------------------------------------

# 1. Group job titles into 12 main categories (plus a fallback)
def categorize_job_title(title):
    """Map a raw job title to one of the notebook's coarse job categories.

    Rules are evaluated in order and the first match wins, so a title such as
    "Machine Learning Scientist" is caught by the scientist rule before the
    ML rule is ever considered.

    Long keywords ('scientist', 'engineer', ...) are matched as substrings of
    the lower-cased title. Short acronyms ('bi', 'ml', 'ai', 'nlp', 'etl') are
    matched as whole words only, so that "MLOps Engineer" is not read as "ML",
    "Big Data Analyst" is not read as "BI", and words that merely contain the
    letters "ai" are not read as "AI".

    Args:
        title: Job title text. Non-string values (None, NaN) are accepted.

    Returns:
        str: The category name, or 'Other Data Professional' when no rule
        matches or the title is not a string.
    """
    # Missing or non-text titles (e.g. NaN) cannot be classified.
    if not isinstance(title, str):
        return 'Other Data Professional'

    title = title.lower()

    # Whole words of the title (punctuation treated as a separator).
    words = set(''.join(ch if ch.isalnum() else ' ' for ch in title).split())
    acronyms = {'bi', 'ml', 'ai', 'nlp', 'etl'}

    def mentions(*terms):
        """Return True if the title contains any term (acronyms as whole words)."""
        return any(
            (term in words) if term in acronyms else (term in title)
            for term in terms
        )

    if mentions('scientist', 'science'):
        if mentions('lead', 'principal', 'staff', 'head', 'director'):
            return 'Senior Data Scientist'
        elif 'applied' in title:
            return 'Applied Data Scientist'
        else:
            return 'Data Scientist'

    elif mentions('analyst', 'analytics'):
        if mentions('bi', 'business', 'financial', 'marketing'):
            return 'Business Analyst'
        else:
            return 'Data Analyst'

    elif mentions('ml', 'machine learning'):
        # 'scientist' can never match here: such titles were already handled above.
        if mentions('researcher', 'research', 'scientist'):
            return 'ML Researcher'
        else:
            return 'ML Engineer'

    elif mentions('engineer', 'etl', 'infrastructure', 'cloud', 'database'):
        if 'data' in title:
            return 'Data Engineer'
        else:
            return 'MLOps/Infrastructure Engineer'

    elif mentions('head', 'director', 'lead', 'manager', 'management'):
        return 'Data Leadership'

    elif mentions('architect'):
        return 'Data Architect'

    elif mentions('computer vision', 'nlp', 'ai', 'deep learning', 'vision'):
        return 'AI Specialist'

    else:
        return 'Other Data Professional'

# 2. Basic lookup tables translating dataset codes into readable labels

# Seniority code -> label
experience_mapping = dict(EN='Entry-Level', MI='Mid-Level', SE='Senior', EX='Executive')

# Contract type code -> label
employment_mapping = dict(FT='Full-time', PT='Part-time', CT='Contract', FL='Freelance')

# remote_ratio value -> work setting
remote_mapping = {0: 'On-site', 50: 'Hybrid', 100: 'Remote'}

# Company size code -> label
company_size_mapping = dict(S='Small', M='Medium', L='Large')

# 3. Technology stack associated with each job category
def assign_tech_stack(job_category):
    """Return the comma-separated skills string listed for a job category.

    Categories without an entry in the table receive a generic default stack.
    """
    default_stack = 'Python, SQL, Data Analysis Tools'
    stack_by_category = {
        'Data Scientist': 'Python, R, SQL, Scikit-learn, Pandas, Statistical Analysis',
        'Senior Data Scientist': 'Python, R, SQL, Cloud, Advanced ML, Leadership',
        'Applied Data Scientist': 'Python, PyTorch/TensorFlow, SQL, Domain Expertise',
        'Data Analyst': 'SQL, Python, Excel, Tableau/PowerBI, Data Visualization',
        'Business Analyst': 'SQL, Excel, Tableau/PowerBI, Domain Knowledge, Statistics',
        'ML Engineer': 'Python, TensorFlow/PyTorch, MLOps, Docker, Git',
        'ML Researcher': 'Python, PyTorch, Research Methods, Mathematics, Publications',
        'Data Engineer': 'Python, SQL, Spark, Airflow, Cloud Platforms, ETL',
        'MLOps/Infrastructure Engineer': 'Docker, Kubernetes, CI/CD, Cloud, ML Platforms',
        'Data Leadership': 'Project Management, Team Leadership, Strategy, Stakeholder Management',
        'Data Architect': 'System Design, Cloud Architecture, Data Modeling, Governance',
        'AI Specialist': 'Python, Deep Learning, NLP/Computer Vision, Research Skills',
    }
    if job_category in stack_by_category:
        return stack_by_category[job_category]
    return default_stack

# 4. English level inferred from country and job category
def assign_english_level(country, job_category):
    """Return 'Native', 'Advanced' or 'Intermediate' for a country/role pair.

    The country is checked first: a native-English country always yields
    'Native'. Otherwise 'Advanced' is returned for a high-proficiency country
    or for a leadership/specialised job category, and 'Intermediate' in every
    remaining case.
    """
    native_countries = ('US', 'CA', 'GB', 'AU', 'NZ', 'IE')
    advanced_countries = ('NL', 'SE', 'DK', 'FI', 'NO', 'DE', 'SG', 'IL')
    # Leadership and specialised roles usually require a better level
    advanced_roles = ('Data Leadership', 'ML Researcher', 'Senior Data Scientist', 'AI Specialist')

    if country in native_countries:
        return 'Native'
    if country in advanced_countries or job_category in advanced_roles:
        return 'Advanced'
    return 'Intermediate'

# 5. Geographic grouping: country code -> region/continent
# Codes missing from this table are labelled 'Other' by transform_salary_dataset.
region_mapping = {
    'US': 'North America', 'CA': 'North America', 'MX': 'North America',
    'GB': 'Europe', 'DE': 'Europe', 'FR': 'Europe', 'ES': 'Europe', 'IT': 'Europe', 
    'NL': 'Europe', 'SE': 'Europe', 'CH': 'Europe', 'IE': 'Europe', 'AT': 'Europe',
    'BE': 'Europe', 'DK': 'Europe', 'FI': 'Europe', 'NO': 'Europe', 'PT': 'Europe',
    'PL': 'Europe', 'CZ': 'Europe', 'RO': 'Europe', 'GR': 'Europe', 'HU': 'Europe',
    'IN': 'Asia', 'CN': 'Asia', 'JP': 'Asia', 'SG': 'Asia', 'HK': 'Asia', 'MY': 'Asia',
    'TH': 'Asia', 'VN': 'Asia', 'ID': 'Asia', 'IL': 'Asia', 'TR': 'Asia',
    'AU': 'Oceania', 'NZ': 'Oceania',
    'BR': 'South America', 'AR': 'South America', 'CL': 'South America', 'CO': 'South America',
    'NG': 'Africa', 'ZA': 'Africa', 'KE': 'Africa', 'GH': 'Africa', 'EG': 'Africa'
}

# 6. Approximate cost-of-living index per country code (US is listed as 100)
# Codes missing from this table receive 70 in transform_salary_dataset.
# Note: 'IS', 'RU' and 'UA' appear here but not in region_mapping.
col_index = {
    'US': 100, 'CH': 135, 'NO': 125, 'DK': 120, 'IE': 115, 'IS': 113, 
    'GB': 105, 'CA': 95, 'AU': 105, 'NZ': 100, 'JP': 110, 'SE': 115, 'FI': 110,
    'DE': 90, 'FR': 90, 'ES': 75, 'IT': 80, 'PT': 65, 'GR': 65, 'SG': 110, 
    'HK': 115, 'IN': 30, 'BR': 45, 'MX': 40, 'CN': 50, 'RU': 45, 'ZA': 50,
    'NG': 35, 'KE': 40, 'PL': 55, 'UA': 35, 'TR': 40, 'RO': 45
}

# 7. Skill demand index per job category
# Categories missing from this table receive 8.0 in transform_salary_dataset.
# NOTE(review): 'ML Researcher' has no entry here although the other per-category
# tables list it — confirm whether the omission is intentional.
demand_index = {
    'ML Engineer': 9.5,
    'Data Scientist': 9.0,
    'Applied Data Scientist': 8.8,
    'Senior Data Scientist': 9.2,
    'Data Engineer': 9.7,
    'Data Analyst': 8.5,
    'Business Analyst': 8.0,
    'MLOps/Infrastructure Engineer': 9.4,
    'Data Leadership': 8.8,
    'Data Architect': 9.0,
    'AI Specialist': 9.6
}

# 8. How long each role has existed in the market
# Categories missing from this table are labelled 'Other' in transform_salary_dataset.
# NOTE(review): 'Applied Data Scientist' has no entry here although the other
# per-category tables list it — confirm whether the omission is intentional.
role_age_mapping = {
    'Data Scientist': 'Established',
    'Senior Data Scientist': 'Established',
    'Data Analyst': 'Traditional',
    'Business Analyst': 'Traditional',
    'Data Engineer': 'Established',
    'ML Engineer': 'Emerging',
    'MLOps/Infrastructure Engineer': 'New',
    'AI Specialist': 'Emerging',
    'ML Researcher': 'Specialized',
    'Data Leadership': 'Established',
    'Data Architect': 'Established'
}

# 9. Approximate career path per job category
# Categories missing from this table are labelled 'Other' in transform_salary_dataset.
career_path_mapping = {
    'Data Scientist': 'Technical',
    'Senior Data Scientist': 'Technical',
    'Applied Data Scientist': 'Technical',
    'ML Engineer': 'Technical',
    'ML Researcher': 'Research',
    'Data Engineer': 'Technical',
    'Data Analyst': 'Analytical',
    'Business Analyst': 'Business',
    'Data Leadership': 'Management',
    'Data Architect': 'Architecture',
    'AI Specialist': 'Research',
    'MLOps/Infrastructure Engineer': 'DevOps'
}

# 10. Degree of technical specialization per job category
# Categories missing from this table receive 6 in transform_salary_dataset.
tech_specialization = {
    'ML Engineer': 8,
    'ML Researcher': 9,
    'Data Scientist': 7,
    'Senior Data Scientist': 8,
    'Applied Data Scientist': 8.5,
    'Data Engineer': 7.5,
    'Data Analyst': 5,
    'Business Analyst': 4,
    'Data Leadership': 6,
    'Data Architect': 8,
    'AI Specialist': 9,
    'MLOps/Infrastructure Engineer': 8.5
}

# 11. Relationship of each job category to AI
# Categories missing from this table are labelled 'Undefined' in transform_salary_dataset.
ai_impact = {
    'Data Scientist': 'Augmented by AI',
    'Senior Data Scientist': 'Manages AI',
    'Applied Data Scientist': 'Develops AI',
    'ML Engineer': 'Develops AI',
    'ML Researcher': 'Creates New AI',
    'Data Engineer': 'Moderately Impacted',
    'Data Analyst': 'Highly Augmented',
    'Business Analyst': 'Moderately Augmented',
    'Data Leadership': 'Directs AI Strategy',
    'Data Architect': 'Moderately Impacted',
    'AI Specialist': 'Creates New AI',
    'MLOps/Infrastructure Engineer': 'Supports AI'
}

# ---------------------------------------------------
# FUNCIONES DE CÁLCULO
# ---------------------------------------------------

# 1. Salary competitiveness label relative to a reference salary
def calculate_salary_competitiveness(salary_usd, job_category, experience_level):
    """Label a salary by comparing it with a reference for the role and level.

    The reference is looked up by (job_category, experience_level), where the
    level is the descriptive label (e.g. 'Senior'); pairs without an entry use
    100000. The salary/reference ratio is then bucketed:
    >= 1.5 'Highly Competitive', >= 1.0 'Competitive', >= 0.8 'Average',
    anything else 'Below Average'.
    """
    # Reference salaries per (category, level)
    reference_salaries = {
        ('Data Scientist', 'Entry-Level'): 80000,
        ('Data Scientist', 'Mid-Level'): 120000,
        ('Data Scientist', 'Senior'): 160000,
        ('Senior Data Scientist', 'Senior'): 200000,
        ('ML Engineer', 'Mid-Level'): 140000,
        ('ML Engineer', 'Senior'): 180000,
        ('Data Analyst', 'Entry-Level'): 60000,
        ('Data Analyst', 'Mid-Level'): 90000,
        ('Data Engineer', 'Mid-Level'): 130000,
        ('Data Engineer', 'Senior'): 170000,
        ('Business Analyst', 'Mid-Level'): 85000,
        ('AI Specialist', 'Senior'): 190000,
        ('Data Leadership', 'Senior'): 200000,
        ('Data Leadership', 'Executive'): 250000,
    }
    reference = reference_salaries.get((job_category, experience_level), 100000)
    relative = salary_usd / reference

    # Thresholds are checked from highest to lowest; the first one reached wins.
    tiers = (
        (1.5, 'Highly Competitive'),
        (1.0, 'Competitive'),
        (0.8, 'Average'),
    )
    for floor, label in tiers:
        if relative >= floor:
            return label
    return 'Below Average'

# 2. Economic period label for a given year
def economic_period(year):
    """Return the economic period label for a work year.

    Years before 2020 are 'Pre-Pandemic', 2020 is 'Pandemic Start', 2021 is
    'Pandemic', and every other value is 'Post-Pandemic'.
    """
    if year < 2020:
        return 'Pre-Pandemic'
    pandemic_years = {2020: 'Pandemic Start', 2021: 'Pandemic'}
    return pandemic_years.get(year, 'Post-Pandemic')

# 3. Work-life balance index
def calculate_work_life_index(remote_ratio, emp_type):
    """Return a work-life score from the remote ratio and employment type.

    Remote points: 4 for 100, 3 for 50, otherwise 2.
    Employment points: 5 for 'FL' (freelance), 4 for 'PT' (part-time),
    3 for 'CT' (contract), otherwise 2 (treated as full-time).
    The result is half the sum of both, so it lies between 2.0 and 4.5.
    """
    remote_points = {100: 4, 50: 3}.get(remote_ratio, 2)
    employment_points = {'FL': 5, 'PT': 4, 'CT': 3}.get(emp_type, 2)
    return (remote_points + employment_points) / 2

# 4. Company sector inferred from size and location
def infer_company_sector(company_size, location):
    """Guess a company sector label from its size code and country code.

    Size 'L' and 'M' each have their own country lists; any other size value
    is handled by the small-company rule.
    """
    if company_size == 'L':
        if location in ('US', 'GB', 'CA', 'AU', 'SG', 'HK'):
            return 'Technology'
        if location in ('CH', 'DE', 'FR', 'JP'):
            return 'Manufacturing/Industry'
        return 'Services'

    if company_size == 'M':
        if location in ('US', 'GB', 'SG', 'IL', 'DE', 'SE'):
            return 'Technology'
        if location in ('IN', 'BR', 'MX'):
            return 'Services'
        return 'Mixed'

    # Small companies (and any unrecognised size code)
    is_startup_hub = location in ('US', 'GB', 'IL', 'SG')
    return 'Startup/Tech' if is_startup_hub else 'Services/Consulting'

# 5. Salary normalised by experience level and region
def normalize_by_experience_region(salary, experience, region):
    """Divide a salary by the product of an experience factor and a region factor.

    The experience factor is keyed by the level code ('EN', 'MI', 'SE', 'EX')
    and defaults to 1.0; the region factor is keyed by region name and
    defaults to 1.2.
    """
    experience_factors = {'EN': 0.6, 'MI': 1.0, 'SE': 1.4, 'EX': 2.0}
    region_factors = {
        'North America': 1.0,
        'Europe': 1.1,
        'Asia': 1.3,
        'Oceania': 1.0,
        'South America': 1.5,
        'Africa': 1.6,
    }
    divisor = experience_factors.get(experience, 1.0) * region_factors.get(region, 1.2)
    return salary / divisor

# 6. Estimated total compensation
def estimate_total_compensation(base_salary, company_size, company_location):
    """Scale a base salary by an equity factor and a bonus factor.

    The equity factor depends on the company size code (0 when unknown); the
    bonus factor depends on the company country code (0.05 when unlisted).
    """
    equity_by_size = {'S': 0.05, 'M': 0.1, 'L': 0.15}
    bonus_by_country = {
        'US': 0.15, 'GB': 0.1, 'DE': 0.08, 'CA': 0.12, 'AU': 0.1,
        'SG': 0.15, 'CH': 0.1, 'JP': 0.2, 'HK': 0.15,
    }

    equity_factor = equity_by_size.get(company_size, 0)
    bonus_factor = bonus_by_country.get(company_location, 0.05)
    multiplier = 1 + equity_factor + bonus_factor
    return base_salary * multiplier

# 7. Automation risk score
def automation_risk_score(job_category):
    """Return the automation risk score listed for a job category.

    Categories without an entry in the table score 5.0.
    """
    default_score = 5.0
    score_by_category = {
        'Data Analyst': 7.5,
        'Business Analyst': 6.5,
        'Data Scientist': 4.0,
        'ML Engineer': 3.0,
        'Senior Data Scientist': 3.5,
        'Applied Data Scientist': 3.8,
        'Data Engineer': 4.5,
        'Data Leadership': 2.0,
        'Data Architect': 3.0,
        'ML Researcher': 1.5,
        'AI Specialist': 2.0,
        'MLOps/Infrastructure Engineer': 3.5,
    }
    if job_category in score_by_category:
        return score_by_category[job_category]
    return default_score

# ---------------------------------------------------
# FUNCIÓN PRINCIPAL DE TRANSFORMACIÓN
# ---------------------------------------------------

def transform_salary_dataset(df):
    """Return an enriched copy of the salary dataset.

    The input frame is copied, never modified. The function reads the columns
    work_year, job_title, experience_level, employment_type, remote_ratio,
    company_size, company_location, employee_residence and salary_in_usd, and
    appends 26 derived columns (categories, descriptive labels, regional
    information, skill/profile attributes, salary metrics, indices and salary
    bands) using the mapping tables and helper functions defined in this cell.

    Args:
        df: DataFrame holding the raw salary records.

    Returns:
        A new DataFrame with the original columns followed by the derived ones.
    """
    # Work on a copy so the caller's frame is left untouched
    clean_df = df.copy()
    
    # STEP 1: Basic cleaning
    # Cast work_year to integer when it is not already int64
    if clean_df['work_year'].dtype != 'int64':
        clean_df['work_year'] = clean_df['work_year'].astype(int)
    
    # STEP 2: Categorisation and value mapping
    # Job categories derived from the raw title
    clean_df['job_category'] = clean_df['job_title'].apply(categorize_job_title)
    
    # Descriptive labels for the coded columns (codes missing from a table become NaN)
    clean_df['experience_level_desc'] = clean_df['experience_level'].map(experience_mapping)
    clean_df['employment_type_desc'] = clean_df['employment_type'].map(employment_mapping)
    clean_df['work_setting'] = clean_df['remote_ratio'].map(remote_mapping)
    clean_df['company_size_desc'] = clean_df['company_size'].map(company_size_mapping)
    
    # STEP 3: Regional and geographic information
    # region comes from the company location, residence_region from the employee residence
    clean_df['region'] = clean_df['company_location'].map(region_mapping).fillna('Other')
    clean_df['residence_region'] = clean_df['employee_residence'].map(region_mapping).fillna('Other')
    clean_df['domestic_employment'] = clean_df['employee_residence'] == clean_df['company_location']
    # Cost of living follows the employee's residence; unlisted countries get 70
    clean_df['cost_of_living_index'] = clean_df['employee_residence'].map(col_index).fillna(70)
    
    # STEP 4: Skills and professional profile
    clean_df['tech_stack'] = clean_df['job_category'].apply(assign_tech_stack)
    clean_df['english_level'] = clean_df.apply(
        lambda x: assign_english_level(x['employee_residence'], x['job_category']), 
        axis=1
    )
    # Per-category attributes; each fillna supplies the value for categories missing from its table
    clean_df['tech_specialization'] = clean_df['job_category'].map(tech_specialization).fillna(6)
    clean_df['role_maturity'] = clean_df['job_category'].map(role_age_mapping).fillna('Other')
    clean_df['career_path'] = clean_df['job_category'].map(career_path_mapping).fillna('Other')
    clean_df['ai_relationship'] = clean_df['job_category'].map(ai_impact).fillna('Undefined')
    clean_df['demand_index'] = clean_df['job_category'].map(demand_index).fillna(8.0)
    
    # STEP 5: Salary and contextual analysis
    # Economic period
    clean_df['economic_period'] = clean_df['work_year'].apply(economic_period)
    
    # Salary metrics
    # USD salary divided by the cost-of-living index expressed as a fraction of 100
    clean_df['adjusted_salary'] = clean_df['salary_in_usd'] / (clean_df['cost_of_living_index'] / 100)
    # USD salary divided by an ordinal rank of the experience code (EN=1 ... EX=4)
    clean_df['salary_to_experience_ratio'] = clean_df['salary_in_usd'] / clean_df['experience_level'].map(
        {'EN': 1, 'MI': 2, 'SE': 3, 'EX': 4}
    )
    # Uses the experience *code* and the company-location region
    clean_df['normalized_salary'] = clean_df.apply(
        lambda x: normalize_by_experience_region(
            x['salary_in_usd'], 
            x['experience_level'], 
            x['region']
        ), 
        axis=1
    )
    clean_df['total_compensation_estimate'] = clean_df.apply(
        lambda x: estimate_total_compensation(
            x['salary_in_usd'], 
            x['company_size'], 
            x['company_location']
        ),
        axis=1
    )
    
    # STEP 6: Analysis indices
    clean_df['work_life_balance'] = clean_df.apply(
        lambda x: calculate_work_life_index(x['remote_ratio'], x['employment_type']), 
        axis=1
    )
    clean_df['automation_risk'] = clean_df['job_category'].apply(automation_risk_score)
    clean_df['company_sector'] = clean_df.apply(
        lambda x: infer_company_sector(x['company_size'], x['company_location']),
        axis=1
    )
    
    # STEP 7: Salary bands
    # Quartile boundaries of the USD salary within this dataset
    salary_quantiles = clean_df['salary_in_usd'].quantile([0.25, 0.5, 0.75])
    
    # Use pd.cut() to assign each salary to one of four quartile-based bands
    # NOTE(review): pd.cut presumably requires distinct, increasing bin edges; if two
    # quantiles coincide (or the first quantile is 0) this call may raise — confirm
    # for small or heavily tied inputs.
    clean_df['salary_bracket'] = pd.cut(
        clean_df['salary_in_usd'],
        bins=[0, salary_quantiles[0.25], salary_quantiles[0.5], salary_quantiles[0.75], float('inf')],
        labels=['Low', 'Medium-Low', 'Medium-High', 'High'],
        include_lowest=True
    )
    
    # Salary competitiveness; note it receives the descriptive experience label, not the code
    clean_df['salary_competitiveness'] = clean_df.apply(
        lambda x: calculate_salary_competitiveness(
            x['salary_in_usd'], 
            x['job_category'], 
            x['experience_level_desc']
        ),
        axis=1
    )
    
    return clean_df

# ---------------------------------------------------
# EJECUCIÓN DEL PROCESO
# ---------------------------------------------------

# Apply all transformations and enrichments to the dataset
enriched_df = transform_salary_dataset(df)

# Show summary information about the enriched dataset
print("\nInformación del dataset enriquecido:")
enriched_df.info()

# Show the first rows of the enriched dataset (rendered as the cell's output)
print("\nPrimeras filas del dataset enriquecido:")
enriched_df.head()


Información del dataset enriquecido:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   work_year                    3755 non-null   int64   
 1   experience_level             3755 non-null   object  
 2   employment_type              3755 non-null   object  
 3   job_title                    3755 non-null   object  
 4   salary                       3755 non-null   int64   
 5   salary_currency              3755 non-null   object  
 6   salary_in_usd                3755 non-null   int64   
 7   employee_residence           3755 non-null   object  
 8   remote_ratio                 3755 non-null   int64   
 9   company_location             3755 non-null   object  
 10  company_size                 3755 non-null   object  
 11  job_category                 3755 non-null   object  
 12  experience_level_desc   

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,...,economic_period,adjusted_salary,salary_to_experience_ratio,normalized_salary,total_compensation_estimate,work_life_balance,automation_risk,company_sector,salary_bracket,salary_competitiveness
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,...,Post-Pandemic,114462.666667,28615.666667,55744.805195,103016.4,3.0,3.5,Services,Low,Below Average
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,...,Post-Pandemic,30000.0,15000.0,30000.0,36000.0,3.5,3.0,Startup/Tech,Low,Below Average
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,...,Post-Pandemic,25500.0,12750.0,25500.0,30600.0,3.5,3.0,Startup/Tech,Low,Below Average
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,...,Post-Pandemic,184210.526316,58333.333333,125000.0,213500.0,3.0,4.0,Mixed,Medium-High,Competitive
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,...,Post-Pandemic,126315.789474,40000.0,85714.285714,146400.0,3.0,4.0,Mixed,Medium-Low,Below Average


In [36]:
# Drop redundant or less relevant columns
columns_to_drop = [
    # Original codes replaced by descriptive versions
    'experience_level',      # Replaced by experience_level_desc
    'employment_type',       # Replaced by employment_type_desc
    'company_size',          # Replaced by company_size_desc
    
    # Redundant monetary data
    'salary',                # salary_in_usd already holds the amount in USD
    'salary_currency',       # No longer needed once salary_in_usd is used
    
    # Columns potentially less relevant for the analysis
    'remote_ratio',          # Replaced by work_setting
]

# Apply the removal (returns a new frame; enriched_df is left unchanged)
clean_df = enriched_df.drop(columns=columns_to_drop)

# Check the resulting dataset
print(f"Dimensiones originales: {enriched_df.shape}")
print(f"Dimensiones después de eliminar columnas: {clean_df.shape}")
print(f"Columnas eliminadas: {len(columns_to_drop)}")

# Save the final clean dataset without the index column
clean_df.to_csv('clean_salary_data.csv', index=False)
print("\nDataset limpio guardado como 'clean_salary_data.csv'")

Dimensiones originales: (3755, 37)
Dimensiones después de eliminar columnas: (3755, 31)
Columnas eliminadas: 6

Dataset limpio guardado como 'clean_salary_data.csv'
