In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from ydata_profiling import ProfileReport

In [19]:
df = pd.read_csv(r'D:\ansiedad\AnxietyLevelByCovidcsv')
df.head()

Unnamed: 0,sex,education_level,shift,marital_status,category,age_range,seniority_range,anxiety_level
0,female,bachelor,night a,domestic partnership,gen nurse,20 to 29,1 to 5,mild
1,female,graduate,morning,married,spec nurse,40 to 49,16 to 20,minimal
2,male,graduate,morning,single,spec nurse,30 to 39,11 to 15,mild
3,female,bachelor,morning,domestic partnership,spec nurse,40 to 49,16 to 20,moderate
4,female,graduate,morning,single,gen nurse,30 to 39,11 to 15,mild


In [20]:
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [21]:
for column in df.columns:
    print(f"\n{column.upper()}:")
    print(df[column].value_counts())
    print("-" * 50)


SEX:
sex
female    108
male       32
Name: count, dtype: int64
--------------------------------------------------

EDUCATION_LEVEL:
education_level
bachelor           98
technical level    31
graduate           11
Name: count, dtype: int64
--------------------------------------------------

SHIFT:
shift
morning      52
afternoon    50
night a      20
night b      18
Name: count, dtype: int64
--------------------------------------------------

MARITAL_STATUS:
marital_status
married                 58
single                  55
domestic partnership    21
widowed                  1
Name: count, dtype: int64
--------------------------------------------------

CATEGORY:
category
gen nurse     74
nurse aux     54
spec nurse     7
head nurse     5
Name: count, dtype: int64
--------------------------------------------------

AGE_RANGE:
age_range
30 to 39       66
20 to 29       34
40 to 49       31
50 and over     9
Name: count, dtype: int64
--------------------------------------------------


In [22]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

def codificar_variables(df):
    """
    Codifica las variables del DataFrame según el plan especificado.

    Args:
        df: DataFrame de pandas con los datos originales.

    Returns:
        DataFrame: DataFrame con las variables codificadas.
    """

    df_codificado = df.copy()

    # 1. anxiety_level: Codificación Ordinal (0, 1, 2, 3)
    anxiety_mapping = {
        "minimal": 0,
        "mild": 1,
        "moderate": 2,
        "severe": 3,
    }
    df_codificado["anxiety_level"] = df_codificado["anxiety_level"].map(anxiety_mapping)

    # 2. sex: Codificación Binaria (0 y 1)
    # Asumimos que 'female' es 0 y 'male' es 1.  Podría hacerse al revés.
    sex_mapping = {"female": 0, "male": 1}
    df_codificado["sex"] = df_codificado["sex"].map(sex_mapping)

    # 3. education_level: Codificación Ordinal (0, 1, 2)
    education_mapping = {
        "technical level": 0,
        "bachelor": 1,
        "graduate": 2,
    }
    df_codificado["education_level"] = df_codificado["education_level"].map(
        education_mapping
    )


    # 4. shift: One-Hot Encoding
    df_codificado = pd.get_dummies(
        df_codificado, columns=["shift"], prefix="shift", dummy_na=False
    )  # dummy_na=False evita crear una columna para valores NaN

    # 5. marital_status: One-Hot Encoding + Manejo de 'widowed'
    # Opción 1: Eliminar la fila con 'widowed'
    # df_codificado = df_codificado[df_codificado['marital_status'] != 'widowed']
    # Opción 2: Agrupar 'widowed' con otra categoría (e.g., 'single')
    df_codificado['marital_status'] = df_codificado['marital_status'].replace('widowed', 'single')
    df_codificado = pd.get_dummies(df_codificado, columns=['marital_status'], prefix='marital', dummy_na=False)



    # 6. category: One-Hot Encoding + Agrupación
    # Agrupamos 'spec nurse' y 'head nurse' en 'other'
    df_codificado["category"] = df_codificado["category"].replace(
        ["spec nurse", "head nurse"], "other"
    )
    df_codificado = pd.get_dummies(
        df_codificado, columns=["category"], prefix="category", dummy_na=False
    )

    # 7. age_range: Codificación Ordinal
    age_categories = ["20 to 29", "30 to 39", "40 to 49", "50 and over"]
    age_encoder = OrdinalEncoder(categories=[age_categories])
    df_codificado["age_range"] = age_encoder.fit_transform(
        df_codificado[["age_range"]]
    )
    df_codificado["age_range"] = df_codificado["age_range"].astype(int)

    # 8. seniority_range: Codificación Ordinal
    seniority_categories = [
        "1 to 5",
        "6 to 10",
        "11 to 15",
        "16 to 20",
        "21 and over",
    ]
    seniority_encoder = OrdinalEncoder(categories=[seniority_categories])
    df_codificado["seniority_range"] = seniority_encoder.fit_transform(
        df_codificado[["seniority_range"]]
    )
    df_codificado["seniority_range"] = df_codificado["seniority_range"].astype(int)

    return df_codificado


# Ejemplo de uso (asumiendo que tus datos están en un DataFrame llamado 'df'):
# Supongamos que 'df' es tu DataFrame original

#  Crea un pequeño DataFrame de ejemplo para demostrar el código
data = {
    'anxiety_level': ['minimal', 'mild', 'moderate', 'severe', 'minimal', 'mild'],
    'sex': ['female', 'male', 'female', 'male', 'female', 'female'],
    'education_level': ['bachelor', 'technical level', 'graduate', 'bachelor', 'technical level', 'bachelor'],
    'shift': ['morning', 'afternoon', 'night a', 'night b', 'morning', 'afternoon'],
    'marital_status': ['married', 'single', 'domestic partnership', 'widowed', 'married', 'single'],
    'category': ['gen nurse', 'nurse aux', 'spec nurse', 'head nurse', 'gen nurse', 'nurse aux'],
    'age_range': ['30 to 39', '20 to 29', '40 to 49', '50 and over', '30 to 39', '20 to 29'],
    'seniority_range': ['1 to 5', '6 to 10', '11 to 15', '16 to 20', '1 to 5', '6 to 10']
}
df = pd.DataFrame(data)



In [23]:

df_codificado = codificar_variables(df)
print(df_codificado)

# Ahora puedes aplicar técnicas de aumento de datos a df_codificado

   anxiety_level  sex  education_level  age_range  seniority_range  \
0              0    0                1          1                0   
1              1    1                0          0                1   
2              2    0                2          2                2   
3              3    1                1          3                3   
4              0    0                0          1                0   
5              1    0                1          0                1   

   shift_afternoon  shift_morning  shift_night a  shift_night b  \
0            False           True          False          False   
1             True          False          False          False   
2            False          False           True          False   
3            False          False          False           True   
4            False           True          False          False   
5             True          False          False          False   

   marital_domestic partnership  marita