In [8]:
import pandas as pd
from scipy.stats import kurtosis
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
file_path = './Práctica_one.csv'
data = pd.read_csv(file_path)
data

Unnamed: 0,Nombre,Edad,Formación académica,Antigüedad,Resultado de evaluación
0,Alvarado Zepeda Diana,51.0,Ingeniería Industrial,16.0,3.625
1,Bonilla Sandoval Martín,65.0,Licenciatura en sociología,9.0,6.070
2,Bonilla Sánchez Daniel,28.0,Lic. en matemáticas,15.0,4.630
3,Bravo Cortéz Marcos,53.0,Ingeniería en Mecatrónica,11.0,6.525
4,Bravo Gomez Pedro,33.0,Ingeniería en Mecatrónica,11.0,7.990
...,...,...,...,...,...
70,Ávila López Carmen,68.0,Ing. en mecatrónica,6.0,
71,Jiménez de Alba Enrique,58.0,,,5.755
72,Sánchez Pineda Ramón,36.0,,15.0,4.680
73,Pérez Jiménez Francisco,64.0,Ingeniería en Mecatrónica,,4.410


In [10]:
# Evaluar si para cada variable, los datos faltantes superan el 60% de las observaciones y eliminarlas si es necesario
missing_percentage = data.isnull().mean() * 100
columns_to_drop = missing_percentage[missing_percentage > 60].index
data_cleaned = data.drop(columns=columns_to_drop)

In [11]:
# Definir función para identificar si una variable tiene distribución uniforme (Curtosis menor a -1)
def is_uniform_distribution(series):
    return kurtosis(series.dropna()) < -1

In [12]:
# Definir función para detectar valores atípicos (outliers) usando el método del rango intercuartil
def has_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return ((series < lower_bound) | (series > upper_bound)).any()

In [13]:
# Sustituir los valores faltantes según los criterios especificados
for column in data_cleaned.columns:
    if data_cleaned[column].dtype == 'object':  # If col es categórica
        mode_value = data_cleaned[column].mode()[0]
        data_cleaned[column].fillna(mode_value, inplace=True)
    else:  # If col es numérica
        if is_uniform_distribution(data_cleaned[column]):
            # If distribución uniforme, sustituir por valor aleatorio
            random_value = np.random.choice(data_cleaned[column].dropna())
            data_cleaned[column].fillna(random_value, inplace=True)
        else:
            if has_outliers(data_cleaned[column]):
                # If valores atípicos, sustituir por la mediana
                median_value = data_cleaned[column].median()
                data_cleaned[column].fillna(median_value, inplace=True)
            else:
                # Else sustituir por la media
                mean_value = data_cleaned[column].mean()
                data_cleaned[column].fillna(mean_value, inplace=True)

data_cleaned

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_cleaned[column].fillna(random_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

Unnamed: 0,Nombre,Edad,Formación académica,Antigüedad,Resultado de evaluación
0,Alvarado Zepeda Diana,51.0,Ingeniería Industrial,16.000000,3.6250
1,Bonilla Sandoval Martín,65.0,Licenciatura en sociología,9.000000,6.0700
2,Bonilla Sánchez Daniel,28.0,Lic. en matemáticas,15.000000,4.6300
3,Bravo Cortéz Marcos,53.0,Ingeniería en Mecatrónica,11.000000,6.5250
4,Bravo Gomez Pedro,33.0,Ingeniería en Mecatrónica,11.000000,7.9900
...,...,...,...,...,...
70,Ávila López Carmen,68.0,Ing. en mecatrónica,6.000000,5.4375
71,Jiménez de Alba Enrique,58.0,Ingeniería Industrial,15.732394,5.7550
72,Sánchez Pineda Ramón,36.0,Ingeniería Industrial,15.000000,4.6800
73,Pérez Jiménez Francisco,64.0,Ingeniería en Mecatrónica,15.732394,4.4100


In [14]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data_cleaned['Formación académica'].fillna(''))

# Convertir la matriz TfIdf a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenar el DataFrame original con el nuevo DataFrame de TfIdf
data_cleaned = pd.concat([data_cleaned, tfidf_df], axis=1)
data_cleaned.drop(columns=['Formación académica'], inplace=True)

data_cleaned.head()

Unnamed: 0,Nombre,Edad,Antigüedad,Resultado de evaluación,administración,agronomía,civil,computación,comunicaciones,contaduría,...,ing,ingeniería,lic,licenciatura,matemáticas,mecatrónica,mecánica,psicología,sociología,topografía
0,Alvarado Zepeda Diana,51.0,16.0,3.625,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.628832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bonilla Sandoval Martín,65.0,9.0,6.07,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.546169,0.0,0.0,0.0,0.0,0.788128,0.0
2,Bonilla Sánchez Daniel,28.0,15.0,4.63,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.558443,0.0,0.780477,0.0,0.0,0.0,0.0,0.0
3,Bravo Cortéz Marcos,53.0,11.0,6.525,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.481888,0.0,0.0,0.0,0.772436,0.0,0.0,0.0,0.0
4,Bravo Gomez Pedro,33.0,11.0,7.99,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.481888,0.0,0.0,0.0,0.772436,0.0,0.0,0.0,0.0


No venía el requisito de colocar conclusiones pero me gustaría decir que fue un buen ejercicio porque utiliza varios conceptos vistos y creo que devuelve un producto bastante consistente. No estoy seguro de si hice correctamente la transformación tfidf, pero así decía la documentación.