In [3]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
import re

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames
pd.set_option('display.max_rows', None) # ver todas las filas

In [None]:
df1 = pd.read_csv("StudentsPerformance.csv")
df2 = pd.read_csv("xAPI-Edu-Data.csv")

In [None]:
# Funcion para realizar un EDA Básico
def eda_basico(df):
    # 🌷 Mostrar ejemplos de datos para tener una primera idea
    print('🌷 Ejemplo de datos del DF:')
    display(df.head(3))    # Primeras 3 filas
    display(df.tail(3))    # Últimas 3 filas
    display(df.sample(3))  # 3 filas aleatorias
    print('─' * 100)

    # 🌻 Número total de filas (observaciones)
    print('🌻 Número de filas:')
    print(df.shape[0])
    print('─' * 100)

    # 🌱 Número total de columnas (variables)
    print('🌱 Número de columnas:')
    print(df.shape[1])
    print('─' * 100)

    # 🌼 Información general del DataFrame (tipos, nulos, etc.)
    print('🌼 Información de la tabla:')
    display(df.info())
    print('─' * 100)

    # 🌑 Lista con los nombres de las columnas
    print('🌑 Nombre de las columnas:')
    print(df.columns.tolist())
    print('─' * 100)

    # 🍄 Estadísticas descriptivas de columnas numéricas
    print('🍄 Descripción de los datos numéricos:')
    display(df.describe().T)
    print('─' * 100)

    # 🌋 Estadísticas descriptivas de columnas categóricas (si existen)
    print('🌋 Descripción de los datos no-numéricos:')
    if not df.select_dtypes(include='object').empty:
        display(df.describe(include='object').T)
    else:
        print('📭 No hay columnas tipo "object" en este DataFrame.')
    print('─' * 100)

    # 🍂 Cuántos valores únicos hay en cada columna
    print('🍂 Número de valores únicos por columna:')
    display(df.nunique())
    print('─' * 100)

    # 🐖 Cuántos valores nulos hay en cada columna
    print('🐖 Valores nulos por columna:')
    display(df.isnull().sum())
    print('─' * 100)

    # 🐲 Revisión de filas duplicadas
    print('🐲 Filas duplicadas:')
    duplicados = df.duplicated().sum()
    if duplicados > 0:
        print(f'📛 Hay {duplicados} filas duplicadas.')
        display(df[df.duplicated()].head(3))
    else:
        print('✅ No hay filas duplicadas.')
    print('─' * 100)

    # 🪹 Columnas constantes (con solo un valor único)
    print('🪹 Columnas constantes (solo un valor):')
    constantes = df.columns[df.nunique() <= 1]
    if not constantes.empty:
        print(f'📌 {len(constantes)} columnas con solo un valor:')
        display(constantes.tolist())
    else:
        print('✅ No hay columnas constantes.')
    print('─' * 100)

    # 🚀 Ver valores únicos de columnas categóricas (si existen)
    print('🚀 Valores únicos en columnas categóricas:')
    cat_cols = df.select_dtypes(include='object')
    if not cat_cols.empty:
        for col in cat_cols.columns:
            print(f'🔸 {col}')
            print(cat_cols[col].unique())
            print('─' * 60)
    else:
        print('📭 No hay columnas categóricas.')
    print('─' * 100)

    # 🧬 Resumen de tipos de datos presentes en el DataFrame
    print('🧬 Tipos de datos por columna:')
    display(df.dtypes.value_counts())
    print('─' * 100)

In [7]:
eda_basico(df1)

🌷 Ejemplo de datos del DF:


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77
999,female,group D,some college,free/reduced,none,77,86,86


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
22,male,group D,some college,standard,none,44,54,53
321,female,group E,high school,standard,none,75,86,79
85,female,group C,some college,standard,none,73,80,82


────────────────────────────────────────────────────────────────────────────────────────────────────
🌻 Número de filas:
1000
────────────────────────────────────────────────────────────────────────────────────────────────────
🌱 Número de columnas:
8
────────────────────────────────────────────────────────────────────────────────────────────────────
🌼 Información de la tabla:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7  

None

────────────────────────────────────────────────────────────────────────────────────────────────────
🌑 Nombre de las columnas:
['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score']
────────────────────────────────────────────────────────────────────────────────────────────────────
🍄 Descripción de los datos numéricos:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
math score,1000.0,66.089,15.16308,0.0,57.0,66.0,77.0,100.0
reading score,1000.0,69.169,14.600192,17.0,59.0,70.0,79.0,100.0
writing score,1000.0,68.054,15.195657,10.0,57.75,69.0,79.0,100.0


────────────────────────────────────────────────────────────────────────────────────────────────────
🌋 Descripción de los datos no-numéricos:


Unnamed: 0,count,unique,top,freq
gender,1000,2,female,518
race/ethnicity,1000,5,group C,319
parental level of education,1000,6,some college,226
lunch,1000,2,standard,645
test preparation course,1000,2,none,642


────────────────────────────────────────────────────────────────────────────────────────────────────
🍂 Número de valores únicos por columna:


gender                          2
race/ethnicity                  5
parental level of education     6
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

────────────────────────────────────────────────────────────────────────────────────────────────────
🐖 Valores nulos por columna:


gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

────────────────────────────────────────────────────────────────────────────────────────────────────
🐲 Filas duplicadas:
✅ No hay filas duplicadas.
────────────────────────────────────────────────────────────────────────────────────────────────────
🪹 Columnas constantes (solo un valor):
✅ No hay columnas constantes.
────────────────────────────────────────────────────────────────────────────────────────────────────
🚀 Valores únicos en columnas categóricas:
🔸 gender
['female' 'male']
────────────────────────────────────────────────────────────
🔸 race/ethnicity
['group B' 'group C' 'group A' 'group D' 'group E']
────────────────────────────────────────────────────────────
🔸 parental level of education
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
────────────────────────────────────────────────────────────
🔸 lunch
['standard' 'free/reduced']
────────────────────────────────────────────────────────────
🔸 test preparation cour

object    5
int64     3
Name: count, dtype: int64

────────────────────────────────────────────────────────────────────────────────────────────────────


In [8]:
eda_basico(df2)

🌷 Ejemplo de datos del DF:


Unnamed: 0,Gender,Nationality,Place of birth,Educational Stages,Grade Levels,Section ID,Topic,Semester,Parent responsible for student,Raised hand,Visited resources,Viewing announcements,Discussion groups,Parent Answering Survey,ParentschoolSatisfaction,StudentAbsenceDays,Total
0,Male,KuwaIT,KuwaIT,lowerlevel,G-04,A,IT,First,Father,15,16,2,20,Yes,Good,Under-7,Middle-Level
1,Male,KuwaIT,KuwaIT,lowerlevel,G-04,A,IT,First,Father,20,20,3,25,Yes,Good,Under-7,Middle-Level
2,Male,KuwaIT,KuwaIT,lowerlevel,G-04,A,IT,First,Father,10,7,0,30,No,Bad,Above-7,Low-Level


Unnamed: 0,Gender,Nationality,Place of birth,Educational Stages,Grade Levels,Section ID,Topic,Semester,Parent responsible for student,Raised hand,Visited resources,Viewing announcements,Discussion groups,Parent Answering Survey,ParentschoolSatisfaction,StudentAbsenceDays,Total
477,Female,Jordan,Jordan,MiddleSchool,G-08,A,Geology,Second,Father,55,74,25,29,No,Bad,Under-7,Middle-Level
478,Female,Jordan,Jordan,MiddleSchool,G-08,A,History,First,Father,30,17,14,57,No,Bad,Above-7,Low-Level
479,Female,Jordan,Jordan,MiddleSchool,G-08,A,History,Second,Father,35,14,23,62,No,Bad,Above-7,Low-Level


Unnamed: 0,Gender,Nationality,Place of birth,Educational Stages,Grade Levels,Section ID,Topic,Semester,Parent responsible for student,Raised hand,Visited resources,Viewing announcements,Discussion groups,Parent Answering Survey,ParentschoolSatisfaction,StudentAbsenceDays,Total
56,Male,USA,USA,MiddleSchool,G-08,B,Math,First,Father,19,5,4,1,Yes,Good,Above-7,Low-Level
103,"Male,",KuwaIT,KuwaIT,lowerlevel,G-02,B,IT,First,Father,1,7,6,10,No,Bad,Above-7,Low-Level
346,Female,Jordan,Jordan,lowerlevel,G-02,B,French,Second,mom,24,97,15,14,No,Good,Under-7,High-Level


────────────────────────────────────────────────────────────────────────────────────────────────────
🌻 Número de filas:
480
────────────────────────────────────────────────────────────────────────────────────────────────────
🌱 Número de columnas:
17
────────────────────────────────────────────────────────────────────────────────────────────────────
🌼 Información de la tabla:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 480 entries, 0 to 479
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Gender                          480 non-null    object
 1   Nationality                     480 non-null    object
 2   Place of birth                  480 non-null    object
 3   Educational Stages              480 non-null    object
 4   Grade Levels                    480 non-null    object
 5   Section ID                      480 non-null    object
 6   Topic                           

None

────────────────────────────────────────────────────────────────────────────────────────────────────
🌑 Nombre de las columnas:
['Gender', 'Nationality', 'Place of birth', 'Educational Stages ', 'Grade Levels', 'Section ID', 'Topic', 'Semester', 'Parent responsible for student', 'Raised hand', 'Visited resources', 'Viewing announcements', 'Discussion groups', 'Parent Answering Survey', 'ParentschoolSatisfaction', 'StudentAbsenceDays', 'Total']
────────────────────────────────────────────────────────────────────────────────────────────────────
🍄 Descripción de los datos numéricos:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Raised hand,480.0,46.775,30.779223,0.0,15.75,50.0,75.0,100.0
Visited resources,480.0,54.797917,33.080007,0.0,20.0,65.0,84.0,99.0
Viewing announcements,480.0,37.91875,26.611244,0.0,14.0,33.0,58.0,98.0
Discussion groups,480.0,43.283333,27.637735,1.0,20.0,39.0,70.0,99.0


────────────────────────────────────────────────────────────────────────────────────────────────────
🌋 Descripción de los datos no-numéricos:


Unnamed: 0,count,unique,top,freq
Gender,480,4,"Male,",160
Nationality,480,14,KuwaIT,179
Place of birth,480,14,KuwaIT,180
Educational Stages,480,3,MiddleSchool,248
Grade Levels,480,10,G-02,147
Section ID,480,3,A,283
Topic,480,12,IT,95
Semester,480,3,First,244
Parent responsible for student,480,2,Father,283
Parent Answering Survey,480,2,Yes,270


────────────────────────────────────────────────────────────────────────────────────────────────────
🍂 Número de valores únicos por columna:


Gender                             4
Nationality                       14
Place of birth                    14
Educational Stages                 3
Grade Levels                      10
Section ID                         3
Topic                             12
Semester                           3
Parent responsible for student     2
Raised hand                       82
Visited resources                 89
Viewing announcements             88
Discussion groups                 90
Parent Answering Survey            2
ParentschoolSatisfaction           2
StudentAbsenceDays                 2
Total                              4
dtype: int64

────────────────────────────────────────────────────────────────────────────────────────────────────
🐖 Valores nulos por columna:


Gender                            0
Nationality                       0
Place of birth                    0
Educational Stages                0
Grade Levels                      0
Section ID                        0
Topic                             0
Semester                          0
Parent responsible for student    0
Raised hand                       0
Visited resources                 0
Viewing announcements             0
Discussion groups                 0
Parent Answering Survey           0
ParentschoolSatisfaction          0
StudentAbsenceDays                0
Total                             0
dtype: int64

────────────────────────────────────────────────────────────────────────────────────────────────────
🐲 Filas duplicadas:
📛 Hay 2 filas duplicadas.


Unnamed: 0,Gender,Nationality,Place of birth,Educational Stages,Grade Levels,Section ID,Topic,Semester,Parent responsible for student,Raised hand,Visited resources,Viewing announcements,Discussion groups,Parent Answering Survey,ParentschoolSatisfaction,StudentAbsenceDays,Total
326,"Male,",Jordan,Jordan,lowerlevel,G-02,A,French,First,Father,10,15,10,21,No,Bad,Above-7,Low-Level
327,"Male,",Jordan,Jordan,lowerlevel,G-02,A,French,Second,Father,30,10,20,5,No,Bad,Above-7,Low-Level


────────────────────────────────────────────────────────────────────────────────────────────────────
🪹 Columnas constantes (solo un valor):
✅ No hay columnas constantes.
────────────────────────────────────────────────────────────────────────────────────────────────────
🚀 Valores únicos en columnas categóricas:
🔸 Gender
['Male' 'Female' 'Male,' 'Female,']
────────────────────────────────────────────────────────────
🔸 Nationality
['KuwaIT' 'lebanon' 'Egypt' 'SaudiArabia' 'USA' 'Jordan' 'venzuela' 'Iran'
 'Tunis' 'Morocco' 'Syria' 'Palestine' 'Iraq' 'Lybia']
────────────────────────────────────────────────────────────
🔸 Place of birth
['KuwaIT' 'lebanon' 'Egypt' 'SaudiArabia' 'USA' 'Jordan' 'venzuela' 'Iran'
 'Tunis' 'Morocco' 'Syria' 'Iraq' 'Palestine' 'Lybia']
────────────────────────────────────────────────────────────
🔸 Educational Stages 
['lowerlevel' 'MiddleSchool' 'HighSchool']
────────────────────────────────────────────────────────────
🔸 Grade Levels
['G-04' 'G-07' 'G-08' 'G-06

object    13
int64      4
Name: count, dtype: int64

────────────────────────────────────────────────────────────────────────────────────────────────────
