# University Student Dropout: A Longitudinal Dataset of Demographic, Socioeconomic, and Academic Indicators
In this notebook I walk through the data provided by this research and run a basic training pass before splitting the code into final executables.


In [1]:
import pandas as pd
import re


In [2]:
# Lift pandas' display limits (None = unlimited) so wide frames and long
# cell values are shown without truncation.
for option_name in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(option_name, None)

## Loading datasets for 2021-2022
The goal is to compare them and merge them into a single dataset covering two years.
I decided to focus on the two most recent datasets.

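The monthly columns carry the calendar year and month in their names, e.g. 'pft_events_2021_10'.
They are renamed to a cohort-relative format (September = m1, e.g. 'pft_events_m2') so that both years share the same column names and can be merged.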

In [3]:

# Both yearly exports are read with the same parser settings: semicolon
# separator, and low_memory=False so each file is parsed in a single pass.
csv_options = dict(sep=';', low_memory=False)

df_2021 = pd.read_csv("../data/raw/dataset_2021_hash.csv", **csv_options)
df_2022 = pd.read_csv("../data/raw/dataset_2022_hash.csv", **csv_options)

In [4]:
# Work on the column names as sets so the two years can be compared directly.
cols_2021 = set(df_2021.columns)
cols_2022 = set(df_2022.columns)

only_in_2021 = cols_2021 - cols_2022
only_in_2022 = cols_2022 - cols_2021
n_shared = len(cols_2021 & cols_2022)

# Overall size of each frame first, then the column-level differences.
print(f"Shapes: 2021:{df_2021.shape}; 2022:{df_2022.shape}")
print(f"Columns: 2021: {len(df_2021.columns)}; 2022:{len(df_2022.columns)}")
print(f"Columns in 2021 but not 2022: {only_in_2021}")
print(f"Columns in 2022 but not 2021: {only_in_2022}")
print(f"Shared columns: {n_shared}")

Shapes: 2021:(153120, 178); 2022:(159173, 169)
Columns: 2021: 178; 2022:169
Columns in 2021 but not 2022: {'pft_visits_2022_5', 'pft_visits_2022_4', 'pft_events_2022_3', 'resource_events_2021_10', 'n_resource_days_2021_10', 'pft_events_2022_1', 'n_resource_days_2022_1', 'pft_visits_2022_1', 'n_resource_days_2021_11', 'pft_test_submissions_2021_12', 'pft_visits_2022_3', 'pft_events_2022_6', 'n_resource_days_2022_4', 'n_resource_days_2022_8', 'pft_days_logged_2021_12', 'pft_total_minutes_2022_7', 'pft_visits_2021_12', 'resource_events_2022_1', 'pft_assignment_submissions_2022_6', 'pft_visits_2021_11', 'pft_total_minutes_2022_2', 'resource_events_2022_3', 'pft_assignment_submissions_2022_8', 'pft_days_logged_2021_11', 'pft_days_logged_2022_3', 'n_wifi_days_2021_10', 'pft_assignment_submissions_2021_10', 'pft_assignment_submissions_2022_2', 'pft_test_submissions_2022_8', 'resource_events_2022_4', 'pft_total_minutes_2021_10', 'pft_test_submissions_2022_6', 'n_wifi_days_2022_8', 'pft_events_

In [5]:
def align_academic_months(df, cohort_year):
    """
    Rename calendar-month column suffixes to academic-month suffixes.

    A column such as 'pft_events_2021_10' becomes 'pft_events_m2' when
    cohort_year is 2021: September of the cohort year is month 1 and
    August of the following year is month 12.

    Only columns whose '_<YYYY>_<M>' suffix lies inside the cohort's
    academic year (September of `cohort_year` through August of
    `cohort_year + 1`) are renamed. A column from a different year, or
    with a month outside 1-12, keeps its original name so that it cannot
    be folded onto an existing '_m<N>' label and create duplicates.

    Args:
        df: pandas DataFrame whose column labels may end in '_<YYYY>_<M>'.
        cohort_year: calendar year in which the academic year starts.

    Returns:
        A new DataFrame with the renamed columns; `df` itself is not modified.
    """
    cohort_year = int(cohort_year)
    new_cols = {}
    for col in df.columns:
        # Non-string labels cannot carry a '_YYYY_M' suffix.
        if not isinstance(col, str):
            continue

        # Match a trailing pattern like _2021_9 or _2022_1
        match = re.search(r'_(\d{4})_(\d{1,2})$', col)
        if not match:
            continue

        year = int(match.group(1))
        month = int(match.group(2))

        # Academic year window: Sept-Dec of cohort_year, Jan-Aug of the next year.
        in_first_term = year == cohort_year and 9 <= month <= 12
        in_second_term = year == cohort_year + 1 and 1 <= month <= 8
        if not (in_first_term or in_second_term):
            continue

        # Relative month (Sept=1, Oct=2... Aug=12)
        rel_month = (month - 9) % 12 + 1

        base_name = col[:match.start()]
        new_cols[col] = f"{base_name}_m{rel_month}"

    return df.rename(columns=new_cols)


In [7]:
# Replace calendar-month suffixes with academic-month suffixes so both
# cohorts use the same column names.
df_2021 = align_academic_months(df_2021, 2021)
df_2022 = align_academic_months(df_2022, 2022)

print(f"Shapes: 2021:{df_2021.shape}; 2022:{df_2022.shape}")
print(f"Columns: 2021: {len(df_2021.columns)}; 2022:{len(df_2022.columns)}")

# 2021 set has 9 more columns which we drop by keeping only the shared ones.
# The list is built in df_2021's column order: iterating a set of strings
# yields a different order on every kernel start (hash randomisation), which
# would make the column order of the merged frame non-reproducible.
cols_2022 = set(df_2022.columns)
shared_cols = [col for col in df_2021.columns if col in cols_2022]

df_2021 = df_2021[shared_cols]
df_2022 = df_2022[shared_cols]

# Sanity check: both frames must now expose exactly the same columns.
cols_2021 = set(df_2021.columns)
cols_2022 = set(df_2022.columns)

print(f"Shared columns: {len(cols_2021.intersection(cols_2022))}")



Shapes: 2021:(153120, 169); 2022:(159173, 169)
Columns: 2021: 169; 2022:169
Shared columns: 169


### Merge both datasets now

In [None]:

# Cohort year indicator, so each row's origin survives the merge.
# .assign() returns new frames instead of writing a column into df_2021 /
# df_2022 in place; those frames are column selections of the loaded data,
# and in-place assignment on them raises pandas' SettingWithCopyWarning.
df_2021 = df_2021.assign(cohort_year=2021)
df_2022 = df_2022.assign(cohort_year=2022)

# Stack both cohorts; ignore_index gives the result a fresh 0..n-1 index.
df_combined = pd.concat([df_2021, df_2022], ignore_index=True)
print(f"Combined shape: {df_combined.shape}")

Combined shape: (312293, 170)


## Data analysis and cleaning

Examining data and preparing it for use in XGBoost

In [4]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df_2022.info()

# Count the nulls once and reuse the result for the absolute and relative views.
missing_counts = df_2022.isnull().sum()

print(f"\nHow many missing values per column, given {df_2022.shape[0]} rows")
print(missing_counts.to_string())
print()

# Calculate percentage of missing values
missing_percent = (missing_counts / len(df_2022)) * 100
print("Percentage of missing values per column:")
print(missing_percent.sort_values(ascending=False).to_string())

df_2022.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153120 entries, 0 to 153119
Columns: 178 entries, dni_hash to n_resource_days_2022_8
dtypes: float64(1), int64(7), object(170)
memory usage: 207.9+ MB
None

How many missing values per column, given 153120 rows
dni_hash                                   0
tit_hash                                   0
asi_hash                                   0
anyo_ingreso                           37358
tipo_ingreso                           37374
nota10_hash                            41810
nota14_hash                            40714
campus_hash                                0
estudios_p_hash                         1081
estudios_m_hash                         1081
dedicacion                                 0
desplazado_hash                            0
abandono_hash                              0
preferencia_seleccion                  38408
baja_fecha                            137357
caca                                       0
grupos_por_tipocred

Unnamed: 0,dni_hash,tit_hash,asi_hash,anyo_ingreso,tipo_ingreso,nota10_hash,nota14_hash,campus_hash,estudios_p_hash,estudios_m_hash,dedicacion,desplazado_hash,abandono_hash,preferencia_seleccion,baja_fecha,caca,grupos_por_tipocredito_hash,matricula_activa,nota_asig_hash,fecha_datos,curso_mas_bajo,curso_mas_alto,cred_mat1,cred_mat2,cred_mat3,cred_mat4,cred_mat5,cred_mat6,cred_sup_normal,cred_sup_espec,cred_sup,cred_mat_normal,cred_mat_movilidad,cred_ptes_acta,cred_mat_practicas,cred_mat_sem_a,cred_mat_sem_b,cred_mat_anu,cred_mat_total,cred_sup_sem_a,cred_sup_sem_b,cred_sup_anu,cred_sup_total,rendimiento_cuat_a,rendimiento_cuat_b,rendimiento_total,exento_npp,anyo_inicio_estudios,es_retitulado,es_adaptado,cred_sup_1o,cred_sup_2o,cred_sup_3o,cred_sup_4o,cred_sup_5o,cred_sup_6o,practicas,actividades,ajuste,cred_sup_tit,cred_pend_sup_tit,impagado_curso_mat,asig1,pract1,activ1,total1,ajuste1,rend_total_ultimo,rend_total_penultimo,rend_total_antepenultimo,pft_events_2021_9,pft_days_logged_2021_9,pft_visits_2021_9,pft_assignment_submissions_2021_9,pft_test_submissions_2021_9,pft_total_minutes_2021_9,n_wifi_days_2021_9,resource_events_2021_9,n_resource_days_2021_9,pft_events_2021_10,pft_days_logged_2021_10,pft_visits_2021_10,pft_assignment_submissions_2021_10,pft_test_submissions_2021_10,pft_total_minutes_2021_10,n_wifi_days_2021_10,resource_events_2021_10,n_resource_days_2021_10,pft_events_2021_11,pft_days_logged_2021_11,pft_visits_2021_11,pft_assignment_submissions_2021_11,pft_test_submissions_2021_11,pft_total_minutes_2021_11,n_wifi_days_2021_11,resource_events_2021_11,n_resource_days_2021_11,pft_events_2021_12,pft_days_logged_2021_12,pft_visits_2021_12,pft_assignment_submissions_2021_12,pft_test_submissions_2021_12,pft_total_minutes_2021_12,n_wifi_days_2021_12,resource_events_2021_12,n_resource_days_2021_12,pft_events_2022_1,pft_days_logged_2022_1,pft_visits_2022_1,pft_assignment_submissions_2022_1,pft_test_submissions_2022_1,pft_total_minutes_2022_1,n_wifi_days_2022_1,re
source_events_2022_1,n_resource_days_2022_1,pft_events_2022_2,pft_days_logged_2022_2,pft_visits_2022_2,pft_assignment_submissions_2022_2,pft_test_submissions_2022_2,pft_total_minutes_2022_2,n_wifi_days_2022_2,resource_events_2022_2,n_resource_days_2022_2,pft_events_2022_3,pft_days_logged_2022_3,pft_visits_2022_3,pft_assignment_submissions_2022_3,pft_test_submissions_2022_3,pft_total_minutes_2022_3,n_wifi_days_2022_3,resource_events_2022_3,n_resource_days_2022_3,pft_events_2022_4,pft_days_logged_2022_4,pft_visits_2022_4,pft_assignment_submissions_2022_4,pft_test_submissions_2022_4,pft_total_minutes_2022_4,n_wifi_days_2022_4,resource_events_2022_4,n_resource_days_2022_4,pft_events_2022_5,pft_days_logged_2022_5,pft_visits_2022_5,pft_assignment_submissions_2022_5,pft_test_submissions_2022_5,pft_total_minutes_2022_5,n_wifi_days_2022_5,resource_events_2022_5,n_resource_days_2022_5,pft_events_2022_6,pft_days_logged_2022_6,pft_visits_2022_6,pft_assignment_submissions_2022_6,pft_test_submissions_2022_6,pft_total_minutes_2022_6,n_wifi_days_2022_6,resource_events_2022_6,n_resource_days_2022_6,pft_events_2022_7,pft_days_logged_2022_7,pft_visits_2022_7,pft_assignment_submissions_2022_7,pft_test_submissions_2022_7,pft_total_minutes_2022_7,n_wifi_days_2022_7,resource_events_2022_7,n_resource_days_2022_7,pft_events_2022_8,pft_days_logged_2022_8,pft_visits_2022_8,pft_assignment_submissions_2022_8,pft_test_submissions_2022_8,pft_total_minutes_2022_8,n_wifi_days_2022_8,resource_events_2022_8,n_resource_days_2022_8
0,319636fc9270,620c9c332101,378cc790fd99,20120,NAP,,9456,e4f95d56d90df35e,F,L,TC,A,B,,,2021,e7d0293bc1c5cb4a,10,55,2023-06-28 14:19:30,3,3,0,0,420,0,0,0,375,0,375,420,0,0,0,105,315,0,420,105,270,0,375,1000,8571,8929,,2012,,,600,600,555,0,0,0,0,366,,20766,3234,,375,,,,0,1000,1000,1000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,,,,,,40,,,,,,,,,40,,,,,,,,,10,,,,,,,,,40,,,,,,,,,30,,,,,,,,,,,,,,,,,,,,
1,319636fc9270,620c9c332101,b2677d6af4ae,20120,NAP,,9456,e4f95d56d90df35e,F,L,TC,A,B,,,2021,b9d618bf3e564eb9,10,60,2023-06-28 14:19:30,3,3,0,0,420,0,0,0,375,0,375,420,0,0,0,105,315,0,420,105,270,0,375,1000,8571,8929,,2012,,,600,600,555,0,0,0,0,366,,20766,3234,,375,,,,0,1000,1000,1000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,,,,,,40,,,,,,,,,40,,,,,,,,,10,,,,,,,,,40,,,,,,,,,30,,,,,,,,,,,,,,,,,,,,
2,319636fc9270,620c9c332101,4596fcf257c4,20120,NAP,,9456,e4f95d56d90df35e,F,L,TC,A,B,,,2021,05bf3985b2c32c01,10,7,2023-06-28 14:19:30,3,3,0,0,420,0,0,0,375,0,375,420,0,0,0,105,315,0,420,105,270,0,375,1000,8571,8929,,2012,,,600,600,555,0,0,0,0,366,,20766,3234,,375,,,,0,1000,1000,1000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,,,,,,40,,,,,,,,,40,,,,,,,,,10,,,,,,,,,40,,,,,,,,,30,,,,,,,,,,,,,,,,,,,,
3,319636fc9270,620c9c332101,64ce47d15992,20120,NAP,,9456,e4f95d56d90df35e,F,L,TC,A,B,,,2021,331e7a195c0445d7,10,68,2023-06-28 14:19:30,3,3,0,0,420,0,0,0,375,0,375,420,0,0,0,105,315,0,420,105,270,0,375,1000,8571,8929,,2012,,,600,600,555,0,0,0,0,366,,20766,3234,,375,,,,0,1000,1000,1000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,,,,,,40,,,,,,,,,40,,,,,,,,,10,,,,,,,,,40,,,,,,,,,30,,,,,,,,,,,,,,,,,,,,
4,319636fc9270,620c9c332101,f8e5affd8ab2,20120,NAP,,9456,e4f95d56d90df35e,F,L,TC,A,B,,,2021,a3cf9c33ba52ecd8,10,81,2023-06-28 14:19:30,3,3,0,0,420,0,0,0,375,0,375,420,0,0,0,105,315,0,420,105,270,0,375,1000,8571,8929,,2012,,,600,600,555,0,0,0,0,366,,20766,3234,,375,,,,0,1000,1000,1000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,,,,,,,,,40,,,,,,,,,40,,,,,,,,,10,,,,,,,,,40,,,,,,,,,30,,,,,,,,,,,,,,,,,,,,


In [31]:
# Columns with more than this percentage of missing values are discarded.
MISSING_PCT_THRESHOLD = 20

# Identify columns whose share of missing values exceeds the threshold
cols_to_drop = missing_percent[missing_percent > MISSING_PCT_THRESHOLD].index.tolist()

print(f"Dropping {len(cols_to_drop)} columns: {cols_to_drop}")

# Drop the columns into a new frame; df_2022 itself is left unchanged
df_2022_cleaned = df_2022.drop(columns=cols_to_drop)

print(f"Original shape: {df_2022.shape}")
print(f"New shape after dropping columns: {df_2022_cleaned.shape}")
df_2022_cleaned.head()

Dropping 117 columns: ['anyo_ingreso', 'tipo_ingreso', 'nota10_hash', 'nota14_hash', 'preferencia_seleccion', 'baja_fecha', 'exento_npp', 'es_retitulado', 'es_adaptado', 'actividades', 'ajuste', 'impagado_curso_mat', 'pract1', 'activ1', 'total1', 'rend_total_ultimo', 'rend_total_penultimo', 'rend_total_antepenultimo', 'pft_events_2022_9', 'pft_days_logged_2022_9', 'pft_visits_2022_9', 'pft_assignment_submissions_2022_9', 'pft_test_submissions_2022_9', 'pft_total_minutes_2022_9', 'n_wifi_days_2022_9', 'resource_events_2022_9', 'n_resource_days_2022_9', 'pft_events_2022_10', 'pft_days_logged_2022_10', 'pft_visits_2022_10', 'pft_assignment_submissions_2022_10', 'pft_test_submissions_2022_10', 'pft_total_minutes_2022_10', 'n_wifi_days_2022_10', 'resource_events_2022_10', 'n_resource_days_2022_10', 'pft_events_2022_11', 'pft_days_logged_2022_11', 'pft_visits_2022_11', 'pft_assignment_submissions_2022_11', 'pft_test_submissions_2022_11', 'pft_total_minutes_2022_11', 'n_wifi_days_2022_11', 'r

Unnamed: 0,dni_hash,tit_hash,asi_hash,campus_hash,estudios_p_hash,estudios_m_hash,dedicacion,desplazado_hash,abandono_hash,caca,grupos_por_tipocredito_hash,matricula_activa,nota_asig_hash,fecha_datos,curso_mas_bajo,curso_mas_alto,cred_mat1,cred_mat2,cred_mat3,cred_mat4,cred_mat5,cred_mat6,cred_sup_normal,cred_sup_espec,cred_sup,cred_mat_normal,cred_mat_movilidad,cred_ptes_acta,cred_mat_practicas,cred_mat_sem_a,cred_mat_sem_b,cred_mat_anu,cred_mat_total,cred_sup_sem_a,cred_sup_sem_b,cred_sup_anu,cred_sup_total,rendimiento_cuat_a,rendimiento_cuat_b,rendimiento_total,anyo_inicio_estudios,cred_sup_1o,cred_sup_2o,cred_sup_3o,cred_sup_4o,cred_sup_5o,cred_sup_6o,practicas,cred_sup_tit,cred_pend_sup_tit,asig1,ajuste1
0,319636fc9270,620c9c332101,4596fcf257c4,e4f95d56d90df35e,F,L,TC,A,B,2022,9d4d56e973e0be4b,10,0,2023-11-20 12:04:51,3,4,0,0,45,585,0,0,285,0,285,650,0,0,0,330,180,120,630,285,0,0,285,8636,0,4524,2012,600,600,555,285,0,0,0,20766,3234,285,0
1,319636fc9270,620c9c332101,81f4b5a1d0a8,e4f95d56d90df35e,F,L,TC,A,B,2022,0206bf80627626d5,10,0,2023-11-20 12:04:51,3,4,0,0,45,585,0,0,285,0,285,650,0,0,0,330,180,120,630,285,0,0,285,8636,0,4524,2012,600,600,555,285,0,0,0,20766,3234,285,0
2,319636fc9270,620c9c332101,442fcac005ed,e4f95d56d90df35e,F,L,TC,A,B,2022,1c2c484557b418df,10,19,2023-11-20 12:04:51,3,4,0,0,45,585,0,0,285,0,285,650,0,0,0,330,180,120,630,285,0,0,285,8636,0,4524,2012,600,600,555,285,0,0,0,20766,3234,285,0
3,319636fc9270,620c9c332101,3dc87ab71825,e4f95d56d90df35e,F,L,TC,A,B,2022,e4a1114afa2006ec,10,0,2023-11-20 12:04:51,3,4,0,0,45,585,0,0,285,0,285,650,0,0,0,330,180,120,630,285,0,0,285,8636,0,4524,2012,600,600,555,285,0,0,0,20766,3234,285,0
4,319636fc9270,620c9c332101,677c622c0bfb,e4f95d56d90df35e,F,L,TC,A,B,2022,da465a23551b7687,10,0,2023-11-20 12:04:51,3,4,0,0,45,585,0,0,285,0,285,650,0,0,0,330,180,120,630,285,0,0,285,8636,0,4524,2012,600,600,555,285,0,0,0,20766,3234,285,0


File exists: True
