# Predicción del rendimiento académico: un enfoque de aprendizaje automático que utiliza LightGBM para la detección temprana del abandono escolar

In [None]:
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset at module level for ad-hoc inspection.
# NOTE(review): the classes defined in this notebook read the same CSV themselves and
# none of the cells shown here reference this `df` again — confirm it is still needed.
df = pd.read_csv('data/students_dropout/dataset.csv')

In [None]:
class StudentDropoutAnalysis:
    """Exploratory data analysis of the student dropout dataset.

    The constructor reads the CSV into ``self.df`` and immediately adds a few
    derived columns (see :meth:`prepare_data`). Every ``*_analysis`` method
    renders one or more Plotly figures with ``fig.show()`` and returns ``None``.
    """

    DEFAULT_DATA_PATH = 'data/students_dropout/dataset.csv'

    # Fixed colours so the same outcome looks identical in every subplot.
    OUTCOME_COLORS = {
        'Dropout': '#EF553B',
        'Graduate': '#00CC96',
        'Enrolled': '#636EFA',
    }

    def __init__(self, data_path=DEFAULT_DATA_PATH):
        """Load the dataset and compute the derived columns.

        Args:
            data_path: Location of the CSV file. Defaults to the path that
                was previously hard-coded, so existing callers are unaffected.
        """
        self.df = pd.read_csv(data_path)
        self.prepare_data()

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Divide two Series, yielding NaN (never inf) where the denominator is 0."""
        return numerator / denominator.where(denominator != 0)

    def _add_outcome_bars(self, fig, counts, outcomes, row, col, show_legend):
        """Add one bar trace per outcome column of ``counts`` to a subplot cell.

        Traces sharing an outcome are placed in the same legend group so the
        legend lists each outcome once instead of once per subplot.
        """
        for outcome in outcomes:
            fig.add_trace(
                go.Bar(
                    x=counts.index,
                    y=counts[outcome],
                    name=outcome,
                    legendgroup=outcome,
                    showlegend=show_legend,
                    marker_color=self.OUTCOME_COLORS.get(outcome),
                ),
                row=row, col=col,
            )

    @staticmethod
    def _correlation_heatmap(corr_matrix, **layout):
        """Build a red/blue heatmap figure (range -1..1) for a correlation matrix."""
        fig = go.Figure(data=go.Heatmap(
            z=corr_matrix.values,
            x=corr_matrix.columns,
            y=corr_matrix.columns,
            colorscale='RdBu',
            zmin=-1, zmax=1,
        ))
        fig.update_layout(**layout)
        return fig

    def prepare_data(self):
        """Add the derived columns used by the rest of the analysis.

        Adds ``academic_load`` (units enrolled over both semesters),
        ``academic_performance`` (1st-semester approved / enrolled, NaN when
        nothing was enrolled) and ``grade_evolution`` (2nd minus 1st semester
        grade) to ``self.df`` in place.
        """
        df = self.df
        enrolled_1st = df['Curricular units 1st sem (enrolled)']
        df['academic_load'] = enrolled_1st + df['Curricular units 2nd sem (enrolled)']
        df['academic_performance'] = self._safe_ratio(
            df['Curricular units 1st sem (approved)'], enrolled_1st
        )
        df['grade_evolution'] = (
            df['Curricular units 2nd sem (grade)']
            - df['Curricular units 1st sem (grade)']
        )

    def basic_statistics(self):
        """Print schema information, descriptive statistics and the Target distribution."""
        print("\nInformación básica del dataset:")
        # DataFrame.info() writes straight to stdout and returns None, so it
        # must not be wrapped in print() (that emitted a stray "None" line).
        self.df.info()

        print("\nEstadísticas descriptivas:")
        print(self.df.describe())

        print("\nDistribución de Target:")
        print(self.df['Target'].value_counts(normalize=True))

    def demographic_analysis(self):
        """Plot the age distribution and a 2x2 grid of demographic breakdowns."""
        # Age distribution
        age_fig = px.histogram(self.df, x="Age at enrollment", color="Target",
                               title="Distribución de Edad por Resultado",
                               marginal="box")
        age_fig.show()

        fig = make_subplots(rows=2, cols=2,
                            subplot_titles=('Distribución por Género',
                                            'Estado Civil',
                                            'Nacionalidad',
                                            'Desplazamiento'))

        # Gender: the only cell that contributes legend entries.
        gender_data = pd.crosstab(self.df['Gender'], self.df['Target'])
        self._add_outcome_bars(fig, gender_data, ('Dropout', 'Graduate'),
                               row=1, col=1, show_legend=True)

        # Marital status
        marital_data = pd.crosstab(self.df['Marital status'], self.df['Target'])
        self._add_outcome_bars(fig, marital_data, ('Dropout', 'Graduate'),
                               row=1, col=2, show_legend=False)

        # Ten most frequent nationalities (overall counts, not split by outcome)
        nationality_data = self.df['Nacionality'].value_counts().head(10)
        fig.add_trace(
            go.Bar(x=nationality_data.index, y=nationality_data.values,
                   showlegend=False),
            row=2, col=1,
        )

        # Displaced students (dropout counts only)
        displaced_data = pd.crosstab(self.df['Displaced'], self.df['Target'])
        self._add_outcome_bars(fig, displaced_data, ('Dropout',),
                               row=2, col=2, show_legend=False)

        fig.update_layout(height=800, title_text="Análisis Demográfico")
        fig.show()

    def academic_performance_analysis(self):
        """Plot academic performance by outcome and the academic correlation heatmap.

        Side effect: adds ``units_success_rate_1st``, ``overall_approval_rate``
        and ``total_evaluations`` to ``self.df`` (and recomputes
        ``grade_evolution``), so later analyses see these columns too.
        """
        df = self.df

        # Derived metrics; ratios are NaN rather than inf when nothing was
        # enrolled, which keeps the mean shown in the annotation finite.
        df['units_success_rate_1st'] = self._safe_ratio(
            df['Curricular units 1st sem (approved)'],
            df['Curricular units 1st sem (enrolled)'],
        )
        df['grade_evolution'] = (
            df['Curricular units 2nd sem (grade)']
            - df['Curricular units 1st sem (grade)']
        )
        df['overall_approval_rate'] = self._safe_ratio(
            df['Curricular units 1st sem (approved)']
            + df['Curricular units 2nd sem (approved)'],
            df['Curricular units 1st sem (enrolled)']
            + df['Curricular units 2nd sem (enrolled)'],
        )
        df['total_evaluations'] = (
            df['Curricular units 1st sem (evaluations)']
            + df['Curricular units 2nd sem (evaluations)']
        )

        fig = make_subplots(rows=3, cols=2,
                            subplot_titles=('Rendimiento 1er Semestre',
                                            'Rendimiento 2do Semestre',
                                            'Unidades Aprobadas vs Matriculadas',
                                            'Evolución de Notas',
                                            'Tasa de Aprobación',
                                            'Evaluaciones Totales'))

        # (column, trace name, subplot row, subplot col)
        box_specs = [
            ('Curricular units 1st sem (grade)', '1er Sem', 1, 1),
            ('Curricular units 2nd sem (grade)', '2do Sem', 1, 2),
            ('units_success_rate_1st', 'Tasa Éxito 1S', 2, 1),
            ('grade_evolution', 'Evolución', 2, 2),
            ('overall_approval_rate', 'Tasa Global', 3, 1),
            ('total_evaluations', 'Evaluaciones', 3, 2),
        ]
        for column, trace_name, row, col in box_specs:
            fig.add_trace(
                go.Box(x=df['Target'], y=df[column], name=trace_name),
                row=row, col=col,
            )

        fig.update_layout(
            height=1200,
            title_text="Análisis de Rendimiento Académico",
            showlegend=True
        )

        # Key statistics shown as an annotation above the plot area
        stats_text = (
            f"Promedio 1er Sem: {df['Curricular units 1st sem (grade)'].mean():.2f}<br>"
            f"Promedio 2do Sem: {df['Curricular units 2nd sem (grade)'].mean():.2f}<br>"
            f"Tasa Aprobación Media: {df['overall_approval_rate'].mean():.2%}"
        )
        fig.add_annotation(
            xref="paper", yref="paper",
            x=1.0, y=1.1,
            text=stats_text,
            showarrow=False,
            font=dict(size=10),
            align="left"
        )
        fig.show()

        # Correlations among the raw "Curricular units ..." columns only
        academic_cols = [col for col in df.columns if 'Curricular units' in col]
        heatmap = self._correlation_heatmap(
            df[academic_cols].corr(),
            title='Correlaciones entre Variables Académicas',
            height=800,
            width=1000,
            xaxis={'tickangle': 45},
        )
        heatmap.show()

    def economic_context_analysis(self):
        """Plot macro-economic indicators by outcome and scholarship counts."""
        fig = make_subplots(rows=2, cols=2,
                            subplot_titles=('Desempleo por Resultado',
                                            'Inflación por Resultado',
                                            'GDP por Resultado',
                                            'Distribución de Becas'))

        # One box plot per economic indicator
        indicator_cells = [
            ('Unemployment rate', 1, 1),
            ('Inflation rate', 1, 2),
            ('GDP', 2, 1),
        ]
        for column, row, col in indicator_cells:
            fig.add_trace(
                go.Box(x=self.df['Target'], y=self.df[column]),
                row=row, col=col,
            )

        # Number of scholarship holders (column value 1) per outcome
        scholarship_data = pd.crosstab(self.df['Target'], self.df['Scholarship holder'])
        fig.add_trace(
            go.Bar(x=scholarship_data.index, y=scholarship_data[1], name='Con Beca'),
            row=2, col=2
        )

        fig.update_layout(height=800, title_text="Análisis del Contexto Económico")
        fig.show()

    def family_background_analysis(self):
        """Plot dropout counts by parents' qualification and occupation."""
        fig = make_subplots(rows=2, cols=2,
                            subplot_titles=('Calificación de la Madre',
                                            'Calificación del Padre',
                                            'Ocupación de la Madre',
                                            'Ocupación del Padre'))

        # (source column, subplot row, subplot col)
        family_cells = [
            ("Mother's qualification", 1, 1),
            ("Father's qualification", 1, 2),
            ("Mother's occupation", 2, 1),
            ("Father's occupation", 2, 2),
        ]
        for position, (column, row, col) in enumerate(family_cells):
            counts = pd.crosstab(self.df[column], self.df['Target'])
            # Only the first cell contributes a legend entry; the others
            # share its legend group.
            self._add_outcome_bars(fig, counts, ('Dropout',),
                                   row=row, col=col, show_legend=(position == 0))

        fig.update_layout(height=1000, title_text="Análisis del Contexto Familiar")
        fig.show()

    def correlation_analysis(self):
        """Plot the correlation matrix of every numeric column in ``self.df``."""
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        fig = self._correlation_heatmap(
            self.df[numeric_cols].corr(),
            title='Matriz de Correlación',
            height=1000,
            width=1000,
        )
        fig.show()

    def course_specific_analysis(self):
        """Plot the dropout rate and the 1st-semester grade distribution per course."""
        # Dropout rate per course = dropouts / all students of that course
        course_data = pd.crosstab(self.df['Course'], self.df['Target'])
        course_dropout_rate = course_data['Dropout'] / course_data.sum(axis=1)

        rate_fig = go.Figure(data=[
            go.Bar(x=course_dropout_rate.index, y=course_dropout_rate.values)
        ])
        rate_fig.update_layout(
            title='Tasa de Deserción por Curso',
            xaxis_title='Curso',
            yaxis_title='Tasa de Deserción'
        )
        rate_fig.show()

        # Grade distribution per course, split by outcome
        grade_fig = px.box(self.df, x='Course', y='Curricular units 1st sem (grade)',
                           color='Target', title='Rendimiento por Curso')
        grade_fig.show()

    def generate_insights(self):
        """Return the headline numbers of the analysis as a ``pd.Series``.

        Keys:
            total_students: number of rows.
            dropout_rate: share of rows whose Target is ``'Dropout'``.
            avg_age: mean age at enrollment.
            scholarship_impact: dropout rate of scholarship holders (value 1)
                minus the dropout rate of non-holders (value 0); negative
                means holders drop out less. NaN if either group is absent.
                (This used to be the mean of raw dropout *counts* across the
                two groups, which is not an interpretable quantity.)
            performance_correlation: correlation between 1st and 2nd semester
                grades.
        """
        is_dropout = self.df['Target'] == 'Dropout'
        dropout_by_scholarship = is_dropout.groupby(self.df['Scholarship holder']).mean()
        scholarship_impact = (
            dropout_by_scholarship.get(1, np.nan)
            - dropout_by_scholarship.get(0, np.nan)
        )

        insights = {
            'total_students': len(self.df),
            'dropout_rate': is_dropout.mean(),
            'avg_age': self.df['Age at enrollment'].mean(),
            'scholarship_impact': scholarship_impact,
            'performance_correlation': self.df['Curricular units 1st sem (grade)'].corr(
                self.df['Curricular units 2nd sem (grade)']),
        }
        return pd.Series(insights)

In [None]:
class StudentDropoutModel:
    """LightGBM pipeline that predicts the student outcome (``Target``).

    Typical use: construct, call :meth:`model_data_preparation` and
    :meth:`create_features` (in either order), then :meth:`train_model` and
    :meth:`get_results`.
    """

    DEFAULT_DATA_PATH = 'data/students_dropout/dataset.csv'

    def __init__(self, test_size=0.2, random_state=42, data_path=DEFAULT_DATA_PATH):
        """Store the configuration and load/split the dataset.

        Args:
            test_size: Fraction of rows held out as the test set.
            random_state: Seed used for the split, the models and the folds.
            data_path: Location of the CSV file. Defaults to the path that
                was previously hard-coded.
        """
        self.test_size = test_size
        self.random_state = random_state
        self.data_path = data_path
        self.label_encoders = {}
        self.selected_features = None
        self.load_data()

    # ------------------------------------------------------------------ helpers
    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Divide two Series, yielding NaN (never inf) where the denominator is 0."""
        return numerator / denominator.where(denominator != 0)

    def _refresh_design_matrices(self):
        """Rebuild ``X`` / ``X_test`` from the current train/test frames."""
        self.X = self.train_df.drop('Target', axis=1)
        self.X_test = self.test_df.drop('Target', axis=1)

    def _balanced_class_weights(self):
        """Return ``{class_index: weight}`` using sklearn's 'balanced' heuristic."""
        classes = np.unique(self.y)
        weights = compute_class_weight('balanced', classes=classes, y=self.y)
        return dict(zip(range(len(classes)), weights))

    def _fixed_model_params(self):
        """Parameters shared by the grid search and the final fold models."""
        return {
            'objective': 'multiclass',
            'verbosity': -1,
            'random_state': self.random_state,
            # LightGBM ignores `subsample` unless bagging is enabled with a
            # non-zero frequency.
            'subsample_freq': 1,
            'class_weight': self._balanced_class_weights(),
        }

    # --------------------------------------------------------------------- data
    def load_data(self):
        """Read the CSV and make a stratified train/test split on ``Target``."""
        df = pd.read_csv(self.data_path)
        train_df, test_df = train_test_split(
            df,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=df['Target']
        )
        # Independent copies: later column assignments must not be treated as
        # writes into slices of the loaded frame.
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

    def remove_multicollinearity(self, threshold=0.75):
        """Drop every feature whose absolute correlation with an earlier one exceeds ``threshold``.

        Only the upper triangle of the correlation matrix is inspected, so of
        two correlated columns the one appearing later in ``X`` is removed.
        The same columns are removed from ``X_test``.
        """
        correlation_matrix = self.X.select_dtypes(include=[np.number]).corr().abs()
        upper_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
        upper = correlation_matrix.where(upper_mask)
        to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

        self.X = self.X.drop(columns=to_drop)
        self.X_test = self.X_test.drop(columns=to_drop)
        print(f"Removed {len(to_drop)} collinear features: {to_drop}")

    def select_features(self, k=20):
        """Keep the ``k`` features with the highest LightGBM gain importance."""
        model = LGBMClassifier(
            n_estimators=1000,
            importance_type='gain',
            random_state=self.random_state
        )
        model.fit(self.X, self.y)

        feature_imp = pd.DataFrame({
            'feature': self.X.columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

        self.selected_features = feature_imp.head(k)['feature'].tolist()
        self.X = self.X[self.selected_features]
        self.X_test = self.X_test[self.selected_features]
        print(f"Selected top {k} features")

    def model_data_preparation(self):
        """Label-encode the categorical columns and the target, then build ``X``, ``y``, ``X_test``.

        Each encoder is fitted on train and test values together so that a
        category present only in the test split can still be transformed.
        """
        categorical_features = ['Marital status', 'Application mode', 'Course',
                                'Previous qualification', 'Nacionality', 'Gender']

        for feature in categorical_features:
            encoder = LabelEncoder()
            encoder.fit(pd.concat([self.train_df[feature], self.test_df[feature]], axis=0))
            self.label_encoders[feature] = encoder
            self.train_df[feature] = encoder.transform(self.train_df[feature])
            self.test_df[feature] = encoder.transform(self.test_df[feature])

        self.target_encoder = LabelEncoder()
        self.y = self.target_encoder.fit_transform(self.train_df['Target'])
        self._refresh_design_matrices()

    def create_features(self):
        """Add engineered features to the train and test frames in place.

        Ratios whose denominator is zero become NaN instead of inf. If
        ``X`` / ``X_test`` were already built by :meth:`model_data_preparation`
        they are rebuilt afterwards, so the engineered columns reach the model
        regardless of the order in which the two methods are called.
        """
        for df in [self.train_df, self.test_df]:
            approved = df['Curricular units 1st sem (approved)']
            enrolled = df['Curricular units 1st sem (enrolled)']
            evaluations = df['Curricular units 1st sem (evaluations)']
            grade = df['Curricular units 1st sem (grade)']

            # Academic performance - first semester
            df['approval_rate_1st'] = self._safe_ratio(approved, enrolled)
            df['evaluation_success_1st'] = self._safe_ratio(approved, evaluations)
            df['grade_eval_ratio_1st'] = grade / (evaluations + 1)
            df['academic_load_1st'] = enrolled * evaluations

            # Performance indicators
            df['performance_index'] = approved * grade
            df['attendance_rate'] = 1 - self._safe_ratio(
                df['Curricular units 1st sem (without evaluations)'], enrolled
            )

            # Economic features
            df['economic_stress'] = self._safe_ratio(
                df['Unemployment rate'] * (1 + df['Inflation rate']),
                df['GDP'] + 1,
            )
            df['financial_status'] = df['Tuition fees up to date'] * (1 - df['Debtor'])

            # Demographic interactions
            df['parent_qual_sum'] = df["Mother's qualification"] + df["Father's qualification"]
            df['age_qual_interaction'] = df['Age at enrollment'] * df['parent_qual_sum']
            df['scholarship_eco'] = df['Scholarship holder'] * df['economic_stress']

            # Categorical interactions
            df['academic_background'] = df['Previous qualification'] * df['approval_rate_1st']
            df['enrollment_context'] = df['Application order'] * df['approval_rate_1st']

        if hasattr(self, 'X'):
            self._refresh_design_matrices()

    # ----------------------------------------------------------------- training
    def optimize_parameters(self, max_combinations=None):
        """Grid-search LightGBM hyper-parameters with 5-fold CV accuracy.

        Args:
            max_combinations: If given, evaluate only this many combinations,
                drawn without replacement using ``self.random_state``. The
                default (``None``) evaluates the full grid, which is
                3**9 = 19,683 combinations, each cross-validated 5 times.

        Returns:
            The best-scoring combination as a dict of grid parameters.

        Raises:
            RuntimeError: If no combination produced a valid score.
        """
        param_grid = {
            'n_estimators': [500, 1000, 1500],
            'learning_rate': [0.01, 0.03, 0.05],
            'max_depth': [10, 15, 20],
            'num_leaves': [31, 50, 63],
            'colsample_bytree': [0.7, 0.8, 0.9],
            'subsample': [0.7, 0.8, 0.9],
            'min_child_samples': [20, 30, 50],
            'reg_alpha': [0.0, 0.1, 0.3],
            'reg_lambda': [0.0, 0.1, 0.3]
        }

        candidates = [dict(zip(param_grid, values))
                      for values in itertools.product(*param_grid.values())]
        if max_combinations is not None and 0 < max_combinations < len(candidates):
            rng = np.random.RandomState(self.random_state)
            picked = rng.choice(len(candidates), size=max_combinations, replace=False)
            candidates = [candidates[i] for i in sorted(picked)]

        fixed_params = self._fixed_model_params()
        best_score = -np.inf
        best_params = None

        for params in candidates:
            model = LGBMClassifier(**params, **fixed_params)
            scores = cross_val_score(model, self.X, self.y, cv=5, scoring='accuracy')
            mean_score = np.mean(scores)

            if mean_score > best_score:
                best_score = mean_score
                best_params = params

        if best_params is None:
            raise RuntimeError("No parameter combination produced a valid CV score")

        print(f"Best parameters found: {best_params}")
        print(f"Best CV score: {best_score:.4f}")
        return best_params

    def train_model(self, n_splits=5, max_param_combinations=None):
        """Select features, tune parameters and train one model per CV fold.

        Test-set class probabilities are averaged over the fold models and
        stored in ``self.predictions``; per-fold validation accuracies are
        stored in ``self.fold_scores``.

        Args:
            n_splits: Number of stratified folds.
            max_param_combinations: Forwarded to :meth:`optimize_parameters`
                as ``max_combinations``; ``None`` keeps the exhaustive search.

        Returns:
            Array of shape ``(len(X_test), n_classes)`` with averaged probabilities.
        """
        # Imported here because the file-level import only brings in
        # LGBMClassifier. The fit() keywords `early_stopping_rounds` and
        # `verbose` no longer exist in LightGBM 4; the callback replaces them.
        from lightgbm import early_stopping

        # Remove multicollinearity and select features
        self.remove_multicollinearity()
        self.select_features()

        # Best grid parameters plus the fixed settings used during the search
        best_params = dict(self.optimize_parameters(max_combinations=max_param_combinations))
        best_params.update(self._fixed_model_params())

        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=self.random_state)
        self.fold_scores = []
        self.predictions = np.zeros((len(self.X_test), len(np.unique(self.y))))

        for fold, (train_idx, val_idx) in enumerate(kf.split(self.X, self.y)):
            X_train, X_val = self.X.iloc[train_idx], self.X.iloc[val_idx]
            y_train, y_val = self.y[train_idx], self.y[val_idx]

            model = LGBMClassifier(**best_params)
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric='multi_logloss',
                callbacks=[early_stopping(stopping_rounds=50, verbose=False)],
            )

            val_preds = model.predict(X_val)
            fold_score = accuracy_score(y_val, val_preds)
            self.fold_scores.append(fold_score)

            self.predictions += model.predict_proba(self.X_test) / n_splits
            print(f"Fold {fold + 1} Accuracy: {fold_score:.4f}")

        print(f"\nMean CV Accuracy: {np.mean(self.fold_scores):.4f} ± {np.std(self.fold_scores):.4f}")
        return self.predictions

    def get_results(self):
        """Return true labels, predicted labels and per-class probabilities for the test set.

        Probability columns are named ``<class>_Prob`` and are looked up through
        ``target_encoder.classes_``. The encoder orders classes alphabetically,
        so column positions must not be assumed (the previous hard-coded
        mapping swapped the Graduate and Enrolled probabilities).
        """
        pred_labels = np.argmax(self.predictions, axis=1)
        final_predictions = self.target_encoder.inverse_transform(pred_labels)

        class_names = list(self.target_encoder.classes_)
        # Keep the historical column order for the known classes, then append
        # any other class the encoder may contain.
        ordered = [name for name in ('Dropout', 'Graduate', 'Enrolled') if name in class_names]
        ordered += [name for name in class_names if name not in ordered]

        columns = {
            'True_Label': self.test_df['Target'],
            'Predicted': final_predictions,
        }
        for name in ordered:
            columns[f'{name}_Prob'] = self.predictions[:, class_names.index(name)]

        results_df = pd.DataFrame(columns)
        return results_df

## Análisis Exploratorio de Datos - Deserción Estudiantil

In [None]:
# Build the exploratory-analysis helper; its constructor reads the dataset CSV
# and adds the derived columns via prepare_data().
analysis = StudentDropoutAnalysis()

In [None]:
# Run the complete analysis: each call prints its statistics or renders its figures.
analysis.basic_statistics()
analysis.demographic_analysis()
analysis.academic_performance_analysis()
analysis.economic_context_analysis()
analysis.family_background_analysis()
analysis.correlation_analysis()
analysis.course_specific_analysis()

In [None]:
# Show the headline insights (generate_insights() returns a pandas Series).
print("\nInsights principales:")
print(analysis.generate_insights())

## Modelando la deserción

In [None]:
# Create the modelling pipeline with its defaults (test_size=0.2, random_state=42);
# the constructor loads the CSV and makes the stratified train/test split.
model = StudentDropoutModel()

In [None]:
# Engineer features BEFORE building the design matrices: model_data_preparation()
# derives X / X_test from train_df / test_df at call time, so features created
# afterwards would never reach the model.
model.create_features()
model.model_data_preparation()

In [None]:
# Prune features, tune hyper-parameters and train the fold models; the return
# value is the matrix of test-set class probabilities averaged over the folds.
predictions = model.train_model()

In [None]:
# Collect true labels, predicted labels and per-class probabilities in a DataFrame.
results = model.get_results()

In [None]:
# Print the first rows of the prediction table as a quick sanity check.
print("\nMuestra de predicciones:")
print(results.head())