In [39]:
!pip install streamlit
!pip install plotly



In [51]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import os

In [41]:
# Set file path
DATA_PATH = "C:/Intro_DataScience/Week4/Academic_Success_Data.csv"

In [42]:
class StudentDropoutPredictor:
    def __init__(self):
        self.model = RandomForestClassifier(random_state=42)
        self.scaler = StandardScaler()
        
    def load_data(self):
        """Load the dataset."""
        try:
            # Load data
            df = pd.read_csv(DATA_PATH)
            print("Data loaded successfully!")
            print("Shape of data:", df.shape)
            return df
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            return None
    
    def preprocess_data(self, df):
        """Preprocess the data."""
        # Filter for only Dropout and Graduate
        df = df[df['Target'].isin(['Dropout', 'Graduate'])]
        
        # Feature engineering
        df['first_sem_success_ratio'] = df['Curricular units 1st sem (approved)'] / df['Curricular units 1st sem (enrolled)'].replace(0, 1)
        df['second_sem_success_ratio'] = df['Curricular units 2nd sem (approved)'] / df['Curricular units 2nd sem (enrolled)'].replace(0, 1)
        df['average_grade'] = (df['Curricular units 1st sem (grade)'] + df['Curricular units 2nd sem (grade)']) / 2
        
        return df
    
    def prepare_features(self, df):
        """Prepare features for modeling."""
        # Select features
        features = [
            'Age at enrollment', 'Previous qualification (grade)', 'Admission grade',
            'Curricular units 1st sem (enrolled)', 'Curricular units 2nd sem (enrolled)',
            'first_sem_success_ratio', 'second_sem_success_ratio', 'average_grade'
        ]
        
        X = df[features]
        y = df['Target']
        
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        return X_train_scaled, X_test_scaled, y_train, y_test, features
    
    def train_model(self, X_train, y_train):
        """Train the model."""
        self.model.fit(X_train, y_train)
        print("Model trained successfully!")
    
    def evaluate_model(self, X_test, y_test):
        """Evaluate the model."""
        y_pred = self.model.predict(X_test)
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
    
    def predict(self, input_data):
        """Make predictions."""
        # Scale the input data
        input_scaled = self.scaler.transform(input_data)
        # Make prediction
        prediction = self.model.predict(input_scaled)
        probability = self.model.predict_proba(input_scaled)
        return prediction[0], probability[0]


In [52]:
def main():
    # Create predictor instance
    predictor = StudentDropoutPredictor()
    
    # Load data
    print("Loading data...")
    df = predictor.load_data()
    
    if df is not None:
        # Preprocess data
        print("\nPreprocessing data...")
        df_processed = predictor.preprocess_data(df)
        
        # Prepare features
        print("\nPreparing features...")
        X_train, X_test, y_train, y_test, features = predictor.prepare_features(df_processed)
        
        # Train model
        print("\nTraining model...")
        predictor.train_model(X_train, y_train)
        
        # Evaluate model
        print("\nEvaluating model...")
        predictor.evaluate_model(X_test, y_test)
        
        # Example prediction
        print("\nMaking example prediction...")
        example_data = pd.DataFrame({
            'Age at enrollment': [20],
            'Previous qualification (grade)': [120],
            'Admission grade': [130],
            'Curricular units 1st sem (enrolled)': [6],
            'Curricular units 2nd sem (enrolled)': [6],
            'first_sem_success_ratio': [0.8],
            'second_sem_success_ratio': [0.8],
            'average_grade': [12.5]
        })
        
        prediction, probability = predictor.predict(example_data)
        print(f"\nPredicted outcome: {prediction}")
        print(f"Probability: {max(probability):.2%}")

if __name__ == "__main__":
    main()

Loading data...


AttributeError: 'StudentDropoutPredictor' object has no attribute 'load_data'

Alternate method

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [54]:
class StudentDropoutPredictor:
    def __init__(self):
        """Initialize the predictor with model and scaler."""
        self.model = RandomForestClassifier(random_state=42)
        self.scaler = StandardScaler()
        self.features = None
    
    def load_data(self):
        """Load the dataset."""
        try:
            # Load data with explicit encoding
            df = pd.read_csv("C:/Intro_DataScience/Week4/Academic_Success_Data.csv", encoding='utf-8')
            print("Data loaded successfully!")
            print("Shape of data:", df.shape)
            return df
        except FileNotFoundError:
            print("Error: File not found. Please check the file path.")
            return None
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            return None
    
    def preprocess_data(self, df):
        """Preprocess the data."""
        try:
            # Filter for only Dropout and Graduate
            df = df[df['Target'].isin(['Dropout', 'Graduate'])]
            
            # Feature engineering
            df['first_sem_success_ratio'] = (
                df['Curricular units 1st sem (approved)'] / 
                df['Curricular units 1st sem (enrolled)'].replace(0, 1)
            )
            
            df['second_sem_success_ratio'] = (
                df['Curricular units 2nd sem (approved)'] / 
                df['Curricular units 2nd sem (enrolled)'].replace(0, 1)
            )
            
            df['average_grade'] = (
                df['Curricular units 1st sem (grade)'] + 
                df['Curricular units 2nd sem (grade)']
            ) / 2
            
            print("Data preprocessing completed successfully!")
            return df
            
        except Exception as e:
            print(f"Error in preprocessing: {str(e)}")
            return None
    
    def prepare_features(self, df):
        """Prepare features for modeling."""
        try:
            # Select features
            self.features = [
                'Age at enrollment',
                'Previous qualification (grade)',
                'Admission grade',
                'Curricular units 1st sem (enrolled)',
                'Curricular units 2nd sem (enrolled)',
                'first_sem_success_ratio',
                'second_sem_success_ratio',
                'average_grade'
            ]
            
            X = df[self.features]
            y = df['Target']
            
            # Split the data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
            
            # Scale features
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
            
            print("Features prepared successfully!")
            return X_train_scaled, X_test_scaled, y_train, y_test
            
        except Exception as e:
            print(f"Error in feature preparation: {str(e)}")
            return None
    
    def train_model(self, X_train, y_train):
        """Train the model."""
        try:
            self.model.fit(X_train, y_train)
            print("Model trained successfully!")
            return True
        except Exception as e:
            print(f"Error in model training: {str(e)}")
            return False
    
    def evaluate_model(self, X_test, y_test):
        """Evaluate the model."""
        try:
            y_pred = self.model.predict(X_test)
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            return True
        except Exception as e:
            print(f"Error in model evaluation: {str(e)}")
            return False
    
    def predict(self, input_data):
        """Make predictions."""
        try:
            # Ensure input data has all required features
            for feature in self.features:
                if feature not in input_data.columns:
                    raise ValueError(f"Missing feature: {feature}")
            
            # Scale the input data
            input_scaled = self.scaler.transform(input_data)
            
            # Make prediction
            prediction = self.model.predict(input_scaled)
            probability = self.model.predict_proba(input_scaled)
            
            return prediction[0], probability[0]
            
        except Exception as e:
            print(f"Error in prediction: {str(e)}")
            return None, None

In [55]:
def main():
    # Create predictor instance
    predictor = StudentDropoutPredictor()
    
    # Load data
    print("Loading data...")
    df = predictor.load_data()
    
    if df is not None:
        # Preprocess data
        print("\nPreprocessing data...")
        df_processed = predictor.preprocess_data(df)
        
        if df_processed is not None:
            # Prepare features
            print("\nPreparing features...")
            result = predictor.prepare_features(df_processed)
            
            if result is not None:
                X_train, X_test, y_train, y_test = result
                
                # Train model
                print("\nTraining model...")
                if predictor.train_model(X_train, y_train):
                    
                    # Evaluate model
                    print("\nEvaluating model...")
                    predictor.evaluate_model(X_test, y_test)
                    
                    # Example prediction
                    print("\nMaking example prediction...")
                    example_data = pd.DataFrame({
                        'Age at enrollment': [20],
                        'Previous qualification (grade)': [120],
                        'Admission grade': [130],
                        'Curricular units 1st sem (enrolled)': [6],
                        'Curricular units 2nd sem (enrolled)': [6],
                        'first_sem_success_ratio': [0.8],
                        'second_sem_success_ratio': [0.8],
                        'average_grade': [12.5]
                    })
                    
                    prediction, probability = predictor.predict(example_data)
                    if prediction is not None:
                        print(f"\nPredicted outcome: {prediction}")
                        print(f"Probability: {max(probability):.2%}")

if __name__ == "__main__":
    main()

Loading data...
Data loaded successfully!
Shape of data: (4424, 37)

Preprocessing data...
Data preprocessing completed successfully!

Preparing features...
Features prepared successfully!

Training model...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['first_sem_success_ratio'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['second_sem_success_ratio'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['average_grade'] = (


Model trained successfully!

Evaluating model...

Classification Report:
              precision    recall  f1-score   support

     Dropout       0.87      0.83      0.85       277
    Graduate       0.90      0.93      0.91       449

    accuracy                           0.89       726
   macro avg       0.89      0.88      0.88       726
weighted avg       0.89      0.89      0.89       726


Making example prediction...

Predicted outcome: Graduate
Probability: 71.00%


## Enhancing the model and adding in new Features

In [56]:
# importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load

In [57]:
class StudentDropoutPredictor:
    def __init__(self):
        """Initialize the predictor with model and scaler."""
        self.model = RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            random_state=42
        )
        self.scaler = StandardScaler()
        self.features = None
    
    def load_data(self):
        """Load the dataset."""
        try:
            df = pd.read_csv("C:/Intro_DataScience/Week4/Academic_Success_Data.csv", encoding='utf-8')
            print("Data loaded successfully!")
            print("Shape of data:", df.shape)
            return df
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            return None
    
    def preprocess_data(self, df):
        """Preprocess the data."""
        try:
            # Create a copy to avoid SettingWithCopyWarning
            df_processed = df.copy()
            
            # Filter for only Dropout and Graduate
            mask = df_processed['Target'].isin(['Dropout', 'Graduate'])
            df_processed = df_processed[mask].copy()
            
            # Feature engineering
            df_processed.loc[:, 'first_sem_success_ratio'] = (
                df_processed['Curricular units 1st sem (approved)'] / 
                df_processed['Curricular units 1st sem (enrolled)'].replace(0, 1)
            )
            
            df_processed.loc[:, 'second_sem_success_ratio'] = (
                df_processed['Curricular units 2nd sem (approved)'] / 
                df_processed['Curricular units 2nd sem (enrolled)'].replace(0, 1)
            )
            
            df_processed.loc[:, 'average_grade'] = (
                df_processed['Curricular units 1st sem (grade)'] + 
                df_processed['Curricular units 2nd sem (grade)']
            ) / 2
            
            # Additional feature engineering
            df_processed.loc[:, 'performance_change'] = (
                df_processed['Curricular units 2nd sem (grade)'] - 
                df_processed['Curricular units 1st sem (grade)']
            )
            
            df_processed.loc[:, 'economic_factor'] = (
                df_processed['Unemployment rate'] * 
                (1 - df_processed['Scholarship holder']) * 
                (1 - df_processed['Tuition fees up to date'])
            )
            
            print("Data preprocessing completed successfully!")
            return df_processed
            
        except Exception as e:
            print(f"Error in preprocessing: {str(e)}")
            return None
    
    def prepare_features(self, df):
        """Prepare features for modeling."""
        try:
            # Select features
            self.features = [
                'Age at enrollment',
                'Previous qualification (grade)',
                'Admission grade',
                'Curricular units 1st sem (enrolled)',
                'Curricular units 2nd sem (enrolled)',
                'first_sem_success_ratio',
                'second_sem_success_ratio',
                'average_grade',
                'performance_change',
                'economic_factor',
                'Scholarship holder',
                'Tuition fees up to date',
                'International',
                'Displacement'
            ]
            
            X = df[self.features]
            y = df['Target']
            
            # Split the data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )
            
            # Scale features
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
            
            print("Features prepared successfully!")
            return X_train_scaled, X_test_scaled, y_train, y_test, X.columns
            
        except Exception as e:
            print(f"Error in feature preparation: {str(e)}")
            return None
    
    def train_model(self, X_train, y_train):
        """Train the model."""
        try:
            self.model.fit(X_train, y_train)
            
            # Perform cross-validation
            cv_scores = cross_val_score(self.model, X_train, y_train, cv=5)
            print("\nCross-validation scores:", cv_scores)
            print(f"Average CV score: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")
            
            # Get feature importance
            self.plot_feature_importance()
            
            print("Model trained successfully!")
            return True
        except Exception as e:
            print(f"Error in model training: {str(e)}")
            return False
    
    def plot_feature_importance(self):
        """Plot feature importance."""
        try:
            importance = pd.DataFrame({
                'feature': self.features,
                'importance': self.model.feature_importances_
            })
            importance = importance.sort_values('importance', ascending=False)
            
            plt.figure(figsize=(10, 6))
            sns.barplot(data=importance, x='importance', y='feature')
            plt.title('Feature Importance')
            plt.show()
            
        except Exception as e:
            print(f"Error plotting feature importance: {str(e)}")
    
    def evaluate_model(self, X_test, y_test):
        """Evaluate the model."""
        try:
            y_pred = self.model.predict(X_test)
            y_prob = self.model.predict_proba(X_test)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Plot confusion matrix
            self.plot_confusion_matrix(y_test, y_pred)
            
            # Plot ROC curve
            self.plot_roc_curve(y_test, y_prob[:, 1])
            
            return True
        except Exception as e:
            print(f"Error in model evaluation: {str(e)}")
            return False
    
    def plot_confusion_matrix(self, y_true, y_pred):
        """Plot confusion matrix."""
        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()
    
    def plot_roc_curve(self, y_true, y_prob):
        """Plot ROC curve."""
        fpr, tpr, _ = roc_curve(y_true == 'Graduate', y_prob)
        roc_auc = auc(fpr, tpr)
        
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, 
                label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.show()
    
    def predict(self, input_data):
        """Make predictions."""
        try:
            # Scale the input data
            input_scaled = self.scaler.transform(input_data)
            
            # Make prediction
            prediction = self.model.predict(input_scaled)
            probability = self.model.predict_proba(input_scaled)
            
            return prediction[0], probability[0]
            
        except Exception as e:
            print(f"Error in prediction: {str(e)}")
            return None, None

In [58]:
def main():
    # Create predictor instance
    predictor = StudentDropoutPredictor()
    
    # Load data
    print("Loading data...")
    df = predictor.load_data()
    
    if df is not None:
        # Preprocess data
        print("\nPreprocessing data...")
        df_processed = predictor.preprocess_data(df)
        
        if df_processed is not None:
            # Prepare features
            print("\nPreparing features...")
            result = predictor.prepare_features(df_processed)
            
            if result is not None:
                X_train, X_test, y_train, y_test, feature_names = result
                
                # Train model
                print("\nTraining model...")
                if predictor.train_model(X_train, y_train):
                    
                    # Evaluate model
                    print("\nEvaluating model...")
                    predictor.evaluate_model(X_test, y_test)
                    
                    # Example prediction
                    print("\nMaking example prediction...")
                    example_data = pd.DataFrame({
                        'Age at enrollment': [20],
                        'Previous qualification (grade)': [120],
                        'Admission grade': [130],
                        'Curricular units 1st sem (enrolled)': [6],
                        'Curricular units 2nd sem (enrolled)': [6],
                        'first_sem_success_ratio': [0.8],
                        'second_sem_success_ratio': [0.8],
                        'average_grade': [12.5],
                        'performance_change': [0.5],
                        'economic_factor': [5.0],
                        'Scholarship holder': [1],
                        'Tuition fees up to date': [1],
                        'International': [0],
                        'Displacement': [0]
                    })
                    
                    prediction, probability = predictor.predict(example_data)
                    if prediction is not None:
                        print(f"\nPredicted outcome: {prediction}")
                        print(f"Probability: {max(probability):.2%}")
                        
                    # Save the model
                    dump(predictor, 'student_dropout_predictor.joblib')
                    print("\nModel saved successfully!")

if __name__ == "__main__":
    main()

Loading data...
Data loaded successfully!
Shape of data: (4424, 37)

Preprocessing data...
Data preprocessing completed successfully!

Preparing features...
Error in feature preparation: "['Displacement'] not in index"


## Alternate method

In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE, mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load

In [60]:
class FeatureSelector:
    def __init__(self):
        """Initialize feature selector with various methods."""
        self.selected_features = None
        self.feature_importance = None
        self.correlation_matrix = None
    
    def correlation_analysis(self, df, threshold=0.8):
        """Remove highly correlated features."""
        # Calculate correlation matrix
        correlation_matrix = df.corr()
        self.correlation_matrix = correlation_matrix
        
        # Plot correlation heatmap
        plt.figure(figsize=(12, 8))
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
        plt.title('Feature Correlation Matrix')
        plt.tight_layout()
        plt.show()
        
        # Find highly correlated features
        high_corr_features = []
        for i in range(len(correlation_matrix.columns)):
            for j in range(i):
                if abs(correlation_matrix.iloc[i, j]) > threshold:
                    colname = correlation_matrix.columns[i]
                    high_corr_features.append(colname)
        
        return list(set(high_corr_features))
    
    def mutual_information(self, X, y, n_features=10):
        """Select features based on mutual information."""
        mi_scores = mutual_info_classif(X, y)
        mi_scores = pd.Series(mi_scores, index=X.columns)
        mi_scores = mi_scores.sort_values(ascending=False)
        
        # Plot mutual information scores
        plt.figure(figsize=(10, 6))
        mi_scores.plot(kind='bar')
        plt.title('Mutual Information Scores')
        plt.xlabel('Features')
        plt.ylabel('Mutual Information')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
        
        return mi_scores.head(n_features).index.tolist()
    
    def recursive_feature_elimination(self, X, y, n_features=10):
        """Perform recursive feature elimination."""
        estimator = RandomForestClassifier(random_state=42)
        selector = RFE(estimator=estimator, n_features_to_select=n_features)
        selector = selector.fit(X, y)
        
        # Get selected features
        selected_features = X.columns[selector.support_].tolist()
        
        # Plot RFE rankings
        rankings = pd.Series(selector.ranking_, index=X.columns)
        plt.figure(figsize=(10, 6))
        rankings.plot(kind='bar')
        plt.title('RFE Feature Rankings')
        plt.xlabel('Features')
        plt.ylabel('Ranking (1 = Selected)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
        
        return selected_features
    
    def select_features(self, X, y, correlation_threshold=0.8, n_features=10):
        """Combine multiple feature selection methods."""
        print("\nPerforming feature selection...")
        
        # 1. Remove highly correlated features
        print("\n1. Correlation Analysis:")
        high_corr_features = self.correlation_analysis(X, correlation_threshold)
        print(f"Highly correlated features to remove: {high_corr_features}")
        
        # 2. Mutual Information
        print("\n2. Mutual Information Analysis:")
        mi_features = self.mutual_information(X, y, n_features)
        print(f"Top features by mutual information: {mi_features}")
        
        # 3. Recursive Feature Elimination
        print("\n3. Recursive Feature Elimination:")
        rfe_features = self.recursive_feature_elimination(X, y, n_features)
        print(f"Features selected by RFE: {rfe_features}")
        
        # Combine results (features that appear in at least 2 methods)
        all_features = set(X.columns) - set(high_corr_features)
        selected_features = [f for f in all_features if sum([
            f in mi_features,
            f in rfe_features
        ]) >= 1]
        
        self.selected_features = selected_features
        print(f"\nFinal selected features: {selected_features}")
        return selected_features

class StudentDropoutPredictor:
    def __init__(self):
        """Initialize the predictor."""
        self.model = RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            random_state=42
        )
        self.scaler = StandardScaler()
        self.feature_selector = FeatureSelector()
        self.features = None
    
    def load_and_preprocess_data(self, data_path):
        """Load and preprocess the data."""
        try:
            # Load data
            df = pd.read_csv(data_path, encoding='utf-8')
            print("Data loaded successfully!")
            print("Shape of data:", df.shape)
            
            # Create a copy to avoid warnings
            df_processed = df.copy()
            
            # Filter for only Dropout and Graduate
            mask = df_processed['Target'].isin(['Dropout', 'Graduate'])
            df_processed = df_processed[mask].copy()
            
            # Feature engineering
            df_processed.loc[:, 'first_sem_success_ratio'] = (
                df_processed['Curricular units 1st sem (approved)'] / 
                df_processed['Curricular units 1st sem (enrolled)'].replace(0, 1)
            )
            
            df_processed.loc[:, 'second_sem_success_ratio'] = (
                df_processed['Curricular units 2nd sem (approved)'] / 
                df_processed['Curricular units 2nd sem (enrolled)'].replace(0, 1)
            )
            
            df_processed.loc[:, 'average_grade'] = (
                df_processed['Curricular units 1st sem (grade)'] + 
                df_processed['Curricular units 2nd sem (grade)']
            ) / 2
            
            df_processed.loc[:, 'performance_change'] = (
                df_processed['Curricular units 2nd sem (grade)'] - 
                df_processed['Curricular units 1st sem (grade)']
            )
            
            df_processed.loc[:, 'economic_factor'] = (
                df_processed['Unemployment rate'] * 
                (1 - df_processed['Scholarship holder']) * 
                (1 - df_processed['Tuition fees up to date'])
            )
            
            return df_processed
            
        except Exception as e:
            print(f"Error in data processing: {str(e)}")
            return None
    
    def prepare_features(self, df):
        """Prepare features for modeling with feature selection."""
        try:
            # Initial feature set
            initial_features = [
                'Age at enrollment',
                'Previous qualification (grade)',
                'Admission grade',
                'Curricular units 1st sem (enrolled)',
                'Curricular units 2nd sem (enrolled)',
                'first_sem_success_ratio',
                'second_sem_success_ratio',
                'average_grade',
                'performance_change',
                'economic_factor',
                'Scholarship holder',
                'Tuition fees up to date',
                'International'
            ]
            
            X = df[initial_features]
            y = df['Target']
            
            # Perform feature selection
            selected_features = self.feature_selector.select_features(X, y)
            self.features = selected_features
            
            # Use selected features
            X = df[selected_features]
            
            # Split and scale data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=y
            )
            
            X_train_scaled = self.scaler.fit_transform(X_train)
            X_test_scaled = self.scaler.transform(X_test)
            
            return X_train_scaled, X_test_scaled, y_train, y_test
            
        except Exception as e:
            print(f"Error in feature preparation: {str(e)}")
            return None
    
    def train_and_evaluate(self, X_train, y_train, X_test, y_test):
        """Train and evaluate the model."""
        try:
            # Train model
            self.model.fit(X_train, y_train)
            
            # Evaluate
            train_score = self.model.score(X_train, y_train)
            test_score = self.model.score(X_test, y_test)
            
            print("\nModel Performance:")
            print(f"Training accuracy: {train_score:.4f}")
            print(f"Testing accuracy: {test_score:.4f}")
            
            # Classification report
            y_pred = self.model.predict(X_test)
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            return True
        except Exception as e:
            print(f"Error in training/evaluation: {str(e)}")
            return False


In [61]:
def main():
    # Initialize predictor
    predictor = StudentDropoutPredictor()
    
    # Load and preprocess data
    print("Loading and preprocessing data...")
    df = predictor.load_and_preprocess_data("C:/Intro_DataScience/Week4/Academic_Success_Data.csv")
    
    if df is not None:
        # Prepare features with selection
        print("\nPreparing features...")
        result = predictor.prepare_features(df)
        
        if result is not None:
            X_train, X_test, y_train, y_test = result
            
            # Train and evaluate model
            print("\nTraining and evaluating model...")
            predictor.train_and_evaluate(X_train, y_train, X_test, y_test)
            
            # Save the model
            dump(predictor, 'student_dropout_predictor.joblib')
            print("\nModel saved successfully!")

if __name__ == "__main__":
    main()

Loading and preprocessing data...
Data loaded successfully!
Shape of data: (4424, 37)

Preparing features...

Performing feature selection...

1. Correlation Analysis:


  plt.show()


Highly correlated features to remove: ['Tuition fees up to date', 'average_grade', 'Curricular units 2nd sem (enrolled)', 'second_sem_success_ratio']

2. Mutual Information Analysis:


  plt.show()


Top features by mutual information: ['second_sem_success_ratio', 'first_sem_success_ratio', 'average_grade', 'performance_change', 'Tuition fees up to date', 'economic_factor', 'Age at enrollment', 'Scholarship holder', 'Curricular units 1st sem (enrolled)', 'Curricular units 2nd sem (enrolled)']

3. Recursive Feature Elimination:


  plt.show()


Features selected by RFE: ['Age at enrollment', 'Previous qualification (grade)', 'Admission grade', 'Curricular units 1st sem (enrolled)', 'first_sem_success_ratio', 'second_sem_success_ratio', 'average_grade', 'performance_change', 'economic_factor', 'Tuition fees up to date']

Final selected features: ['Previous qualification (grade)', 'Age at enrollment', 'Scholarship holder', 'first_sem_success_ratio', 'Curricular units 1st sem (enrolled)', 'economic_factor', 'performance_change', 'Admission grade']

Training and evaluating model...

Model Performance:
Training accuracy: 0.9273
Testing accuracy: 0.8912

Classification Report:
              precision    recall  f1-score   support

     Dropout       0.92      0.79      0.85       284
    Graduate       0.88      0.96      0.91       442

    accuracy                           0.89       726
   macro avg       0.90      0.87      0.88       726
weighted avg       0.89      0.89      0.89       726


Model saved successfully!
