In [1]:
import warnings

import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing and Modeling
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score



In [6]:
# Read the merged dataset from disk and print its schema
# (column names, non-null counts and dtypes).
DATASET_PATH = 'combined_dataset.csv'
df = pd.read_csv(DATASET_PATH)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50500 entries, 0 to 50499
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   resource_id              50500 non-null  int64  
 1   resource_type            50500 non-null  object 
 2   allocation_hours         50500 non-null  int64  
 3   total_resources          50500 non-null  int64  
 4   usage_frequency          50500 non-null  object 
 5   teacher_id               50500 non-null  int64  
 6   subject                  50500 non-null  object 
 7   teaching_method          50500 non-null  object 
 8   student_feedback_rating  50500 non-null  float64
 9   hours_per_week_x         50500 non-null  int64  
 10  student_id               50500 non-null  int64  
 11  first_name               50500 non-null  object 
 12  last_name                50500 non-null  object 
 13  gender                   50500 non-null  object 
 14  date_of_birth         

In [7]:
# Count the missing values in each column.
df.isna().sum()

resource_id                    0
resource_type                  0
allocation_hours               0
total_resources                0
usage_frequency                0
teacher_id                     0
subject                        0
teaching_method                0
student_feedback_rating        0
hours_per_week_x               0
student_id                     0
first_name                     0
last_name                      0
gender                         0
date_of_birth                  0
class                          0
admission_date                 0
parent_id                      0
parent_name                    0
relationship                   0
income_level                   0
income (monthly)               0
education_level                0
occupation                     0
activity_id                    0
activity_name                  0
participation_level            0
hours_per_week_y               0
Unnamed: 5                 25500
material_id                25500
material_t

Unnamed: 0,resource_id,resource_type,allocation_hours,total_resources,usage_frequency,teacher_id,subject,teaching_method,student_feedback_rating,hours_per_week_x,...,occupation,activity_id,activity_name,participation_level,hours_per_week_y,Unnamed: 5,material_id,material_type,access_level,frequency_of_use


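Most columns have no missing values. However, the output above shows 25,500 missing entries in each of `Unnamed: 5` and `material_id` (the listing is cut off after that, so later columns may be affected too), so the data is not completely free of missing values.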

In [8]:
# Build the binary target `pass_fail` (1 = pass, 0 = fail).
PASS_MARK = 50  # minimum final exam score that counts as a pass

if 'final_exam_score' in df.columns:
    # Vectorized comparison. A missing score compares False and therefore maps
    # to 0, exactly as the element-wise lambda did.
    df['pass_fail'] = (df['final_exam_score'] >= PASS_MARK).astype('int64')
else:
    # No real outcome column is available, so placeholder labels are generated
    # to let the rest of the notebook run. They carry no signal: say so loudly
    # instead of fabricating a target silently.
    # NOTE(review): the column listing printed later in this notebook contains
    # no 'final_exam_score', so this branch appears to be the one that ran --
    # that would explain model scores hovering around 0.50. Confirm and source
    # a real outcome column before drawing conclusions.
    warnings.warn(
        "'final_exam_score' not found: 'pass_fail' is filled with random "
        "placeholder labels, so models trained on it can only perform at "
        "chance level."
    )
    # A private RandomState(42) yields the same draws as
    # np.random.seed(42) + np.random.randint, without reseeding the global
    # NumPy generator as a side effect.
    placeholder_rng = np.random.RandomState(42)
    df['pass_fail'] = placeholder_rng.randint(0, 2, size=df.shape[0])


In [9]:
# Preview the first rows to confirm the new `pass_fail` column is present.
df.head()

Unnamed: 0,resource_id,resource_type,allocation_hours,total_resources,usage_frequency,teacher_id,subject,teaching_method,student_feedback_rating,hours_per_week_x,...,activity_id,activity_name,participation_level,hours_per_week_y,Unnamed: 5,material_id,material_type,access_level,frequency_of_use,pass_fail
0,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,...,1,Football,High,5,,1.0,Textbook,High,Daily,0
1,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,...,2,Debate Club,Medium,3,,2.0,Online Resource,Medium,Weekly,1
2,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,...,3,Music Band,Low,2,,3.0,Library Access,Low,Monthly,0
3,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,...,4,Drama Club,Medium,4,,4.0,Textbook,High,Daily,0
4,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,...,5,Art Club,High,3,,5.0,Online Resource,Medium,Weekly,0


In [10]:
# Re-inspect the schema after adding `pass_fail`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50500 entries, 0 to 50499
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   resource_id              50500 non-null  int64  
 1   resource_type            50500 non-null  object 
 2   allocation_hours         50500 non-null  int64  
 3   total_resources          50500 non-null  int64  
 4   usage_frequency          50500 non-null  object 
 5   teacher_id               50500 non-null  int64  
 6   subject                  50500 non-null  object 
 7   teaching_method          50500 non-null  object 
 8   student_feedback_rating  50500 non-null  float64
 9   hours_per_week_x         50500 non-null  int64  
 10  student_id               50500 non-null  int64  
 11  first_name               50500 non-null  object 
 12  last_name                50500 non-null  object 
 13  gender                   50500 non-null  object 
 14  date_of_birth         

In [11]:
# Persist the frame (now including `pass_fail`) to disk without the index.
# NOTE(review): this writes to the same path the data was originally read
# from, so the input file is overwritten -- consider a separate output name.
df.to_csv('combined_dataset.csv', index=False) # saving the updated data to a dataset

In [19]:
# Reload the saved dataset and list its columns.
df = pd.read_csv('combined_dataset.csv')
df.columns

Index(['resource_id', 'resource_type', 'allocation_hours', 'total_resources',
       'usage_frequency', 'teacher_id', 'subject', 'teaching_method',
       'student_feedback_rating', 'hours_per_week_x', 'student_id',
       'first_name', 'last_name', 'gender', 'date_of_birth', 'class',
       'admission_date', 'parent_id', 'parent_name', 'relationship',
       'income_level', 'income (monthly)', 'education_level', 'occupation',
       'activity_id', 'activity_name', 'participation_level',
       'hours_per_week_y', 'Unnamed: 5', 'material_id', 'material_type',
       'access_level', 'frequency_of_use', 'pass_fail'],
      dtype='object')

In [20]:
# Derive two whole-year features from the date columns.
DAYS_PER_YEAR = 365.25  # average year length, leap years included

# Convert to datetime
# NOTE(review): the recorded output shows pandas emitted a warning on the
# date_of_birth parse; passing an explicit `format=` would likely silence it --
# confirm the date format used in the file first.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df['admission_date'] = pd.to_datetime(df['admission_date'])

# 'age' feature at the time of admission, truncated to whole years.
# The cast is pinned to int64: a plain `int` can resolve to a 32-bit integer on
# some platform / NumPy combinations, and the later
# select_dtypes(include=['int64', 'float64']) would then silently skip it.
age_in_days = (df['admission_date'] - df['date_of_birth']).dt.days
df['age_at_admission'] = (age_in_days / DAYS_PER_YEAR).astype('int64')

# Calculate 'years_in_school' as of the current date (assumed to be 2024-10-09)
current_date = pd.to_datetime('2024-10-09')
days_enrolled = (current_date - df['admission_date']).dt.days
df['years_in_school'] = (days_enrolled / DAYS_PER_YEAR).astype('int64')


  df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])


In [21]:
# Columns to drop: row/entity identifiers, student names, the raw date columns
# (already converted into age_at_admission / years_in_school) and the
# 'Unnamed: 5' column.
# NOTE(review): `parent_name` is not in this list, so it stays in the frame and
# is later one-hot encoded as a categorical feature -- consider dropping it
# alongside first_name / last_name.
columns_to_drop = [
    'resource_id', 'student_id', 'first_name', 'last_name',
    'date_of_birth', 'admission_date', 'teacher_id', 'Unnamed: 5',
    'material_id', 'activity_id','parent_id'
]
# Re-bind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')


In [22]:
# Identifying both categorical and numerical columns
# NOTE(review): the recorded output lists `hours_per_week_y` among the object
# columns, so it is treated as categorical -- check it for non-numeric entries.
categorical_cols = df.select_dtypes(include=['object']).columns

# Select every numeric dtype rather than only int64/float64: a column stored
# with another width (e.g. int32) would otherwise land in neither list and be
# discarded by the ColumnTransformer. The target is excluded from the features.
numerical_cols = df.select_dtypes(include='number').columns.drop('pass_fail')



In [23]:
# Display the columns that will be treated as categorical.
categorical_cols

Index(['resource_type', 'usage_frequency', 'subject', 'teaching_method',
       'gender', 'class', 'parent_name', 'relationship', 'income_level',
       'education_level', 'occupation', 'activity_name', 'participation_level',
       'hours_per_week_y', 'material_type', 'access_level',
       'frequency_of_use'],
      dtype='object')

In [24]:
# Display the columns that will be treated as numerical.
numerical_cols

Index(['allocation_hours', 'total_resources', 'student_feedback_rating',
       'hours_per_week_x', 'income (monthly)', 'age_at_admission',
       'years_in_school'],
      dtype='object')

In [25]:
# Stand-alone encoder, scaler and default random forest instances.
# NOTE(review): the preprocessing and tuning cells visible in this notebook
# build their own instances and do not reference these three names.
ohe, scaler, model = OneHotEncoder(), StandardScaler(), RandomForestClassifier()

In [26]:
# Separate the target vector from the feature matrix.
TARGET_COL = 'pass_fail'
y = df[TARGET_COL]
X = df.drop(columns=TARGET_COL)

In [27]:
# Hold out 20% of the rows for testing; the split is stratified on the target
# and uses a fixed seed so it is reproducible.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [30]:
# Numerical branch: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode (categories unseen during fitting are ignored at transform time).
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
    ]
)

preprocessor

In [33]:
# Hyperparameter optimization
# Candidate estimators and their search spaces. Parameter names carry the
# 'classifier__' prefix because each estimator becomes the 'classifier' step of
# the pipeline built in the loop below.
models = {
    'RandomForest': {
        'classifier': RandomForestClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    },
    'LogisticRegression': {
        'classifier': LogisticRegression(max_iter=1000, random_state=42),
        'params': {
            'classifier__C': np.logspace(-4, 4, 20),
            'classifier__penalty': ['l1', 'l2'],
            'classifier__solver': ['liblinear']
        }
    },
    'XGBoost Classifier': {
        'classifier': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [3, 6, 10],
            'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3],
            'classifier__subsample': [0.7, 0.8, 0.9, 1.0]
        }
    },
    'DecisionTree': {
        'classifier': DecisionTreeClassifier(random_state=42),
        'params': {
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    },
    'NaiveBayes': {
        'classifier': GaussianNB(),
        'params': {}  # Naive Bayes doesn't require hyperparameter tuning
    }
}

N_SEARCH_ITER = 10  # upper bound on the configurations sampled per model

# Initialize tracking variables for both F1 and Recall scores
best_model_f1 = None
best_f1_score = 0
best_model_recall = None
best_recall_score = 0

# Dictionary to store all results for final comparison
results = {}

for model_name, model_dict in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model_dict['classifier'])
    ])

    # Never request more draws than the grid contains. An empty grid (Naive
    # Bayes) has exactly one configuration (np.prod of an empty list is 1), and
    # asking for 10 makes scikit-learn warn and fall back to a single fit.
    grid_size = int(np.prod([len(values) for values in model_dict['params'].values()]))

    random_cv = RandomizedSearchCV(
        pipeline,
        param_distributions=model_dict['params'],
        n_iter=min(N_SEARCH_ITER, grid_size),
        scoring='f1',  # Keep F1 as primary optimization metric
        cv=3,
        random_state=42,
        n_jobs=-1
    )

    # Fitting the model with RandomizedSearchCV
    random_cv.fit(X_train, y_train)

    # Predict on the test set
    y_pred = random_cv.best_estimator_.predict(X_test)

    # Model evaluation
    # NOTE(review): the search optimizes binary F1 (positive class = 1) while
    # the scores below are support-weighted averages over both classes.
    # Weighted recall is mathematically equal to accuracy, so it does not
    # measure how many at-risk students are caught -- consider
    # recall_score(..., pos_label=<at-risk class>) if that is the goal.
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    report = classification_report(y_test, y_pred)

    # Store results. The fitted pipeline and its cross-validated F1 are kept
    # too, so later cells can reuse the tuned model instead of refitting it.
    results[model_name] = {
        'f1_score': f1,
        'recall_score': recall,
        'best_params': random_cv.best_params_,
        'report': report,
        'cv_f1': random_cv.best_score_,
        'best_estimator': random_cv.best_estimator_
    }

    print(f"Model: {model_name}")
    print(f"Best Parameters: {random_cv.best_params_}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Recall Score: {recall:.4f}")
    print("Classification Report:")
    print(report)
    print("\n" + "-" * 60 + "\n")

    # NOTE(review): the "best" model is chosen on test-set scores, which makes
    # the reported test performance optimistic; results[...]['cv_f1'] offers a
    # selection criterion that leaves the test set untouched.

    # Track best model based on F1 score
    if f1 > best_f1_score:
        best_f1_score = f1
        best_model_f1 = model_name

    # Track best model based on Recall score
    if recall > best_recall_score:
        best_recall_score = recall
        best_model_recall = model_name


Model: RandomForest
Best Parameters: {'classifier__n_estimators': 100, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 2, 'classifier__max_depth': None}
F1 Score: 0.5032
Recall Score: 0.5033
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.52      0.51      5060
           1       0.50      0.49      0.50      5040

    accuracy                           0.50     10100
   macro avg       0.50      0.50      0.50     10100
weighted avg       0.50      0.50      0.50     10100


------------------------------------------------------------

Model: LogisticRegression
Best Parameters: {'classifier__solver': 'liblinear', 'classifier__penalty': 'l1', 'classifier__C': 0.03359818286283781}
F1 Score: 0.4966
Recall Score: 0.5001
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.58      0.54      5060
           1       0.50      0.42      0.45      5040

    a



Model: NaiveBayes
Best Parameters: {}
F1 Score: 0.4930
Recall Score: 0.4933
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.51      0.50      5060
           1       0.49      0.47      0.48      5040

    accuracy                           0.49     10100
   macro avg       0.49      0.49      0.49     10100
weighted avg       0.49      0.49      0.49     10100


------------------------------------------------------------



In [36]:
# final model comparison
def _print_ranking(metric_key, label):
    """Print the models in `results` ordered by one metric, best first.

    Parameters
    ----------
    metric_key : str
        Key of the score inside each `results` entry (e.g. 'f1_score').
    label : str
        Short metric name shown in the printed lines (e.g. 'F1').

    Returns
    -------
    list of (str, dict)
        The (model name, scores) pairs in ranked order.
    """
    ranking = sorted(results.items(), key=lambda item: item[1][metric_key], reverse=True)
    # Local names only: the earlier loop variable `model` overwrote the
    # estimator that a previous cell bound to the same name.
    for position, (name, scores) in enumerate(ranking, 1):
        print(f"{position}. {name}: {label} = {scores[metric_key]:.4f}")
    return ranking


print("Final Model Comparison:")
print("\nModel Rankings by F1 Score:")
f1_rankings = _print_ranking('f1_score', 'F1')

print("\nModel Rankings by Recall Score:")
recall_rankings = _print_ranking('recall_score', 'Recall')

print(f"\nBest model by F1 Score: {best_model_f1} (F1 = {best_f1_score:.4f})")
print(f"Best model by Recall Score: {best_model_recall} (Recall = {best_recall_score:.4f})")



Final Model Comparison:

Model Rankings by F1 Score:
1. DecisionTree: F1 = 0.5068
2. RandomForest: F1 = 0.5032
3. XGBoost Classifier: F1 = 0.5021
4. LogisticRegression: F1 = 0.4966
5. NaiveBayes: F1 = 0.4930

Model Rankings by Recall Score:
1. DecisionTree: Recall = 0.5083
2. RandomForest: Recall = 0.5033
3. XGBoost Classifier: Recall = 0.5021
4. LogisticRegression: Recall = 0.5001
5. NaiveBayes: Recall = 0.4933

Best model by F1 Score: DecisionTree (F1 = 0.5068)
Best model by Recall Score: DecisionTree (Recall = 0.5083)



# Why we decided to compare our models based on F1 score

When building a model to predict pass or fail, the choice of classification metrics depends on the specific goals of the model and on the consequences of false positives and false negatives.


If the cost of failing to identify at-risk students is high (for example, they may not receive the interventions they need), prioritize Recall and the F1 Score.

If the cost of falsely identifying students as at risk is high, prioritize Precision and the F1 Score.

If your dataset is balanced, you may start with Accuracy but still consider F1 Score for a more nuanced view.

For this task of predicting whether students will pass or fail, focusing on Recall and the F1 Score may be the best choice, as it is essential to identify as many at-risk students as possible without raising too many false alarms.

In [46]:
import joblib

# Re-run the Decision Tree search and persist the winning pipeline to disk.
# NOTE(review): this repeats the search already performed in the tuning loop;
# the settings are identical (same grid, seed and folds).

# NOTE(review): placeholder that is never assigned a real value in the cells
# visible here.
decision_tree_results = None

# Look the entry up directly instead of scanning every model for one key; a
# missing entry now raises KeyError instead of silently saving nothing.
model_name = 'DecisionTree'
model_dict = models[model_name]

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model_dict['classifier'])
])
random_cv = RandomizedSearchCV(
    pipeline,
    param_distributions=model_dict['params'],
    n_iter=10,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1
)
random_cv.fit(X_train, y_train)

# Save the best Decision Tree model (the full preprocessing + classifier pipeline)
best_decision_tree = random_cv.best_estimator_
joblib.dump(best_decision_tree, 'best_decision_tree_model.sav')
print("Decision Tree model has been saved successfully!")

# Print the best parameters for reference
print("Best parameters:", random_cv.best_params_)

Decision Tree model has been saved successfully!
Best parameters: {'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 4, 'classifier__max_depth': None}


In [47]:
# Reload the persisted pipeline and check that it can score the held-out rows.
# joblib files are pickle-based, so only load model files from a trusted source.
MODEL_FILE = 'best_decision_tree_model.sav'
loaded_model = joblib.load(MODEL_FILE)

predictions = loaded_model.predict(X_test)
predictions

array([1, 1, 0, ..., 0, 1, 0])

# The Next Steps

Having completed the rigorous development of the predictive model and tested it on making predictions, I will now use the saved model to build an interactive user interface app with the Streamlit package.