In [91]:
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing and Modeling
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import RandomizedSearchCV


# Evaluation
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score



In [32]:
# Read the combined dataset into a DataFrame and print its schema summary.
df = pd.read_csv(filepath_or_buffer='combined_dataset.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   resource_id              50000 non-null  int64  
 1   resource_type            50000 non-null  object 
 2   allocation_hours         50000 non-null  int64  
 3   total_resources          50000 non-null  int64  
 4   usage_frequency          50000 non-null  object 
 5   teacher_id               50000 non-null  int64  
 6   subject                  50000 non-null  object 
 7   teaching_method          50000 non-null  object 
 8   student_feedback_rating  50000 non-null  float64
 9   hours_per_week           50000 non-null  int64  
 10  student_id               50000 non-null  int64  
 11  first_name               50000 non-null  object 
 12  last_name                50000 non-null  object 
 13  gender                   50000 non-null  object 
 14  date_of_birth         

In [33]:
# Count the missing values in each column.
df.isnull().sum()

resource_id                0
resource_type              0
allocation_hours           0
total_resources            0
usage_frequency            0
teacher_id                 0
subject                    0
teaching_method            0
student_feedback_rating    0
hours_per_week             0
student_id                 0
first_name                 0
last_name                  0
gender                     0
date_of_birth              0
class                      0
admission_date             0
dtype: int64

There are no missing values in the data.

In [34]:
import warnings

# Derive the binary target column `pass_fail` (1 = pass, 0 = fail).
if 'final_exam_score' in df.columns:
    # Vectorised threshold: a score of 50 or more is a pass. Missing scores compare as
    # False and therefore map to 0, exactly as the row-wise version did.
    df['pass_fail'] = (df['final_exam_score'] >= 50).astype('int64')
else:
    # No real outcome is available, so a seeded random placeholder is generated instead.
    # Say so loudly: labels drawn at random carry no signal, and any model trained on
    # them can only score at chance level.
    warnings.warn(
        "'final_exam_score' column not found; 'pass_fail' is being filled with random "
        "placeholder labels, so downstream model scores are not meaningful.",
        UserWarning,
    )
    np.random.seed(42)
    df['pass_fail'] = np.random.randint(0, 2, size=df.shape[0])


In [35]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,resource_id,resource_type,allocation_hours,total_resources,usage_frequency,teacher_id,subject,teaching_method,student_feedback_rating,hours_per_week,student_id,first_name,last_name,gender,date_of_birth,class,admission_date,pass_fail
0,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,2021001,Chinedu,Okafor,Male,15-05-05,SSS3,01-09-19,0
1,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,2021002,Aisha,Bello,Female,22-06-05,SSS3,01-09-19,1
2,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,2021003,Thabo,Molefe,Male,10-07-05,SSS3,01-09-19,0
3,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,2021004,Fatima,Abubakar,Female,03-08-05,SSS3,01-09-19,0
4,1,Library,20,200,Daily,1,Mathematics,Interactive,4.5,20,2021005,Kofi,Mensah,Male,12-09-05,SSS3,01-09-19,0


In [36]:
# Print the schema summary (column names, non-null counts, dtypes) of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   resource_id              50000 non-null  int64  
 1   resource_type            50000 non-null  object 
 2   allocation_hours         50000 non-null  int64  
 3   total_resources          50000 non-null  int64  
 4   usage_frequency          50000 non-null  object 
 5   teacher_id               50000 non-null  int64  
 6   subject                  50000 non-null  object 
 7   teaching_method          50000 non-null  object 
 8   student_feedback_rating  50000 non-null  float64
 9   hours_per_week           50000 non-null  int64  
 10  student_id               50000 non-null  int64  
 11  first_name               50000 non-null  object 
 12  last_name                50000 non-null  object 
 13  gender                   50000 non-null  object 
 14  date_of_birth         

In [37]:
# Write the updated frame back to disk without the index column.
# NOTE(review): this overwrites the same CSV the notebook reads as its input.
df.to_csv(path_or_buf='combined_dataset.csv', index=False)

In [52]:
# Reload the saved dataset and list its column names.
df = pd.read_csv(filepath_or_buffer='combined_dataset.csv')
df.columns

Index(['resource_id', 'resource_type', 'allocation_hours', 'total_resources',
       'usage_frequency', 'teacher_id', 'subject', 'teaching_method',
       'student_feedback_rating', 'hours_per_week', 'student_id', 'first_name',
       'last_name', 'gender', 'date_of_birth', 'class', 'admission_date',
       'pass_fail'],
      dtype='object')

In [53]:
# Convert to datetime with an explicit format.
# The raw values are day-first with two-digit years (e.g. '15-05-05', '01-09-19'), so
# letting pandas guess is ambiguous and triggers a parsing warning. With the format
# pinned, a value that does not match raises instead of being silently mis-parsed.
# NOTE(review): assumes every row follows the dd-mm-yy pattern seen in the preview rows.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], format='%d-%m-%y')
df['admission_date'] = pd.to_datetime(df['admission_date'], format='%d-%m-%y')

# 'age' feature at the time of admission, in whole years (fractional part truncated)
df['age_at_admission'] = (
    (df['admission_date'] - df['date_of_birth']).dt.days / 365.25
).astype(int)

# Calculate 'years_in_school' as of the current date (assumed to be 2024-10-09),
# again truncated to whole years
current_date = pd.to_datetime('2024-10-09')
df['years_in_school'] = (
    (current_date - df['admission_date']).dt.days / 365.25
).astype(int)


  df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])


In [54]:
# Columns to drop: identifier, name and raw date columns that are not used as features
columns_to_drop = [
    'resource_id', 'student_id', 'first_name', 'last_name',
    'date_of_birth', 'admission_date', 'teacher_id'
]
# Rebind rather than mutate in place, and skip columns that are already gone, so that
# re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors='ignore')


In [55]:
# Identifying both categorical and numerical columns
# Categorical features: every column stored with the object dtype.
categorical_cols = df.select_dtypes(include=['object']).columns

# Numerical features: every numeric column except the target. Selecting on 'number'
# rather than listing 'int64'/'float64' keeps columns whose integer or float width
# differs (for example an `astype(int)` result that is int32 on some platforms).
numerical_cols = df.select_dtypes(include='number').columns.drop('pass_fail')



In [56]:
# Display the columns identified as categorical features.
categorical_cols

Index(['resource_type', 'usage_frequency', 'subject', 'teaching_method',
       'gender', 'class'],
      dtype='object')

In [57]:
# Display the columns identified as numerical features (the target is excluded).
numerical_cols

Index(['allocation_hours', 'total_resources', 'student_feedback_rating',
       'hours_per_week', 'age_at_admission', 'years_in_school'],
      dtype='object')

In [64]:
# Stand-alone encoder, scaler and classifier instances, each with default settings.
ohe, scaler, model = OneHotEncoder(), StandardScaler(), RandomForestClassifier()

In [66]:
# Separate the target column from the feature matrix.
y = df['pass_fail']
X = df.drop(columns='pass_fail')

In [71]:
# Hold out 20% of the rows for testing. The split is seeded for repeatability and
# stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [100]:
# Column-wise preprocessing: standardise the numerical columns and one-hot encode the
# categorical ones into a dense array (sparse_output=False), ignoring unknown categories.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_cols),
])
preprocessor

In [96]:
# Hyperparameter optimization
# Candidate classifiers and the search space sampled for each one. Parameter names carry
# the 'classifier__' prefix because they are routed to the pipeline step of that name.
models = {
    'RandomForest': {
        'classifier': RandomForestClassifier(random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    },
    'LogisticRegression': {
        'classifier': LogisticRegression(max_iter=1000, random_state=42),
        'params': {
            'classifier__C': np.logspace(-4, 4, 20),
            'classifier__penalty': ['l1', 'l2'],
            'classifier__solver': ['liblinear']
        }
    },
    'XGBoost Classifier': {
        'classifier': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        'params': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [3, 6, 10],
            'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3],
            'classifier__subsample': [0.7, 0.8, 0.9, 1.0]
        }
    },
    'DecisionTree': {
        'classifier': DecisionTreeClassifier(random_state=42),
        'params': {
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    },
    'NaiveBayes': {
        'classifier': GaussianNB(),
        'params': {}  # Naive Bayes doesn't require hyperparameter tuning
    }
}

best_model = None       # name of the best-scoring model seen so far
best_estimator = None   # fitted pipeline that belongs to `best_model`
best_f1_score = 0

for model_name, model_dict in models.items():
    # Preprocessing and classifier are bundled so the search fits both on each CV fold.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model_dict['classifier'])
    ])

    random_cv = RandomizedSearchCV(
        pipeline,
        param_distributions=model_dict['params'],
        n_iter=10,
        scoring='f1',  # candidates are ranked by F1 during cross-validation
        cv=3,
        random_state=42,
        n_jobs=-1
    )

    # Fitting the model with RandomizedSearchCV
    random_cv.fit(X_train, y_train)

    # Predict on the test set
    y_pred = random_cv.best_estimator_.predict(X_test)

    # Model evaluation
    # NOTE(review): the search ranks by scoring='f1' while these use average='weighted',
    # and the winner below is chosen on the held-out test set - confirm both are intended.
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    report = classification_report(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"The Model's Best Parameters: {random_cv.best_params_}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Recall Score: {recall:.4f}")
    print("Classification Report:")
    print(report)
    print("\n" + "-" * 60 + "\n")

    # Compare for the best model based on F1 score
    if f1 > best_f1_score:
        best_f1_score = f1
        best_model = model_name
        # Keep the fitted pipeline too: `random_cv` is rebound on the next iteration, so
        # after the loop it only refers to the search for the last entry in `models`.
        best_estimator = random_cv.best_estimator_

print(f"The best model is: {best_model} with an F1 score of {best_f1_score:.4f}")


Model: RandomForest
The Model's Best Parameters: {'classifier__n_estimators': 50, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 10}
F1 Score: 0.5026
Recall Score: 0.5027
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.52      0.51      5009
           1       0.50      0.49      0.49      4991

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.50      0.50      0.50     10000


------------------------------------------------------------

Model: LogisticRegression
The Model's Best Parameters: {'classifier__solver': 'liblinear', 'classifier__penalty': 'l2', 'classifier__C': 3792.690190732246}
F1 Score: 0.5015
Recall Score: 0.5016
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.51      0.51      5009
           1       0.50      0.49      0.



Model: NaiveBayes
The Model's Best Parameters: {}
F1 Score: 0.4944
Recall Score: 0.5003
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.39      0.44      5009
           1       0.50      0.61      0.55      4991

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.49     10000
weighted avg       0.50      0.50      0.49     10000


------------------------------------------------------------

The best model is: DecisionTree with an F1 score of 0.5027


In [98]:
import pickle

# `best_model` holds only the winning model's NAME (a string), so pickling it would store
# that string rather than a usable estimator. `random_cv` cannot be used either: after the
# search loop it refers to the search for the last entry in `models`. Re-run the seeded
# search for the winner to obtain its fitted pipeline.
best_search = RandomizedSearchCV(
    Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', models[best_model]['classifier'])
    ]),
    param_distributions=models[best_model]['params'],
    n_iter=10,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1
)
best_search.fit(X_train, y_train)

model_filename = 'decision_tree_model.sav'
with open(model_filename, 'wb') as model_file:
    # Serialise the fitted pipeline (preprocessing + classifier).
    pickle.dump(best_search.best_estimator_, model_file)

print(f"Best model saved as {model_filename}")

Best model saved as decision_tree_model.sav


In [99]:
import joblib

# Saving the best DecisionTree model and preprocessor
if best_model == 'DecisionTree':
    # `random_cv` was rebound on every pass of the search loop, so it now refers to the
    # search for the LAST entry in `models`, not the winner. Re-run the seeded search for
    # the winning model to recover its fitted pipeline.
    best_search = RandomizedSearchCV(
        Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', models[best_model]['classifier'])
        ]),
        param_distributions=models[best_model]['params'],
        n_iter=10,
        scoring='f1',
        cv=3,
        random_state=42,
        n_jobs=-1
    )
    best_search.fit(X_train, y_train)
    best_pipeline = best_search.best_estimator_

    # Saving the trained model (full pipeline: preprocessing + classifier)
    joblib.dump(best_pipeline, 'decision_tree_model.sav')

    # Saving the preprocessor used in the pipeline. The search fits clones of the
    # pipeline, so the module-level `preprocessor` object is never fitted itself; save
    # the fitted step from the winning pipeline instead.
    joblib.dump(best_pipeline.named_steps['preprocessor'], 'preprocessor.sav')
