# eXtreme Gradient Boosting (XGB)
- Trained with VAK questions
- An efficient and scalable machine learning library known for its speed and performance in dealing with structured data
- Employs an ensemble learning technique called gradient boosting, which combines multiple weak predictive models (typically decision trees) sequentially to create a strong model

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier # !pip install xgboost
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import GridSearchCV
import joblib

In [5]:
# Read the encoded survey responses (questions included) into a DataFrame
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Dataset/encoded_response_withQues.csv')
df.head(5)

Unnamed: 0,Gender,Level of Study,Household Income,Learning Objects [Slide presentation],Learning Objects [Book],Learning Objects [Lecture Note],Learning Objects [Educational game],Learning Objects [Video],Learning Objects [Audio-recorded lecture],Learning Objects [Animated instruction],...,28. I find it easiest to remember_Things I have done,29. I think I can tell someone is lying because_The vibes I get from them,29. I think I can tell someone is lying because_Their voice changes,29. I think I can tell someone is lying because_They avoid looking at you,30. When I'm meeting with an old friend_I give them a hug or a handshake,"30. When I'm meeting with an old friend_I say ""it's great to hear your voice!""","30. When I'm meeting with an old friend_I say ""it's great to see you!""",Preferred learning mode_Asynchronous Online Learning (On your own time),Preferred learning mode_Face to Face,Preferred learning mode_Synchronous Online Learning (Real Time)
0,2,1,2,0,0,0,0,0,0,0,...,False,False,False,True,False,True,False,False,True,False
1,2,1,2,0,0,0,0,0,0,0,...,False,False,False,True,False,True,False,False,False,True
2,2,1,2,1,0,1,0,0,0,0,...,True,True,False,False,False,False,True,False,True,False
3,2,1,3,1,1,1,0,0,0,0,...,True,True,False,False,True,False,False,False,True,False
4,2,1,3,1,1,1,0,0,0,0,...,True,True,False,False,True,False,False,True,False,False


### Define target variables

In [6]:
# Target variables: one indicator column per learning object.
# Every target column is named 'Learning Objects [<object>]', so the full
# column names are rebuilt from the list of object labels below.
learning_object_labels = [
    'Slide presentation',
    'Book',
    'Lecture Note',
    'Educational game',
    'Video',
    'Audio-recorded lecture',
    'Animated instruction',
    'Real object model',
    'Mind Map',
    'Multimedia content',
    'Interactive Tool',
    'Technology-supported learning include computer-based training systems',
    'Intelligent computer-aided instruction systems',
]
target_columns = [f'Learning Objects [{label}]' for label in learning_object_labels]
target = df[target_columns]

### Split test and train data

In [7]:
# Features are every column that is not a target; hold out 25% of the rows
# for testing, with a fixed seed so the split is reproducible.
features = df.drop(columns=target.columns)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

### Train model
- for each learning object, use GridSearchCV (5-fold cross-validation) to find the hyperparameters that give the highest cross-validated accuracy
- save the model using joblib

In [8]:
# Hyperparameter grid searched for every target. It does not depend on the
# target column, so it is built once instead of on every loop iteration.
param_grid = {
    'n_estimators': [100, 150, 200],  # Vary the number of trees
    'max_depth': [3, 6, 9],  # Vary the maximum depth of trees
    # Other XGBoost hyperparameters that could be added to the search:
    # 'learning_rate': [0.1, 0.01, 0.001],
    # 'subsample': [0.7, 0.8, 0.9],
    # 'colsample_bytree': [0.7, 0.8, 0.9],
    # 'gamma': [0, 0.1, 0.2]
}

# Tuned classifier for each learning object, keyed by target column name
best_estimators = {}
# Winning hyperparameters for each learning object, keyed by target column name.
# Kept per target so the result of every search can be inspected afterwards,
# not only the one from the final loop iteration.
best_params_by_target = {}

# Tune and fit one binary classifier per learning object
for col in target.columns:
    # 5-fold cross-validated grid search scored on accuracy
    grid_search = GridSearchCV(XGBClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')

    # Fit the grid search on the training features for the current learning object
    grid_search.fit(X_train, y_train[col])

    # `best_params` / `best_estimator` are still assigned on every iteration so
    # that cells relying on those names keep working; after the loop they hold
    # the values for the LAST target column only.
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Record the results for the current learning object
    best_params_by_target[col] = best_params
    best_estimators[col] = best_estimator

# Save the best_estimators dictionary containing the trained models
joblib.dump(best_estimators, "Model/xgb_model.joblib")


['Model/xgb_model.joblib']

In [9]:
# NOTE: `best_params` is reassigned on every pass of the training loop, so the
# value shown here belongs to the last learning object that was tuned.
print(f"best_params: {best_params}")

best_params: {'max_depth': 6, 'n_estimators': 100}


best_params: {'max_depth': 6, 'n_estimators': 100}

### Check model accuracy
- use classification_report

In [10]:
# Reload the dictionary of per-target classifiers written by the training cell.
# joblib files are pickle-based: only load model files from a trusted source.
xgb_model = joblib.load(filename="Model/xgb_model.joblib")

In [11]:
# Make predictions on the test set: one column per learning object, each
# produced by that object's own classifier. Reusing X_test's index keeps the
# prediction rows labelled like y_test (which retains the original row labels
# after the split) instead of a fresh 0..n-1 index.
y_pred = pd.DataFrame(
    {col: classifier.predict(X_test) for col, classifier in xgb_model.items()},
    index=X_test.index,
)

# Classification Report
print("Classification Report:")
# zero_division=0 scores a metric with an empty denominator (e.g. a sample with
# no true or predicted labels in the "samples avg" row) as 0. That is the same
# value the default "warn" setting reports, but without emitting
# UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       200
           1       0.79      0.68      0.73       117
           2       0.82      0.96      0.88       254
           3       0.79      0.69      0.74       137
           4       0.84      0.85      0.84       194
           5       0.76      0.70      0.73       138
           6       0.74      0.71      0.73       143
           7       0.75      0.63      0.69       136
           8       0.78      0.69      0.73       140
           9       0.75      0.80      0.78       156
          10       0.76      0.77      0.76       170
          11       0.74      0.76      0.75       165
          12       0.73      0.75      0.74       151

   micro avg       0.78      0.77      0.78      2101
   macro avg       0.77      0.76      0.76      2101
weighted avg       0.78      0.77      0.77      2101
 samples avg       0.70      0.66      0.65      2101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


xgb_model:  
micro avg       0.78      0.77      0.78      2101  
macro avg       0.77      0.76      0.76      2101  
weighted avg    0.78      0.77      0.77      2101    
samples avg     0.70      0.66      0.65      2101    

### Make predictions on new data

In [12]:
# Load the saved per-target classifiers so this section can run on its own.
# joblib files are pickle-based: only load model files from a trusted source.
xgb_model = joblib.load(filename="Model/xgb_model.joblib")

In [13]:
# New responses to score.
# NOTE(review): presumably this file has the same feature columns the models were trained on — confirm.
data = pd.read_csv(filepath_or_buffer="Streamlit/merged_withdomVAK.csv")

In [14]:
# Run every per-target classifier on the new data and collect the outputs,
# one column per learning object.
predictions_by_target = {}
for target_name, fitted_clf in xgb_model.items():
    predictions_by_target[target_name] = fitted_clf.predict(data)

predictions = pd.DataFrame(predictions_by_target)
predictions

Unnamed: 0,Learning Objects [Slide presentation],Learning Objects [Book],Learning Objects [Lecture Note],Learning Objects [Educational game],Learning Objects [Video],Learning Objects [Audio-recorded lecture],Learning Objects [Animated instruction],Learning Objects [Real object model],Learning Objects [Mind Map],Learning Objects [Multimedia content],Learning Objects [Interactive Tool],Learning Objects [Technology-supported learning include computer-based training systems],Learning Objects [Intelligent computer-aided instruction systems]
0,0,0,0,0,0,0,1,1,1,1,1,1,1
1,0,0,0,0,0,0,1,1,1,1,1,1,1
