# Random Forest Classifier Model
- Trained with VAK questions
- 2nd highest accuracy

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import GridSearchCV
import joblib

In [6]:
# Read the encoded response CSV into a DataFrame and preview its first rows
responses_path = 'Dataset/encoded_response_withQues.csv'
df = pd.read_csv(responses_path)
df.head()

Unnamed: 0,Gender,Level of Study,Household Income,Learning Objects [Slide presentation],Learning Objects [Book],Learning Objects [Lecture Note],Learning Objects [Educational game],Learning Objects [Video],Learning Objects [Audio-recorded lecture],Learning Objects [Animated instruction],...,28. I find it easiest to remember_Things I have done,29. I think I can tell someone is lying because_The vibes I get from them,29. I think I can tell someone is lying because_Their voice changes,29. I think I can tell someone is lying because_They avoid looking at you,30. When I'm meeting with an old friend_I give them a hug or a handshake,"30. When I'm meeting with an old friend_I say ""it's great to hear your voice!""","30. When I'm meeting with an old friend_I say ""it's great to see you!""",Preferred learning mode_Asynchronous Online Learning (On your own time),Preferred learning mode_Face to Face,Preferred learning mode_Synchronous Online Learning (Real Time)
0,2,1,2,0,0,0,0,0,0,0,...,False,False,False,True,False,True,False,False,True,False
1,2,1,2,0,0,0,0,0,0,0,...,False,False,False,True,False,True,False,False,False,True
2,2,1,2,1,0,1,0,0,0,0,...,True,True,False,False,False,False,True,False,True,False
3,2,1,3,1,1,1,0,0,0,0,...,True,True,False,False,True,False,False,False,True,False
4,2,1,3,1,1,1,0,0,0,0,...,True,True,False,False,True,False,False,True,False,False


### Define target variables

In [7]:
# Target variables: the "Learning Objects [...]" columns, one per learning object.
# The column names are listed first and then used to select the target frame from df.
learning_object_columns = [
    'Learning Objects [Slide presentation]',
    'Learning Objects [Book]',
    'Learning Objects [Lecture Note]',
    'Learning Objects [Educational game]',
    'Learning Objects [Video]',
    'Learning Objects [Audio-recorded lecture]',
    'Learning Objects [Animated instruction]',
    'Learning Objects [Real object model]',
    'Learning Objects [Mind Map]',
    'Learning Objects [Multimedia content]',
    'Learning Objects [Interactive Tool]',
    'Learning Objects [Technology-supported learning include computer-based training systems]',
    'Learning Objects [Intelligent computer-aided instruction systems]',
]
target = df[learning_object_columns]

### Split test and train data

In [8]:
# Features are every column of df that is not one of the target columns
features = df.drop(columns=target.columns)

# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

### Train model
- use GridSearchCV to find the best parameters which will give the highest accuracy
- save the model using joblib

In [9]:
# Train one RandomForestClassifier per learning-object target column.
# For each target, GridSearchCV evaluates `param_grid` with 5-fold cross-validation
# (scored on accuracy) and exposes the refitted winner as `best_estimator_`.

# The grid is the same for every target, so it is defined once outside the loop.
# Wider grid kept for reference:
#     'n_estimators': [100, 150, 200],  # Vary the number of trees
#     'max_depth': [None, 10, 20, 30],  # Vary the maximum depth of trees
param_grid = {
    'n_estimators': [200],
    'max_depth': [10]
}

# Fitted best estimator for each target column
best_estimators = {}
# Best hyperparameters for each target column, so no target's result is lost
best_params_by_target = {}

# Loop through each learning object
for col in target.columns:
    # Instantiate GridSearchCV for RandomForestClassifier
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')

    # Fit the grid search to the training data for the current learning object
    grid_search.fit(X_train, y_train[col])

    # `best_params` and `best_estimator` still hold the last target's result after the loop,
    # which later cells rely on
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Record the results for the current learning object
    best_params_by_target[col] = best_params
    best_estimators[col] = best_estimator

# NOTE: saving is disabled here, while a later cell loads "Model/rf_model.joblib" from disk;
# uncomment the line below to persist the freshly trained estimators.
# joblib.dump(best_estimators, "Model/rf_model.joblib")

### With even more parameters
- Takes quite a long time to run

In [8]:
# # Create an empty dictionary to hold the classifiers
# best_estimators = {}

# # Loop through each learning object
# for col in target.columns:
#     # Define the parameter grid for hyperparameter tuning
#     param_grid = {
#         'n_estimators': [100, 150, 200, 250],  # Expanded number of trees
#         'max_depth': [None, 10, 20, 30, 50],  # Expanded maximum depth of trees
#         'min_samples_split': [2, 5, 10],  # Minimum samples for node splitting
#         'min_samples_leaf': [1, 2, 4, 8],  # Minimum samples required at leaf nodes
#         'max_features': ['auto', 'sqrt', 'log2']  # Maximum number of features considered for splitting (NOTE: 'auto' was removed in scikit-learn 1.3; for classifiers it was the same as 'sqrt')
#         # Add other hyperparameters to tune
#     }
    
#     # Instantiate GridSearchCV for RandomForestClassifier
#     grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
    
#     # Fit the grid search to your data for the current learning object
#     grid_search.fit(X_train, y_train[col])
    
#     # Get the best parameters and best estimator for the current learning object
#     best_params = grid_search.best_params_
#     best_estimator = grid_search.best_estimator_
    
#     # Store the best estimator in the classifiers dictionary
#     best_estimators[col] = best_estimator
    
#     # Evaluate the model on the validation set
#     y_pred = best_estimator.predict(X_test)
#     accuracy = accuracy_score(y_test[col], y_pred)
#     print(f"Model Accuracy for {col}: {accuracy}")

    
# # Save the best estimators using joblib
# joblib.dump(best_estimators, "Model/rf_model_3.joblib")

In [9]:
# `best_params` is reassigned on every iteration of the training loop,
# so this shows the parameters selected for the last target column only.
report_fields = ('best_params:', best_params)
print(*report_fields)

best_params: {'max_depth': 10, 'n_estimators': 200}


best_params: {'max_depth': 10, 'n_estimators': 200}

### Check model accuracy
- use classification_report

In [2]:
# Load the saved model object from disk; the cells below iterate over it
# as a mapping of target column -> fitted classifier.
model_path = "Model/rf_model.joblib"
rf_model = joblib.load(model_path)

In [10]:
# Make predictions on the test set, one column per target.
# Columns are built in the order of y_test.columns and rows reuse X_test's index, so
# y_pred lines up with y_test regardless of the key order stored in rf_model
# (classification_report compares the two label matrices position by position).
# A target missing from rf_model raises a KeyError instead of silently shifting columns.
y_pred = pd.DataFrame(
    {col: rf_model[col].predict(X_test) for col in y_test.columns},
    index=X_test.index,
)

# Classification Report
# zero_division=0 produces the same scores as the default ("warn" also reports 0)
# but without emitting an undefined-metric warning for each affected average.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       200
           1       0.99      0.62      0.76       117
           2       0.78      1.00      0.88       254
           3       0.92      0.58      0.71       137
           4       0.81      0.84      0.82       194
           5       0.97      0.63      0.76       138
           6       0.91      0.57      0.70       143
           7       0.79      0.61      0.69       136
           8       0.99      0.59      0.74       140
           9       0.84      0.69      0.76       156
          10       0.80      0.72      0.76       170
          11       0.81      0.75      0.78       165
          12       0.84      0.62      0.71       151

   micro avg       0.84      0.71      0.77      2101
   macro avg       0.86      0.69      0.76      2101
weighted avg       0.85      0.71      0.76      2101
 samples avg       0.73      0.61      0.63      2101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


rf_model:  
micro avg       0.84      0.71      0.77      2101  
macro avg       0.86      0.69      0.76      2101   
weighted avg    0.85      0.71      0.76      2101  
samples avg     0.73      0.61      0.63      2101  

In [11]:
# Accuracy of each target column, keyed by column name
accuracy_scores = {
    label: accuracy_score(y_test[label], y_pred[label])
    for label in y_test.columns
}

# Report the per-column scores in column order
for col, accuracy in accuracy_scores.items():
    print(f"Accuracy for {col}: {accuracy}")

# Overall accuracy score: every (row, target) cell is treated as one prediction
true_flat = y_test.to_numpy().ravel()
pred_flat = y_pred.to_numpy().ravel()
overall_accuracy = accuracy_score(true_flat, pred_flat)
print(f"\nOverall Accuracy: {overall_accuracy}")

Accuracy for Learning Objects [Slide presentation]: 0.7591623036649214
Accuracy for Learning Objects [Book]: 0.8795811518324608
Accuracy for Learning Objects [Lecture Note]: 0.8141361256544503
Accuracy for Learning Objects [Educational game]: 0.8324607329842932
Accuracy for Learning Objects [Video]: 0.819371727748691
Accuracy for Learning Objects [Audio-recorded lecture]: 0.8586387434554974
Accuracy for Learning Objects [Animated instruction]: 0.8167539267015707
Accuracy for Learning Objects [Real object model]: 0.8036649214659686
Accuracy for Learning Objects [Mind Map]: 0.8455497382198953
Accuracy for Learning Objects [Multimedia content]: 0.819371727748691
Accuracy for Learning Objects [Interactive Tool]: 0.7931937172774869
Accuracy for Learning Objects [Technology-supported learning include computer-based training systems]: 0.8141361256544503
Accuracy for Learning Objects [Intelligent computer-aided instruction systems]: 0.8036649214659686

Overall Accuracy: 0.819975835682642


### Make predictions on new data

In [14]:
# Reload the saved model object before scoring new data
model_path = "Model/rf_model.joblib"
rf_model = joblib.load(model_path)

In [17]:
# Read the new samples to be scored by the loaded classifiers
new_data_path = "Streamlit/merged_withdomVAK.csv"
data = pd.read_csv(new_data_path)

In [19]:
# Run every stored classifier on the new data and collect one prediction column per target
predicted_by_target = {
    target_name: model.predict(data)
    for target_name, model in rf_model.items()
}
predictions = pd.DataFrame(predicted_by_target)
predictions

Unnamed: 0,Learning Objects [Slide presentation],Learning Objects [Book],Learning Objects [Lecture Note],Learning Objects [Educational game],Learning Objects [Video],Learning Objects [Audio-recorded lecture],Learning Objects [Animated instruction],Learning Objects [Real object model],Learning Objects [Mind Map],Learning Objects [Multimedia content],Learning Objects [Interactive Tool],Learning Objects [Technology-supported learning include computer-based training systems],Learning Objects [Intelligent computer-aided instruction systems]
0,0,0,0,0,0,0,1,1,0,1,1,1,1
1,0,0,0,0,0,0,1,1,1,1,1,1,1
