# Decision Tree Classifier
- Trained with VAK questions

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
# Read the encoded survey responses into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer='Dataset/encoded_new_response_withQues.csv')
df.head()

Unnamed: 0,Gender,Level of Study,Household Income,Learning Objects [Slide presentation],Learning Objects [Book],Learning Objects [Lecture Note],Learning Objects [Educational game],Learning Objects [Video],Learning Objects [Audio-recorded lecture],Learning Objects [Animated instruction],...,"30. When I'm meeting with an old friend_I say ""it's great to see you!""",Preferred learning mode_Asynchronous Online Learning (On your own time),Preferred learning mode_Face to Face,Preferred learning mode_Synchronous Online Learning (Real Time),Preferred Communication Platform_Call,Preferred Communication Platform_Email,Preferred Communication Platform_Others,Preferred Communication Platform_Telegram,Preferred Communication Platform_University eLearning Chat Room,Preferred Communication Platform_Whatsapp
0,2,1,2,0,0,0,0,0,0,0,...,False,False,True,False,False,False,False,False,False,True
1,2,1,2,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,False,True
2,2,1,2,1,0,1,0,0,0,0,...,True,False,True,False,False,True,False,False,False,False
3,2,1,2,1,0,1,0,0,0,0,...,True,False,True,False,False,False,False,False,True,False
4,2,1,2,1,0,1,0,0,0,0,...,True,False,True,False,False,False,False,False,False,True


### Define target variables

In [3]:
# Target variables: the 13 'Learning Objects [...]' preference columns
target_columns = [
    'Learning Objects [Slide presentation]',
    'Learning Objects [Book]',
    'Learning Objects [Lecture Note]',
    'Learning Objects [Educational game]',
    'Learning Objects [Video]',
    'Learning Objects [Audio-recorded lecture]',
    'Learning Objects [Animated instruction]',
    'Learning Objects [Real object model]',
    'Learning Objects [Mind Map]',
    'Learning Objects [Multimedia content]',
    'Learning Objects [Interactive Tool]',
    'Learning Objects [Technology-supported learning include computer-based training systems]',
    'Learning Objects [Intelligent computer-aided instruction systems]',
]
target = df[target_columns]

### Split test and train data

In [4]:
# Features are every column that is not a target; 25% of the rows are held out for testing
features = df.drop(columns=target.columns)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

### Train model
- use GridSearchCV to find the best parameters which will give the highest accuracy
- save the model using joblib (currently disabled: the `joblib.dump` call in the cell below is commented out)

In [5]:
# Hyper-parameter grid shared by every per-target search. It is the same for
# all targets, so it is built once instead of on every loop iteration.
# A wider candidate grid, left commented out in the original cell:
#     'max_depth': [None, 5, 10, 15]
#     'min_samples_split': [2, 5, 10]
#     'min_samples_leaf': [1, 2, 4]
#     'max_features': ['sqrt', 'log2']
param_grid = {
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['log2'],
}

# One fitted decision tree per target column, keyed by column name.
best_estimators = {}
# Winning parameters per target column, so no target's result is overwritten.
best_params_by_target = {}

for col in target.columns:
    # 5-fold cross-validated search, scored by accuracy on this single target.
    grid_search = GridSearchCV(
        DecisionTreeClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring='accuracy'
    )
    grid_search.fit(X_train, y_train[col])

    best_estimators[col] = grid_search.best_estimator_
    best_params_by_target[col] = grid_search.best_params_

    # Kept because a later cell prints `best_params`; once the loop ends it
    # holds the parameters of the last target only.
    best_params = grid_search.best_params_

# joblib.dump(best_estimators, "Model/dt_model.joblib")

In [6]:
# `best_params` is reassigned for every target in the training loop, so this
# shows the parameters selected for the last target column.
print(f"best_params: {best_params}")

best_params: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2}


### Check model accuracy
- use classification_report

In [7]:
# Optional: reload the persisted per-target estimators from disk.
# Disabled: this notebook does not write Model/dt_model.joblib (the matching
# joblib.dump call in the training cell is commented out).
# dt_model = joblib.load("Model/dt_model.joblib")

In [8]:
# Make predictions on the test set: one column per target, each produced by
# that target's own classifier. The frame reuses X_test's index so every
# predicted row stays label-aligned with its y_test row; a default RangeIndex
# would not match the shuffled index that train_test_split hands back.
y_pred = pd.DataFrame(
    {col: classifier.predict(X_test) for col, classifier in best_estimators.items()},
    index=X_test.index,
)

# Classification Report
print("Classification Report:")
# zero_division=0 reports an undefined precision/recall (no true or predicted
# positives for a sample or label) as 0 -- the same value the default uses --
# without emitting an UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division=0))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       492
           1       0.90      0.93      0.91       281
           2       0.96      0.96      0.96       654
           3       0.94      0.91      0.92       343
           4       0.93      0.96      0.94       477
           5       0.90      0.90      0.90       327
           6       0.89      0.91      0.90       365
           7       0.93      0.91      0.92       370
           8       0.89      0.93      0.91       350
           9       0.93      0.93      0.93       428
          10       0.93      0.95      0.94       416
          11       0.92      0.95      0.93       426
          12       0.93      0.94      0.93       392

   micro avg       0.93      0.94      0.93      5321
   macro avg       0.92      0.93      0.93      5321
weighted avg       0.93      0.94      0.93      5321
 samples avg       0.84      0.84      0.83      5321



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


df_model:  
micro avg       0.70      0.72      0.71      2101  
macro avg       0.69      0.71      0.70      2101  
weighted avg    0.70      0.72      0.71      2101  
samples avg     0.64      0.63      0.61      2101  

In [9]:
# Accuracy of each target's classifier on the test set, keyed by column name
accuracy_scores = {
    col: accuracy_score(y_test[col], y_pred[col]) for col in y_test.columns
}

# Report the per-target scores in column order
for col, accuracy in accuracy_scores.items():
    print(f"Accuracy for {col}: {accuracy}")

# Overall accuracy score: every (row, target) cell is compared individually
true_cells = y_test.to_numpy().ravel()
predicted_cells = y_pred.to_numpy().ravel()
overall_accuracy = accuracy_score(true_cells, predicted_cells)
print(f"\nOverall Accuracy: {overall_accuracy}")

Accuracy for Learning Objects [Slide presentation]: 0.9493534482758621
Accuracy for Learning Objects [Book]: 0.9450431034482759
Accuracy for Learning Objects [Lecture Note]: 0.9396551724137931
Accuracy for Learning Objects [Educational game]: 0.9450431034482759
Accuracy for Learning Objects [Video]: 0.9418103448275862
Accuracy for Learning Objects [Audio-recorded lecture]: 0.9321120689655172
Accuracy for Learning Objects [Animated instruction]: 0.9170258620689655
Accuracy for Learning Objects [Real object model]: 0.9375
Accuracy for Learning Objects [Mind Map]: 0.9299568965517241
Accuracy for Learning Objects [Multimedia content]: 0.9331896551724138
Accuracy for Learning Objects [Interactive Tool]: 0.9439655172413793
Accuracy for Learning Objects [Technology-supported learning include computer-based training systems]: 0.9375
Accuracy for Learning Objects [Intelligent computer-aided instruction systems]: 0.9439655172413793

Overall Accuracy: 0.9381631299734748


### Make predictions on new data

In [13]:
# Disabled: reloading needs Model/dt_model.joblib, which this notebook does not
# write (the joblib.dump call in the training cell is commented out).
# dt_model = joblib.load("Model/dt_model.joblib")

In [14]:
# New responses to score with the trained per-target classifiers
data = pd.read_csv(filepath_or_buffer="Streamlit/merged_withdomVAK.csv")

In [15]:
# Predict every learning-object target for the new responses, one column per
# target. This uses the in-memory `best_estimators` fitted in the training
# cell: `dt_model` is never assigned in this notebook (both joblib.load calls
# are commented out), so referencing it raises NameError on a fresh
# Restart & Run All.
# NOTE(review): assumes `data` has the same feature columns, in the same order,
# as the training features -- confirm against the CSV.
predictions = pd.DataFrame({col: classifier.predict(data) for col, classifier in best_estimators.items()})
predictions.head()

Unnamed: 0,Learning Objects [Slide presentation],Learning Objects [Book],Learning Objects [Lecture Note],Learning Objects [Educational game],Learning Objects [Video],Learning Objects [Audio-recorded lecture],Learning Objects [Animated instruction],Learning Objects [Real object model],Learning Objects [Mind Map],Learning Objects [Multimedia content],Learning Objects [Interactive Tool],Learning Objects [Technology-supported learning include computer-based training systems],Learning Objects [Intelligent computer-aided instruction systems]
0,0,0,0,0,0,0,1,1,1,0,1,1,1
1,0,0,0,0,0,0,1,1,1,0,1,1,1
