# Decision Tree Classifier
- Trained with VAK qeustions

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
import joblib

In [2]:
# Load the CSV file containing responses
df = pd.read_csv('Dataset/encoded_response_withQues.csv')

### Define target variables

In [3]:
# # Target variable: Learning Objects Preference
target = df[[
    'Learning Objects [Slide presentation]',
    'Learning Objects [Book]',
    'Learning Objects [Lecture Note]',
    'Learning Objects [Educational game]',
    'Learning Objects [Video]',
    'Learning Objects [Audio-recorded lecture]',
    'Learning Objects [Animated instruction]',
    'Learning Objects [Real object model]',
    'Learning Objects [Mind Map]',
    'Learning Objects [Multimedia content]',
    'Learning Objects [Interactive Tool]',
    'Learning Objects [Technology-supported learning include computer-based training systems]',
    'Learning Objects [Intelligent computer-aided instruction systems]'
]]

### Split test and train data

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(target.columns, axis=1), target, test_size=0.25, random_state=42)

### Train model
- use GridSearchCV to find the best parameters which will give the highest accuracy
- save the model using joblib

In [5]:
best_estimators = {}

for col in target.columns:
    param_grid = {
        'max_depth': [None, 5, 10, 15],  
        'min_samples_split': [2, 5, 10],  
        'min_samples_leaf': [1, 2, 4],  
        'max_features': ['sqrt', 'log2']
    }

    grid_search = GridSearchCV(
        DecisionTreeClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring='accuracy'
    )

    grid_search.fit(X_train, y_train[col])

    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    best_estimators[col] = best_estimator

joblib.dump(best_estimators, "Model/dt_model.joblib")

['Model/dt_model.joblib']

In [6]:
print('best_params:', best_params)

best_params: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2}


### Check model accuracy
- use classification_report

In [7]:
dt_model = joblib.load("Model/dt_model.joblib")

In [10]:
# Make predictions on the test set
y_pred = pd.DataFrame({col: classifier.predict(X_test) for col, classifier in dt_model.items()})

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.68      0.70       200
           1       0.59      0.68      0.63       117
           2       0.82      0.85      0.83       254
           3       0.62      0.59      0.60       137
           4       0.76      0.76      0.76       194
           5       0.72      0.70      0.71       138
           6       0.65      0.69      0.67       143
           7       0.70      0.74      0.72       136
           8       0.71      0.74      0.72       140
           9       0.66      0.71      0.69       156
          10       0.69      0.71      0.70       170
          11       0.66      0.72      0.68       165
          12       0.64      0.70      0.67       151

   micro avg       0.70      0.72      0.71      2101
   macro avg       0.69      0.71      0.70      2101
weighted avg       0.70      0.72      0.71      2101
 samples avg       0.64      0.63      0.61      2101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


df_model:  
micro avg       0.70      0.72      0.71      2101  
macro avg       0.69      0.71      0.70      2101  
weighted avg    0.70      0.72      0.71      2101  
samples avg     0.64      0.63      0.61      2101  

### Make predictions on a new data

In [13]:
dt_model = joblib.load("Model/dt_model.joblib")

In [14]:
data = pd.read_csv("Streamlit/merged_withdomVAK.csv")

In [15]:
predictions = pd.DataFrame({col: classifier.predict(data) for col, classifier in dt_model.items()})
predictions.head()

Unnamed: 0,Learning Objects [Slide presentation],Learning Objects [Book],Learning Objects [Lecture Note],Learning Objects [Educational game],Learning Objects [Video],Learning Objects [Audio-recorded lecture],Learning Objects [Animated instruction],Learning Objects [Real object model],Learning Objects [Mind Map],Learning Objects [Multimedia content],Learning Objects [Interactive Tool],Learning Objects [Technology-supported learning include computer-based training systems],Learning Objects [Intelligent computer-aided instruction systems]
0,0,0,0,0,0,0,1,1,1,0,1,1,1
1,0,0,0,0,0,0,1,1,1,0,1,1,1
