In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

## Load data

In [39]:
# Read the dataset
df = pd.read_csv('Datasets/latest_responses_encoded.csv')

df.head()

Unnamed: 0,Gender,Level of Study,6. Online Instructional Strategies/Assessment [Demonstration],6. Online Instructional Strategies/Assessment [Digital Lab Experiments],6. Online Instructional Strategies/Assessment [Forum],6. Online Instructional Strategies/Assessment [Case Study],6. Online Instructional Strategies/Assessment [Concept Mapping],6. Online Instructional Strategies/Assessment [Real Time Online Exam],6. Online Instructional Strategies/Assessment [Individual Project/Assignment],6. Online Instructional Strategies/Assessment [Group Project/Assignment],...,"27. If I am very angry_I stomp about, slam doors and throw things",28. I find it easiest to remember_Faces,28. I find it easiest to remember_Names,28. I find it easiest to remember_Things I have done,29. I think I can tell someone is lying because_The vibes I get from them,29. I think I can tell someone is lying because_Their voice changes,29. I think I can tell someone is lying because_They avoid looking at you,30. When I'm meeting with an old friend_I give them a hug or a handshake,"30. When I'm meeting with an old friend_I say ""it's great to hear your voice!""","30. When I'm meeting with an old friend_I say ""it's great to see you!"""
0,0,3,0,0,0,0,0,0,0,0,...,True,True,False,False,False,False,True,False,True,False
1,0,3,1,1,0,0,1,0,1,1,...,False,False,False,True,True,False,False,False,False,True
2,0,3,1,1,0,0,1,0,1,1,...,False,False,False,True,True,False,False,False,False,True
3,0,3,1,1,0,0,1,0,1,1,...,False,False,False,True,True,False,False,False,False,True
4,0,3,1,1,0,0,1,0,1,1,...,False,False,False,True,True,False,False,False,False,True


## Extract target variables

In [40]:
# Extract online assessment columns as target
target_columns = df.iloc[:, 2:15]
target_columns.columns

Index(['6. Online Instructional Strategies/Assessment [Demonstration]',
       '6. Online Instructional Strategies/Assessment [Digital Lab Experiments]',
       '6. Online Instructional Strategies/Assessment [Forum]',
       '6. Online Instructional Strategies/Assessment [Case Study]',
       '6. Online Instructional Strategies/Assessment [Concept Mapping]',
       '6. Online Instructional Strategies/Assessment [Real Time Online Exam]',
       '6. Online Instructional Strategies/Assessment [Individual Project/Assignment]',
       '6. Online Instructional Strategies/Assessment [Group Project/Assignment]',
       '6. Online Instructional Strategies/Assessment [Online Quiz/Test - MCQ]',
       '6. Online Instructional Strategies/Assessment [Online Quiz/Test - Essay]',
       '6. Online Instructional Strategies/Assessment [Online Quiz/Test - Open Book]',
       '6. Online Instructional Strategies/Assessment [Peer Review Assessment Live Presentation]',
       '6. Online Instructional Strate

## Split data

In [41]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(target_columns.columns, axis=1), target_columns, 
                                                    test_size=0.2, random_state=42)

## Train the model

### Without hyperparameter search

In [42]:
# Initialise an empty dictionary to store the trained model
model_dict = {}

# Train a SVM
for column in target_columns.columns:
    model = SVC(kernel='rbf', random_state=42)
    model.fit(X_train, y_train[column])
    model_dict[column] = model

## Evaluate the model

In [43]:
# Make predictions on the test set
y_pred = pd.DataFrame({col: model.predict(X_test) for col, model in model_dict.items()})

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       268
           1       1.00      0.35      0.51       124
           2       1.00      0.53      0.69       106
           3       0.94      0.48      0.63       130
           4       0.96      0.40      0.57       131
           5       1.00      0.36      0.53       120
           6       0.92      0.72      0.81       188
           7       0.92      0.62      0.74       186
           8       0.88      0.80      0.84       247
           9       1.00      0.49      0.66       136
          10       0.84      0.83      0.84       258
          11       0.96      0.37      0.53       139
          12       0.90      0.76      0.82       211

   micro avg       0.90      0.64      0.75      2244
   macro avg       0.93      0.58      0.69      2244
weighted avg       0.92      0.64      0.73      2244
 samples avg       0.69      0.53      0.57      2244



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Save the model

In [47]:
joblib.dump(model_dict, 'Model/svm.joblib')

['Model/svm.joblib']

### With hyperparameter search

In [49]:
# from sklearn.model_selection import GridSearchCV

# # Create a dictionary to hold the best estimators after grid search
# best_estimators = {}

# # Iterate through each learning object
# for col in target_columns.columns:
#     # Define the parameter grid
#     param_grid = {
#         'C': [0.5, 1, 1.5],  # Expanded range of C values
#         'gamma': [0.1, 0.01, 0.001],  # More options for gamma
#         'kernel': ['rbf', 'sigma']  # Diverse kernel options
#     }
    
#     # Instantiate GridSearchCV
#     grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy')
    
#     # Fit the grid search to your data for the current learning object
#     grid_search.fit(X_train, y_train[col])
    
#     # Get the best parameters and best estimator for the current learning object
#     best_params = grid_search.best_params_
#     best_estimator = grid_search.best_estimator_
    
#     # Save the best estimator (SVM model) to a file
# #     joblib.dump(best_estimator, f"svm_model_{col}.joblib")
    
#     # Store the best estimator in the dictionary for later use if needed
#     best_estimators[col] = best_estimator
    
#     # Evaluate the model on the validation set
#     y_pred = best_estimator.predict(X_test)
#     accuracy = accuracy_score(y_test[col], y_pred)
#     print(f"Model Accuracy for {col}: {accuracy}")
    

In [48]:
# # Make predictions on the test set
# y_pred = pd.DataFrame({col: model.predict(X_test) for col, model in best_estimators.items()})

# # Classification Report
# print("Classification Report:")
# print(classification_report(y_test, y_pred))

## Make predictions on new data

In [44]:
test = pd.read_csv('Datasets/test_data.csv')

In [45]:
predictions = pd.DataFrame({col: model.predict(test) for col, model in model_dict.items()})

predictions.head()

Unnamed: 0,6. Online Instructional Strategies/Assessment [Demonstration],6. Online Instructional Strategies/Assessment [Digital Lab Experiments],6. Online Instructional Strategies/Assessment [Forum],6. Online Instructional Strategies/Assessment [Case Study],6. Online Instructional Strategies/Assessment [Concept Mapping],6. Online Instructional Strategies/Assessment [Real Time Online Exam],6. Online Instructional Strategies/Assessment [Individual Project/Assignment],6. Online Instructional Strategies/Assessment [Group Project/Assignment],6. Online Instructional Strategies/Assessment [Online Quiz/Test - MCQ],6. Online Instructional Strategies/Assessment [Online Quiz/Test - Essay],6. Online Instructional Strategies/Assessment [Online Quiz/Test - Open Book],6. Online Instructional Strategies/Assessment [Peer Review Assessment Live Presentation],6. Online Instructional Strategies/Assessment [Recorded Presentation]
0,1,0,0,0,0,1,1,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,1,0,1
2,1,1,0,0,1,0,1,1,1,1,1,0,0


In [21]:
# predictions.to_csv('Datasets/prediction.csv', index=False)