In [207]:
import pandas as pd

In [208]:
df = pd.read_csv('cleaned.csv')
df.columns = df.columns.str.strip()

## Target: What programme are you in?

Predict which program the student is in, based on the 4 courses and use of chatgpt

because the corr in the heatmap is high (not now though), and in reality courses that student take can indeed characterize the program they are in


In [209]:
target_col = 'What programme are you in?'
feature_cols = ['Have you taken a course on machine learning?',
                'Have you taken a course on information retrieval?',
                'Have you taken a course on statistics?',
                'Have you taken a course on databases?',
                'I have used ChatGPT to help me with some of my study assignments']

# take a look at the target column and the features
def check_value_counts(df: pd.DataFrame, cols: list[str]):
    for col in cols:
        print(df[col].value_counts(), '\n')
check_value_counts(df, [*feature_cols, target_col])

Have you taken a course on machine learning?
yes        193
no          50
unknown      2
Name: count, dtype: int64 

Have you taken a course on information retrieval?
0          120
1          101
unknown     24
Name: count, dtype: int64 

Have you taken a course on statistics?
0          170
1           46
unknown     29
Name: count, dtype: int64 

Have you taken a course on databases?
1          170
0           67
unknown      8
Name: count, dtype: int64 

I have used ChatGPT to help me with some of my study assignments
1         189
unkown     41
0          15
Name: count, dtype: int64 

What programme are you in?
computer science             114
artificial intelligence      100
computational science         17
finance                        7
other                          4
human language technology      2
unknown                        1
Name: count, dtype: int64 



Encoding the features and the target variable

In [210]:
for col in feature_cols:
    df[col] = df[col].map({'yes': 1, '1': 1,
                           'no': 0, '0': 0,
                           'unknown': 0.5, 'unkown': 0.5})
check_value_counts(df, feature_cols)

Have you taken a course on machine learning?
1.0    193
0.0     50
0.5      2
Name: count, dtype: int64 

Have you taken a course on information retrieval?
0.0    120
1.0    101
0.5     24
Name: count, dtype: int64 

Have you taken a course on statistics?
0.0    170
1.0     46
0.5     29
Name: count, dtype: int64 

Have you taken a course on databases?
1.0    170
0.0     67
0.5      8
Name: count, dtype: int64 

I have used ChatGPT to help me with some of my study assignments
1.0    189
0.5     41
0.0     15
Name: count, dtype: int64 



In [211]:
major_classes = ['computer science', 'artificial intelligence']
df[target_col] = df[target_col].apply(lambda x: x if x in major_classes else 'other')
df[target_col].value_counts()


What programme are you in?
computer science           114
artificial intelligence    100
other                       31
Name: count, dtype: int64

In [212]:

from sklearn.preprocessing import LabelEncoder
le_program = LabelEncoder()
encoded_target_col = 'encoded_program'
df[encoded_target_col] = le_program.fit_transform(df[target_col])
check_value_counts(df, [encoded_target_col])

encoded_program
1    114
0    100
2     31
Name: count, dtype: int64 



## Train-test split

In [213]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df[encoded_target_col], test_size=0.2, random_state=123)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((196, 5), (49, 5), (196,), (49,))

In [214]:
set(y_train), set(y_test)

({0, 1, 2}, {0, 1, 2})

## Apply two classification algorithms
### Random Forest

In [215]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=1,
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_rf_labels = le_program.inverse_transform(y_pred_rf)
pd.Series(y_pred_rf_labels).value_counts()

artificial intelligence    23
computer science           21
other                       5
Name: count, dtype: int64

We adjusted the hyperparameters a bit to avoid the case where no `other` class is predicted, which could also cause insufficient data for `classification_report`


In [216]:
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 Score (macro):", f1_score(y_test, y_pred_rf, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

Accuracy: 0.5102040816326531
F1 Score (macro): 0.5034761157210137

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.56      0.46        16
           1       0.71      0.62      0.67        24
           2       0.20      0.11      0.14         9

    accuracy                           0.51        49
   macro avg       0.44      0.43      0.42        49
weighted avg       0.51      0.51      0.50        49



### K-Nearest Neighbors

In [217]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
# transform back to the original labels
y_pred_knn_labels = le_program.inverse_transform(y_pred_knn)
pd.Series(y_pred_knn_labels).value_counts()

artificial intelligence    32
computer science           13
other                       4
Name: count, dtype: int64

## Performance
Metrics:
- Accuracy: overall correctness
- F1 Score (weighted): accounts for class imbalance, since class `other` is much less than the other two classes
- `classification_report` shows precision, recall, F1 and support for each class

In [218]:
def print_metrics(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score (macro):", f1_score(y_test, y_pred, average='weighted'))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

### Random Forest Performance

In [219]:
print_metrics(y_test, y_pred_rf)

Accuracy: 0.5102040816326531
F1 Score (macro): 0.5034761157210137

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.56      0.46        16
           1       0.71      0.62      0.67        24
           2       0.20      0.11      0.14         9

    accuracy                           0.51        49
   macro avg       0.44      0.43      0.42        49
weighted avg       0.51      0.51      0.50        49



### K-Nearest Neighbors Performance

In [220]:
print_metrics(y_test, y_pred_knn)

Accuracy: 0.4897959183673469
F1 Score (macro): 0.4863167720310578

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.75      0.50        16
           1       0.69      0.38      0.49        24
           2       0.75      0.33      0.46         9

    accuracy                           0.49        49
   macro avg       0.61      0.49      0.48        49
weighted avg       0.60      0.49      0.49        49



TODO: optimization of the hyperparameters, using `GridSearchCV` or manually?