In [100]:
import pandas as pd

In [101]:
df = pd.read_csv('cleaned.csv')
df.columns = df.columns.str.strip()

## Target: What programme are you in?

Predict which program the student is in, based on the 4 courses and use of chatgpt

because the corr in the heatmap is high (not now though), and in reality courses that student take can indeed characterize the program they are in


In [102]:
target_col = 'What programme are you in?'
feature_cols = ['Have you taken a course on machine learning?',
                'Have you taken a course on information retrieval?',
                'Have you taken a course on statistics?',
                'Have you taken a course on databases?',
                'I have used ChatGPT to help me with some of my study assignments']

# take a look at the target column and the features
def check_value_counts(df: pd.DataFrame, cols: list[str]):
    for col in cols:
        print(df[col].value_counts(), '\n')
check_value_counts(df, [*feature_cols, target_col])

Have you taken a course on machine learning?
yes        193
no          50
unknown      2
Name: count, dtype: int64 

Have you taken a course on information retrieval?
0          120
1          101
unknown     24
Name: count, dtype: int64 

Have you taken a course on statistics?
0          170
1           46
unknown     29
Name: count, dtype: int64 

Have you taken a course on databases?
1          170
0           67
unknown      8
Name: count, dtype: int64 

I have used ChatGPT to help me with some of my study assignments
1         189
unkown     41
0          15
Name: count, dtype: int64 

What programme are you in?
computer science             114
artificial intelligence      100
computational science         17
finance                        7
other                          4
human language technology      2
unknown                        1
Name: count, dtype: int64 



Encoding the features and the target variable

In [103]:
for col in feature_cols:
    df[col] = df[col].map({'yes': 1, '1': 1,
                           'no': 0, '0': 0,
                           'unknown': 0.5, 'unkown': 0.5})
check_value_counts(df, feature_cols)

Have you taken a course on machine learning?
1.0    193
0.0     50
0.5      2
Name: count, dtype: int64 

Have you taken a course on information retrieval?
0.0    120
1.0    101
0.5     24
Name: count, dtype: int64 

Have you taken a course on statistics?
0.0    170
1.0     46
0.5     29
Name: count, dtype: int64 

Have you taken a course on databases?
1.0    170
0.0     67
0.5      8
Name: count, dtype: int64 

I have used ChatGPT to help me with some of my study assignments
1.0    189
0.5     41
0.0     15
Name: count, dtype: int64 



In [104]:
major_classes = ['computer science', 'artificial intelligence']
df[target_col] = df[target_col].apply(lambda x: x if x in major_classes else 'other')
df[target_col].value_counts()


What programme are you in?
computer science           114
artificial intelligence    100
other                       31
Name: count, dtype: int64

In [105]:

from sklearn.preprocessing import LabelEncoder
le_program = LabelEncoder()
encoded_target_col = 'encoded_program'
df[encoded_target_col] = le_program.fit_transform(df[target_col])
check_value_counts(df, [encoded_target_col])

encoded_program
1    114
0    100
2     31
Name: count, dtype: int64 



## Train-test split

In [106]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df[encoded_target_col], test_size=0.2, random_state=123)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((196, 5), (49, 5), (196,), (49,))

In [107]:
set(y_train), set(y_test)

({0, 1, 2}, {0, 1, 2})

## Apply two classification algorithms
### Random Forest

In [118]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=1,
)
rf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=30,
    random_state=1
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_rf_labels = le_program.inverse_transform(y_pred_rf)
pd.Series(y_pred_rf_labels).value_counts()

computer science           23
artificial intelligence    19
other                       7
Name: count, dtype: int64

In [109]:
def print_metrics(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score (weighted):", f1_score(y_test, y_pred, average='weighted', zero_division=0))  # set `zero_division` to 0 to handle case when there are no true positives
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

In [119]:
print_metrics(y_test, y_pred_rf)

Accuracy: 0.6122448979591837
F1 Score (macro): 0.6140872154332858

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.56      0.51        16
           1       0.74      0.71      0.72        24
           2       0.57      0.44      0.50         9

    accuracy                           0.61        49
   macro avg       0.59      0.57      0.58        49
weighted avg       0.62      0.61      0.61        49



Optimize the hyperparameters, using `GridSearchCV`, could take 2 minutes

In [134]:
from sklearn.model_selection import GridSearchCV

rf_param_grid = {
    'n_estimators': list(range(10, 300, 20)),
    'max_depth': [None] + list(range(5, 30, 5)),
    'min_samples_split': list(range(1, 10, 2)),
    'min_samples_leaf': list(range(1, 5)),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
}

# These were the best params in one of our searches, but it is somehow not found after we expand the param grid
# ANSWER: OVERFITTING
# rf_param_best = {'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 30} # Acc: 0.6122448979591837, F1: 0.6140872154332858
# rf_param_grid = {key: [value] for key, value in rf_param_best.items()}
# for k, v in rf_param_best.items():
#     if v not in rf_param_grid[k]:
#         print(f"param {k} not in grid search")

rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=1, class_weight='balanced'),
    rf_param_grid,
    scoring='f1_weighted',
    cv=10,
    n_jobs=-1,
    verbose=1
)
rf_grid.fit(X_train, y_train)

print("best rf params:", rf_grid.best_params_)
y_pred_rf_optimized = rf_grid.predict(X_test)
print_metrics(y_test, y_pred_rf_optimized)

Fitting 10 folds for each of 7200 candidates, totalling 72000 fits
best rf params: {'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 9, 'n_estimators': 90}
Accuracy: 0.42857142857142855
F1 Score (macro): 0.43136474101835853

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.56      0.45        16
           1       0.65      0.46      0.54        24
           2       0.12      0.11      0.12         9

    accuracy                           0.43        49
   macro avg       0.38      0.38      0.37        49
weighted avg       0.46      0.43      0.43        49



14400 fits failed out of a total of 72000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14400 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/a13554/PycharmProjects/DMT_Assignment_1/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/a13554/PycharmProjects/DMT_Assignment_1/venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/a13554/PycharmProjects/DMT_Assignment_1/venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/a13554/PycharmProje

### K-Nearest Neighbors

In [112]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
# transform back to the original labels
y_pred_knn_labels = le_program.inverse_transform(y_pred_knn)
pd.Series(y_pred_knn_labels).value_counts()

artificial intelligence    32
computer science           13
other                       4
Name: count, dtype: int64

In [113]:
print_metrics(y_test, y_pred_knn)

Accuracy: 0.4897959183673469
F1 Score (macro): 0.4863167720310578

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.75      0.50        16
           1       0.69      0.38      0.49        24
           2       0.75      0.33      0.46         9

    accuracy                           0.49        49
   macro avg       0.61      0.49      0.48        49
weighted avg       0.60      0.49      0.49        49



Optimize the hyperparameters, using `GridSearchCV`, could take 1 minutes

In [132]:
knn_param_grid = {
    'n_neighbors': [3, 47],
    # 'n_neighbors': list(range(1, 20, 2)),
    # 'weights': ['uniform', 'distance'],
    # 'p': [1, 2],
    'leaf_size': [30, 33],
    # 'leaf_size': list(range(10, 50, 2)),
    # 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    # 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}

knn_grid = GridSearchCV(
    KNeighborsClassifier(),
    knn_param_grid,
    scoring='f1_weighted',
    cv=10,
    n_jobs=-1,
    verbose=1
)
knn_grid.fit(X_train, y_train)

print("best knn params:", knn_grid.best_params_)
print("best knn score:", knn_grid.best_score_)
y_pred_knn_optimized = knn_grid.predict(X_test)
print_metrics(y_test, y_pred_knn_optimized)

Fitting 10 folds for each of 4 candidates, totalling 40 fits
best knn params: {'leaf_size': 30, 'n_neighbors': 3}
best knn score: 0.3953934652808492
Accuracy: 0.4897959183673469
F1 Score (macro): 0.4863167720310578

Classification Report:
               precision    recall  f1-score   support

           0       0.38      0.75      0.50        16
           1       0.69      0.38      0.49        24
           2       0.75      0.33      0.46         9

    accuracy                           0.49        49
   macro avg       0.61      0.49      0.48        49
weighted avg       0.60      0.49      0.49        49



## Performance
Metrics:
- Accuracy: overall correctness
- F1 Score (weighted): accounts for class imbalance, since class `other` is much less than the other two classes
- `classification_report` shows precision, recall, F1 and support for each class

### Random Forest Performance

In [115]:
print_metrics(y_test, y_pred_rf_optimized)

Accuracy: 0.5102040816326531
F1 Score (macro): 0.5034761157210137

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.56      0.46        16
           1       0.71      0.62      0.67        24
           2       0.20      0.11      0.14         9

    accuracy                           0.51        49
   macro avg       0.44      0.43      0.42        49
weighted avg       0.51      0.51      0.50        49



### K-Nearest Neighbors Performance

In [116]:
print_metrics(y_test, y_pred_knn_optimized)

Accuracy: 0.4489795918367347
F1 Score (macro): 0.4503052503052503

Classification Report:
               precision    recall  f1-score   support

           0       0.34      0.62      0.44        16
           1       0.56      0.38      0.45        24
           2       0.75      0.33      0.46         9

    accuracy                           0.45        49
   macro avg       0.55      0.44      0.45        49
weighted avg       0.53      0.45      0.45        49



### Performance Comparison

In [117]:
results = {
    "Model": ["Random Forest", "KNN"],
    "Best Params": [rf_grid.best_params_, knn_grid.best_params_],
    "Accuracy": [accuracy_score(y_test, y_pred_rf_optimized), accuracy_score(y_test, y_pred_knn_optimized)],
    "F1 (weighted)": [f1_score(y_test, y_pred_rf_optimized, average='weighted'), f1_score(y_test, y_pred_knn_optimized, average='weighted')],
}

pd.DataFrame(results)


Unnamed: 0,Model,Best Params,Accuracy,F1 (weighted)
0,Random Forest,"{'criterion': 'gini', 'max_depth': None, 'max_...",0.510204,0.503476
1,KNN,"{'algorithm': 'auto', 'leaf_size': 9, 'metric'...",0.44898,0.450305
