# COMP5318 Assignment 1: Rice Classification

##### Group number: ...
##### Student 1 SID: ...
##### Student 2 SID: ...  
##### Student 3 SID: ... 
##### Student 4 SID: ... 

In [1]:
# Import all libraries
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    GridSearchCV,
    train_test_split
)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    BaggingClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier
)
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score

In [84]:
# Ignore future warnings
# Registers an 'ignore' filter for the FutureWarning category so that such
# warnings raised by later cells do not clutter the printed results.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [98]:
# Load the rice dataset: rice-final2.csv
# na_values="?" tells pandas to read "?" cells as NaN (missing values).
data = pd.read_csv("rice-final2.csv", na_values="?")
# data = pd.read_csv("test-before.csv", na_values="?")

In [99]:
# Print first ten rows of pre-processed dataset to 4 decimal places as per assignment spec
# A function is provided to assist

def print_data(X, y, n_rows=10):
    """Print the first rows of a dataset as comma-separated lines.

    Every printed line contains the features of one example, each formatted to
    4 decimal places and followed by a comma, and then the example's target.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples)
        n_rows: number of rows to print; capped at len(X), so asking for more
            rows than the dataset holds prints every row instead of raising
            an IndexError
    """
    # Never index past the end of the data.
    rows_to_print = min(n_rows, len(X))

    for example_num in range(rows_to_print):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")

        # The final example of the whole dataset is printed without a trailing
        # newline; every other example ends its line normally.
        if example_num == len(X) - 1:
            print(y[example_num], end="")
        else:
            print(y[example_num])


In [100]:
# Pre-process dataset

# 1. Separate features and labels: the label is the last column, everything
#    before it is a feature
raw_features = data.iloc[:, :-1].values
raw_labels = data.iloc[:, -1].values

# 2. Handle missing values: replace each missing value with its column mean
imputer = SimpleImputer(strategy='mean')
imputed_features = imputer.fit_transform(raw_features)

# 3. Normalize features: scale every feature into the range [0, 1]
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(imputed_features)

# 4. Encode class labels: "class1" -> 0, any other label -> 1
y = np.where(raw_labels == "class1", 0, 1)

# 5. Print the first ten pre-processed rows as per assignment spec
print_data(X, y, n_rows=10)

0.4628,0.5406,0.5113,0.4803,0.7380,0.4699,0.1196,1
0.4900,0.5547,0.5266,0.5018,0.7319,0.4926,0.8030,1
0.6109,0.6847,0.6707,0.5409,0.8032,0.6253,0.1185,0
0.6466,0.6930,0.6677,0.5961,0.7601,0.6467,0.2669,0
0.6712,0.6233,0.4755,0.8293,0.3721,0.6803,0.4211,1
0.2634,0.2932,0.2414,0.4127,0.5521,0.2752,0.2825,1
0.8175,0.9501,0.9515,0.5925,0.9245,0.8162,0.0000,0
0.3174,0.3588,0.3601,0.3908,0.6921,0.3261,0.8510,1
0.3130,0.3050,0.2150,0.5189,0.3974,0.3159,0.4570,1
0.5120,0.5237,0.4409,0.6235,0.5460,0.5111,0.3155,1


In [101]:
# --- Combine processed features and labels ---
# X was built positionally from every column except the last one, so the
# feature names are taken the same way. Dropping a column by name instead would
# fail (or misalign the header) whenever the label column is not called 'class'.
feature_names = data.columns[:-1]
processed_df = pd.DataFrame(X, columns=feature_names)
processed_df['class'] = y  # add processed (0/1 encoded) class labels

# --- Save pre-processed dataset ---
# index=False keeps the pandas row index out of the CSV file.
processed_df.to_csv("rice-final2-preprocessed.csv", index=False)
print("Pre-processed dataset saved as rice-final2-preprocessed.csv")

Pre-processed dataset saved as rice-final2-preprocessed.csv


### Part 1: Cross-validation without parameter tuning

In [89]:
# 10-fold stratified cross-validation splitter (shuffled, fixed random_state)
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# The stratified folds from cvKFold should be provided to the classifiers

In [90]:
# Logistic Regression
def logregClassifier(X, y):
    """Return the mean cross-validation score of a logistic regression model.

    The model (random_state=0, max_iter=1000) is evaluated with
    cross_val_score on the folds of the module-level cvKFold splitter, and the
    per-fold scores are averaged.
    """
    model = LogisticRegression(max_iter=1000, random_state=0)
    fold_scores = cross_val_score(model, X, y, cv=cvKFold)
    return fold_scores.mean()

In [91]:
#Naïve Bayes
def nbClassifier(X, y):
    """Return the mean cross-validation score of a Gaussian Naive Bayes model.

    The per-fold scores come from cross_val_score using the module-level
    cvKFold splitter.
    """
    return cross_val_score(GaussianNB(), X, y, cv=cvKFold).mean()

In [92]:
# Decision Tree
def dtClassifier(X, y):
    """Return the mean cross-validation score of an entropy-based decision tree.

    The tree uses criterion='entropy' and random_state=0; scoring is done by
    cross_val_score on the folds of the module-level cvKFold splitter.
    """
    tree = DecisionTreeClassifier(random_state=0, criterion='entropy')
    per_fold = cross_val_score(tree, X, y, cv=cvKFold)
    return per_fold.mean()

In [93]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    """Return the mean cross-validation score of bagged decision trees.

    Arguments:
        X, y: features and targets passed to cross_val_score
        n_estimators: forwarded to BaggingClassifier
        max_samples: forwarded to BaggingClassifier
        max_depth: depth limit of each entropy-based base tree
    """
    tree = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=max_depth,
        random_state=0,
    )
    ensemble = BaggingClassifier(
        estimator=tree,
        n_estimators=n_estimators,
        max_samples=max_samples,
        random_state=0,
    )
    return cross_val_score(ensemble, X, y, cv=cvKFold).mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    """Return the mean cross-validation score of AdaBoost over decision trees.

    Arguments:
        X, y: features and targets passed to cross_val_score
        n_estimators: forwarded to AdaBoostClassifier
        learning_rate: forwarded to AdaBoostClassifier
        max_depth: depth limit of each entropy-based base tree
    """
    tree = DecisionTreeClassifier(
        criterion='entropy',
        max_depth=max_depth,
        random_state=0,
    )
    booster = AdaBoostClassifier(
        estimator=tree,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    return cross_val_score(booster, X, y, cv=cvKFold).mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    """Return the mean cross-validation score of a gradient boosting model.

    Arguments:
        X, y: features and targets passed to cross_val_score
        n_estimators: forwarded to GradientBoostingClassifier
        learning_rate: forwarded to GradientBoostingClassifier
    """
    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=0,
    )
    per_fold = cross_val_score(booster, X, y, cv=cvKFold)
    return per_fold.mean()

### Part 1: Results

In [94]:
# Parameters for Part 1:

# Bagging
bag_n_estimators, bag_max_samples, bag_max_depth = 50, 100, 5

# AdaBoost
ada_n_estimators, ada_learning_rate, ada_bag_max_depth = 50, 0.5, 5

# Gradient Boosting
gb_n_estimators, gb_learning_rate = 50, 0.5

# Print results for each classifier in part 1 to 4 decimal places here.
# Each entry pairs an output template with a deferred call, so a classifier is
# only evaluated right before its own line is printed.
part1_runs = [
    ("LogR average cross-validation accuracy: {:.4f}",
     lambda: logregClassifier(X, y)),
    ("NB average cross-validation accuracy: {:.4f}",
     lambda: nbClassifier(X, y)),
    ("DT average cross-validation accuracy: {:.4f}",
     lambda: dtClassifier(X, y)),
    ("Bagging average cross-validation accuracy: {:.4f}",
     lambda: bagDTClassifier(X, y, bag_n_estimators, bag_max_samples, bag_max_depth)),
    ("AdaBoost average cross-validation accuracy: {:.4f}",
     lambda: adaDTClassifier(X, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)),
    ("GB average cross-validation accuracy: {:.4f}",
     lambda: gbClassifier(X, y, gb_n_estimators, gb_learning_rate)),
]
for template, evaluate in part1_runs:
    print(template.format(evaluate()))

LogR average cross-validation accuracy: 0.6700
NB average cross-validation accuracy: 0.6555
DT average cross-validation accuracy: 0.7750
Bagging average cross-validation accuracy: 0.7514
AdaBoost average cross-validation accuracy: 0.7224
GB average cross-validation accuracy: 0.7464


### Part 2: Cross-validation with parameter tuning

In [95]:
# KNN
# Candidate values that bestKNNClassifier hands to GridSearchCV:
# k supplies the "n_neighbors" grid entry
k = [1, 3, 5, 7]
# p supplies the "p" grid entry (per the scikit-learn docs, the Minkowski
# power parameter: 1 = Manhattan distance, 2 = Euclidean distance)
p = [1, 2]


def bestKNNClassifier(X, y):
    """Grid-search a k-nearest-neighbours classifier and score it on held-out data.

    The data is split with test_size=0.3, stratified on y, random_state=0.
    GridSearchCV then searches the module-level lists k ("n_neighbors") and
    p ("p") on the training part, using the folds of cvKFold.

    Returns:
        (best_params, best_cv_score, test_accuracy): the search's best_params_,
        its best_score_, and the accuracy_score of the search's predictions on
        the held-out part.
    """
    split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
    X_train, X_test, y_train, y_test = split

    search = GridSearchCV(
        KNeighborsClassifier(),
        {"n_neighbors": k, "p": p},
        cv=cvKFold,
    )
    search.fit(X_train, y_train)

    held_out_predictions = search.predict(X_test)
    return (
        search.best_params_,
        search.best_score_,
        accuracy_score(y_test, held_out_predictions),
    )

In [96]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to 'sqrt'.
# Candidate values that bestRFClassifier hands to GridSearchCV under the
# "n_estimators" and "max_leaf_nodes" grid entries:
n_estimators = [10, 30, 60, 100]
max_leaf_nodes = [6, 12]

def bestRFClassifier(X, y):
    """Grid-search a random forest and score it on held-out data.

    The data is split with test_size=0.3, stratified on y, random_state=0.
    GridSearchCV then searches the module-level lists n_estimators and
    max_leaf_nodes on the training part, using the folds of cvKFold. The forest
    itself uses criterion="entropy", max_features="sqrt" and random_state=0.

    Returns:
        (best_params, best_cv_score, test_accuracy, f1_macro, f1_weighted):
        the search's best_params_ and best_score_, followed by accuracy, macro
        F1 and weighted F1 of the search's predictions on the held-out part.
    """
    split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=0)
    X_train, X_test, y_train, y_test = split

    forest = RandomForestClassifier(
        criterion="entropy", max_features="sqrt", random_state=0
    )
    search_space = {
        "n_estimators": n_estimators,
        "max_leaf_nodes": max_leaf_nodes,
    }
    search = GridSearchCV(forest, search_space, cv=cvKFold)
    search.fit(X_train, y_train)

    held_out_predictions = search.predict(X_test)
    return (
        search.best_params_,
        search.best_score_,
        accuracy_score(y_test, held_out_predictions),
        f1_score(y_test, held_out_predictions, average="macro"),
        f1_score(y_test, held_out_predictions, average="weighted"),
    )

### Part 2: Results

In [97]:
# Perform Grid Search with 10-fold stratified cross-validation (GridSearchCV in sklearn).
cvKFold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)
# The stratified folds from cvKFold should be provided to GridSearchCV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "k", "p", "n_estimators" and "max_leaf_nodes" which should be printed as integers.

# KNN
knn_params, knn_cv_acc, knn_test_acc = bestKNNClassifier(X, y)
knn_report = [
    ("KNN best k: {}", knn_params["n_neighbors"]),
    ("KNN best p: {}", knn_params["p"]),
    ("KNN cross-validation accuracy: {:.4f}", knn_cv_acc),
    ("KNN test set accuracy: {:.4f}", knn_test_acc),
]
for template, value in knn_report:
    print(template.format(value))

print()

# RF
rf_params, rf_cv_acc, rf_test_acc, rf_f1_macro, rf_f1_weighted = bestRFClassifier(X, y)
rf_report = [
    ("RF best n_estimators: {}", rf_params["n_estimators"]),
    ("RF best max_leaf_nodes: {}", rf_params["max_leaf_nodes"]),
    ("RF cross-validation accuracy: {:.4f}", rf_cv_acc),
    ("RF test set accuracy: {:.4f}", rf_test_acc),
    ("RF test set macro average F1: {:.4f}", rf_f1_macro),
    ("RF test set weighted average F1: {:.4f}", rf_f1_weighted),
]
for template, value in rf_report:
    print(template.format(value))

KNN best k: 7
KNN best p: 1
KNN cross-validation accuracy: 0.7038
KNN test set accuracy: 0.6349

RF best n_estimators: 100
RF best max_leaf_nodes: 6
RF cross-validation accuracy: 0.8071
RF test set accuracy: 0.6667
RF test set macro average F1: 0.6437
RF test set weighted average F1: 0.6566
