In [39]:
# Import libraries for data processing and machine learning models
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the edge-histogram features (semicolon-separated). The first row of the
# file is skipped and the 81 columns are numbered 0..80; column 0 is then
# relabelled 'ImageID' so it can act as the join key.
edgeHistogramDf = pd.read_csv(
    'EdgeHistogram.csv',
    delimiter=';',
    skiprows=1,
    names=range(81),
).rename(columns={0: 'ImageID'})

# Load the per-image class labels the same way, naming both columns explicitly.
imagesDf = pd.read_csv(
    'Images.csv',
    delimiter=';',
    skiprows=1,
    names=['ImageID', 'Class'],
)

# Attach each image's class label to its histogram row by joining on ImageID.
mergedDf = imagesDf.merge(edgeHistogramDf, on='ImageID')

# Target vector (y): the class of each image.
y = mergedDf['Class']

# Feature matrix (X): every column except the identifier and the label.
X = mergedDf.drop(columns=['ImageID', 'Class'])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# distribution the same in both subsets; the fixed seed makes the split repeatable.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



In [40]:
# K-Nearest Neighbours: default-parameter baseline, then a grid search
from sklearn.neighbors import KNeighborsClassifier

# Baseline: classifier with library defaults, fitted on the training split and
# used to predict the held-out test split.
basicknnmodel = KNeighborsClassifier().fit(XTrain, yTrain)
knnpredictions = basicknnmodel.predict(XTest)

# Candidate settings: 3 * 2 * 3 * 2 = 36 combinations.
paramGridKnn = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

# 3-fold cross-validated search over the grid, fitted on the training split only.
gridSearchKnn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=paramGridKnn,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
gridSearchKnn.fit(XTrain, yTrain)

# Keep the winning combination and its score, then display both as the cell output.
bestParamsKnn, bestScoreKnn = gridSearchKnn.best_params_, gridSearchKnn.best_score_
bestParamsKnn, bestScoreKnn

Fitting 3 folds for each of 36 candidates, totalling 108 fits


({'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'},
 0.4936424699221014)

In [41]:
# Gaussian Naive Bayes baseline, followed by an SVM grid search
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Naive Bayes: fit on the training split, then measure accuracy on the
# held-out test split.
nbModel = GaussianNB().fit(XTrain, yTrain)
nbPredictions = nbModel.predict(XTest)
nbTestScore = accuracy_score(yTest, nbPredictions)

# SVM candidate settings: 3 * 2 * 2 = 12 combinations.
paramGridSvm = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# 3-fold cross-validated search over the grid, fitted on the training split only.
gridSearchSvm = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=paramGridSvm,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
gridSearchSvm.fit(XTrain, yTrain)

# Note: bestScoreSvm comes from cross-validation on the training split, while
# nbTestScore is accuracy on the test split, so the two are measured differently.
bestParamsSvm, bestScoreSvm = gridSearchSvm.best_params_, gridSearchSvm.best_score_
bestParamsSvm, bestScoreSvm, nbTestScore

Fitting 3 folds for each of 12 candidates, totalling 36 fits


({'C': 10, 'gamma': 'scale', 'kernel': 'rbf'},
 0.5654133681068831,
 0.4297430289775834)

In [42]:
# Decision Tree: default-parameter baseline, then a grid search
from sklearn.tree import DecisionTreeClassifier

# Baseline: seeded tree with otherwise default settings, fitted on the training
# split and used to predict the held-out test split.
basicdtmodel = DecisionTreeClassifier(random_state=42).fit(XTrain, yTrain)
dtpredictions = basicdtmodel.predict(XTest)

# Candidate settings: 2 * 4 * 3 * 3 = 72 combinations.
paramGridDt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validated search over the grid, fitted on the training split only.
gridSearchDt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=paramGridDt,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
gridSearchDt.fit(XTrain, yTrain)

# Keep the winning combination and its score, then display both as the cell output.
bestParamsDt, bestScoreDt = gridSearchDt.best_params_, gridSearchDt.best_score_
bestParamsDt, bestScoreDt

Fitting 3 folds for each of 72 candidates, totalling 216 fits


({'criterion': 'entropy',
  'max_depth': 10,
  'min_samples_leaf': 4,
  'min_samples_split': 10},
 0.31606214886321676)

In [43]:
# Import required libraries for matrix operations and data handling
from sklearn.metrics import confusion_matrix
import numpy as np

def saveConfusionMatrix(yTrue, yPred, classLabels, groupNumber, resultNumber):
    """Compute a confusion matrix and save it as a labelled CSV file.

    Rows and columns follow the class labels in case-insensitive alphabetical
    order. The file is named ``group{groupNumber}_result{resultNumber}.csv``
    (a relative path) and contains the labels both as the header row and as
    the first column.

    Args:
        yTrue: True class labels.
        yPred: Predicted class labels, in the same order as ``yTrue``.
        classLabels: Iterable of class labels to include in the matrix.
            Repeated labels are ignored; non-string labels are ordered by
            their string form.
        groupNumber: Value placed after ``group`` in the file name.
        resultNumber: Value placed after ``result`` in the file name.

    Returns:
        The name of the CSV file that was written.
    """
    # Drop repeated labels (first occurrence wins) so that passing, say, a whole
    # label column does not produce repeated rows/columns in the saved matrix.
    uniqueLabels = list(dict.fromkeys(classLabels))
    # Sort on the case-folded string form: a bare str.casefold key raises
    # TypeError for labels that are not strings (e.g. integer class ids).
    sortedLabels = sorted(uniqueLabels, key=lambda label: str(label).casefold())
    # Build the matrix with rows/columns in exactly that label order
    cm = confusion_matrix(yTrue, yPred, labels=sortedLabels)
    # Wrap it in a DataFrame so the labels are written alongside the counts
    cmDf = pd.DataFrame(cm, index=sortedLabels, columns=sortedLabels)
    # Create a filename using the group number and result number
    cmFileName = f'group{groupNumber}_result{resultNumber}.csv'
    # Save the confusion matrix as a CSV file, keeping the label index column
    cmDf.to_csv(cmFileName, index=True)
    return cmFileName

def saveHyperparameters(hyperparameters, classifierName, library, testSize, groupNumber, resultNumber):
    """Save a classifier's description and hyperparameters as a CSV file.

    The file has two columns (name, value) and no header row. The first three
    rows are ``classifier_name``, ``library`` and ``test_size``; one row per
    hyperparameter follows. The file is named
    ``group{groupNumber}_parameters{resultNumber}.csv`` (a relative path).

    Args:
        hyperparameters: Mapping of hyperparameter name to value.
        classifierName: Value written for ``classifier_name``.
        library: Value written for ``library``.
        testSize: Value written for ``test_size``.
        groupNumber: Value placed after ``group`` in the file name.
        resultNumber: Value placed after ``parameters`` in the file name.

    Returns:
        The name of the CSV file that was written.
    """
    # Descriptive rows come first ...
    settings = {
        'classifier_name': classifierName,
        'library': library,
        'test_size': testSize,
    }
    # ... followed by the hyperparameters; one that shares a name with a row
    # above replaces that row's value instead of adding a new row.
    settings.update(hyperparameters.items())

    # One (name, value) record per setting, in insertion order
    records = [(name, setting) for name, setting in settings.items()]

    parametersFileName = f'group{groupNumber}_parameters{resultNumber}.csv'

    # Written without the index or the column header: data rows only
    pd.DataFrame(records, columns=['Parameter', 'Value']).to_csv(
        parametersFileName,
        index=False,
        header=False,
    )
    return parametersFileName

# Two separate, initially empty lists: one for the file names of saved
# confusion matrices, one for the file names of saved hyperparameters
cmFiles = []
hpFiles = []

