In [15]:
import csv

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Input files. Both are read as ';'-separated tables with their first row
# skipped (presumably a header row -- confirm against the data files).
edge_histogram_csv_path = 'EdgeHistogram.csv'
images_csv_path = 'Images.csv'

# One row per image: its id and its class label. Column names are supplied
# here because the file's first row is skipped rather than used as a header.
images_df = pd.read_csv(
    images_csv_path,
    sep=';',
    skiprows=1,
    header=None,
    names=['id', 'class'],
)

# Edge-histogram table. Column 0 is renamed to 'id' so it can act as the join
# key; all other columns keep their integer position labels.
edge_histogram_df = pd.read_csv(
    edge_histogram_csv_path,
    sep=';',
    skiprows=1,
    header=None,
).rename(columns={0: 'id'})

# Join labels and features on 'id' (pandas' default inner join).
merged_df = images_df.merge(edge_histogram_df, on='id')


In [16]:
# Separate the feature columns from the target column.
# X: every column of merged_df except the 'id' key and the 'class' label.
X = merged_df.drop(columns=['id', 'class'])
# y: the class label of each row.
y = merged_df['class']
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [17]:
# Initialize classifiers.
# The decision tree and the random forest are seeded so that a fresh
# "Restart & Run All" produces the same fitted models, and therefore the same
# confusion matrices, every time. KNeighborsClassifier has no random_state
# parameter, so it is left as is.
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)

# Fit models on training data
knn.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)

# Make predictions on test data
knn_pred = knn.predict(X_test)
dt_pred = dt.predict(X_test)
rf_pred = rf.predict(X_test)

# Labels are taken from the full target column (not only y_test) and sorted,
# so all three matrices share one row/column order and include every class
# present in y, even one that has no rows in the test split.
class_labels = sorted(y.unique())

# Confusion matrices
knn_cm = confusion_matrix(y_test, knn_pred, labels=class_labels)
dt_cm = confusion_matrix(y_test, dt_pred, labels=class_labels)
rf_cm = confusion_matrix(y_test, rf_pred, labels=class_labels)

In [27]:

# Wrap a raw confusion matrix in a DataFrame labelled with the class names.
def cm_to_df(cm, class_labels):
    """Return ``cm`` as a DataFrame labelled on both axes with ``class_labels``.

    Args:
        cm: 2-D array-like of counts (e.g. a confusion matrix).
        class_labels: sequence of labels, used for both the row index and
            the column names; its length must match each dimension of ``cm``.

    Returns:
        pandas.DataFrame holding the values of ``cm`` with ``class_labels``
        as index and as columns.
    """
    return pd.DataFrame(data=cm, index=class_labels, columns=class_labels)

# Labelled confusion matrices, one per classifier.
knn_cm_df = cm_to_df(knn_cm, class_labels)
dt_cm_df = cm_to_df(dt_cm, class_labels)
rf_cm_df = cm_to_df(rf_cm, class_labels)

# Write all three matrices into a single workbook, one sheet per classifier.
excel_file_path = 'confusion_matrices.xlsx'
cm_sheets = {
    'KNN Confusion Matrix': knn_cm_df,
    'DT Confusion Matrix': dt_cm_df,
    'RF Confusion Matrix': rf_cm_df,
}
with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as writer:
    # dict order is insertion order, so sheets are written KNN, DT, RF.
    for cm_sheet_name, cm_frame in cm_sheets.items():
        cm_frame.to_excel(writer, sheet_name=cm_sheet_name)

In [29]:
def save_classifier_hyperparameters(classifier_name, test_size, params, file_path):
    """Write a classifier's settings to ``file_path`` as two-column CSV rows.

    The rows are, in order: ``Classifiername``, ``library`` (always
    ``sklearn``), ``test_size``, then one row per entry of ``params`` in its
    iteration order. An existing file at ``file_path`` is overwritten.

    Each key and value is formatted exactly as an f-string would format it
    (so ``None`` is written as ``None``) and then passed through
    ``csv.writer``. A field that contains a comma, a double quote or a line
    break is therefore quoted instead of spilling into extra columns; fields
    without those characters are written exactly as before.

    Args:
        classifier_name: name written in the ``Classifiername`` row.
        test_size: value written in the ``test_size`` row.
        params: mapping of hyperparameter name to value.
        file_path: path of the CSV file to create.
    """
    rows = [
        ("Classifiername", classifier_name),
        ("library", "sklearn"),
        ("test_size", test_size),
        *params.items(),
    ]
    with open(file_path, 'w') as f:
        # lineterminator='\n' keeps the line ending the hand-written version
        # produced (csv.writer would otherwise emit '\r\n').
        writer = csv.writer(f, lineterminator='\n')
        # Pre-format with f-strings: csv.writer on its own would turn None
        # into an empty field, which would change existing output.
        writer.writerows([f"{key}", f"{value}"] for key, value in rows)

# Save the reported hyperparameters of each fitted classifier.
# The values are read from the estimators themselves via get_params() instead
# of being typed in by hand, so the files always describe the models that
# produced the confusion matrices, even if a constructor argument or the
# installed sklearn defaults change.
reported_hyperparameters = [
    # (fitted estimator, hyperparameters to report, output file)
    (knn, ('n_neighbors', 'algorithm', 'leaf_size'), 'group071_parameters1.csv'),
    (dt, ('criterion', 'max_depth'), 'group071_parameters2.csv'),
    (rf, ('n_estimators', 'criterion', 'max_depth'), 'group071_parameters3.csv'),
]

for fitted_model, reported_names, parameters_path in reported_hyperparameters:
    fitted_params = fitted_model.get_params()
    save_classifier_hyperparameters(
        type(fitted_model).__name__,  # e.g. 'KNeighborsClassifier'
        0.20,  # must match the test_size passed to train_test_split
        {name: fitted_params[name] for name in reported_names},
        parameters_path,
    )

