# D7041E Applied artificial intelligence - Mini project
Raphael Michon

In [8]:
# Importing the needed libraries
import pandas as pd
import os
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

from sklearn.metrics import accuracy_score, f1_score

In [4]:
# Function to load the data from the folder
def load_data(file_path):
    # Read the raw data with tab delimiter and specify the index column
    data = pd.read_csv(file_path, delimiter='\t', index_col=0)
    
    # Drop any leading/trailing whitespace in column names, I found some issues in some files
    data.columns = data.columns.str.strip()

    # Ensure the 'clase' column is treated as a string and use it as the label
    data['clase'] = data['clase'].astype(str).str.strip()

    # Display the number of features
    num_features = data.shape[1] - 1
    print(f"Number of features: {num_features}")

    return data

In [6]:
# Function to split the data into the train, validation and test sets
def split_data(df, label_column, test_size=0.3, val_size=0.5, random_state=42):
    # Remove the label
    X = df.drop(columns=[label_column])
    y = df[label_column]
    
    # Split into train and temp sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # Split temp into test and validation sets
    X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=val_size, random_state=random_state)
    
    return X_train, X_test, X_val, y_train, y_test, y_val

In [10]:
# Function to open and load all the datasets
def open_files(base_dir):
    # List all folders in the base directory
    folders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]
    # Some debugging
    print("Folders found:", folders)
    num = 0

    # Dictionary to hold the data for each folder
    datasets = {}
    
    # Iterate through each folder and open the specified file
    for folder in folders:
        file_name = f"{folder}_R.dat"
        file_path = os.path.join(base_dir, folder, file_name)
        if os.path.exists(file_path):
            print(f"Opening file: {file_path}")
            datasets[folder] = load_data(file_path)
            num += 1
        else:
            print(f"File not found: {file_path}")

    print(f"Number of files opened: {num}")
    return datasets

In [12]:
base_dir = "data"
datasets = open_files(base_dir)

Folders found: ['abalone', 'acute-inflammation', 'acute-nephritis', 'adult', 'annealing', 'arrhythmia', 'audiology-std', 'balance-scale', 'balloons', 'bank', 'blood', 'breast-cancer', 'breast-cancer-wisc', 'breast-cancer-wisc-diag', 'breast-cancer-wisc-prog', 'breast-tissue', 'car', 'cardiotocography-10clases', 'cardiotocography-3clases', 'chess-krvk', 'chess-krvkp', 'congressional-voting', 'conn-bench-sonar-mines-rocks', 'conn-bench-vowel-deterding', 'contrac', 'credit-approval', 'cylinder-bands', 'dermatology', 'echocardiogram', 'ecoli', 'energy-y1', 'energy-y2', 'fertility', 'flags', 'glass', 'haberman-survival', 'hayes-roth', 'heart-cleveland', 'heart-hungarian', 'heart-switzerland', 'heart-va', 'hepatitis', 'hill-valley', 'horse-colic', 'ilpd-indian-liver', 'image-segmentation', 'ionosphere', 'iris', 'led-display', 'lenses', 'letter', 'libras', 'low-res-spect', 'lung-cancer', 'lymphography', 'magic', 'mammographic', 'molec-biol-promoter', 'molec-biol-protein-second', 'molec-biol-s

The ```File not found```happens when the data is already split in train and test sets. I don't load them.

In [15]:
# Dictionary to store accuracies and F1 scores for all datasets
results = {}

# Loop through each dataset
for folder, data in datasets.items():
    print(f"\nProcessing {folder} dataset...")
    X_train, X_test, X_val, y_train, y_test, y_val = split_data(data, label_column='clase')
    
    # Ensure labels are numeric for kmeans
    y_train = y_train.astype(float)
    y_test = y_test.astype(float)
    
    # Train and evaluate SVM
    classifier = SVC(kernel='linear', decision_function_shape="ovo")
    linear_ovr = classifier.fit(X_train, y_train)
    pred = linear_ovr.predict(X_test)
    f1 = f1_score(y_test, pred, average='weighted')
    accuracy = accuracy_score(y_test, pred)
    
    # Train and evaluate Random Forest
    rf_classifier = RandomForestClassifier(random_state=42)
    rf_classifier.fit(X_train, y_train)
    rf_pred = rf_classifier.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_pred)
    rf_f1 = f1_score(y_test, rf_pred, average='weighted')
    
    # Perform Agglomerative Clustering and map clusters to labels
    agg_cluster = AgglomerativeClustering(n_clusters=len(set(y_train)), metric='euclidean', linkage='ward')
    agg_cluster.fit(X_train)
    agg_labels = agg_cluster.fit_predict(X_test)
    agg_labels = agg_labels.astype(float)  # Ensure labels are treated as floats
    agg_accuracy = accuracy_score(y_test, agg_labels)
    agg_f1 = f1_score(y_test, agg_labels, average='weighted')
    
    # Perform K-Means Clustering
    kmeans = KMeans(n_clusters=len(set(y_train)), random_state=42, n_init=10)
    kmeans.fit(X_train)
    kmeans_labels = kmeans.predict(X_test)
    kmeans_labels = kmeans_labels.astype(float)  # Ensure labels are treated as floats
    kmeans_accuracy = accuracy_score(y_test, kmeans_labels)
    kmeans_f1 = f1_score(y_test, kmeans_labels, average='weighted')
    
    # Add results to the dictionary
    results[f"{folder} - SVM"] = {'accuracy': accuracy, 'f1_score': f1}
    results[f"{folder} - Random Forest"] = {'accuracy': rf_accuracy, 'f1_score': rf_f1}
    results[f"{folder} - Agglomerative Clustering"] = {'accuracy': agg_accuracy, 'f1_score': agg_f1}
    results[f"{folder} - K-Means Clustering"] = {'accuracy': kmeans_accuracy, 'f1_score': kmeans_f1}
    
    # Perform cross-validation for both supervised models
    cv_scores_svc = cross_val_score(classifier, X_train, y_train, cv=5)
    cv_scores_rf = cross_val_score(rf_classifier, X_train, y_train, cv=5)
    
    print('Supervised techniques:')
    print(f'{folder} SVM accuracy: {accuracy} and F1 score: {f1}')
    print(f'{folder} Random Forest accuracy: {rf_accuracy} and F1 score: {rf_f1}')
    print('Unsupervised techniques')
    print(f'{folder} Agglomerative Clustering accuracy: {agg_accuracy} and F1 score: {agg_f1}')
    print(f'{folder} K-Means Clustering accuracy: {kmeans_accuracy} and F1 score: {kmeans_f1}')
    print('Cross validation:')
    print(f'{folder} Mean SVM cross-validation score: {cv_scores_svc.mean():.2f}')
    print(f'{folder} Mean Random Forest cross-validation score: {cv_scores_rf.mean():.2f}')


Processing abalone dataset...
Supervised techniques:
abalone SVM accuracy: 0.6602870813397129 and F1 score: 0.6585778058855416
abalone Random Forest accuracy: 0.6443381180223285 and F1 score: 0.6437340110921232
Unsupervised techniques
abalone Agglomerative Clustering accuracy: 0.2822966507177033 and F1 score: 0.2655361022383284
abalone K-Means Clustering accuracy: 0.48484848484848486 and F1 score: 0.49198139870700425
Cross validation:
abalone Mean SVM cross-validation score: 0.65
abalone Mean Random Forest cross-validation score: 0.64

Processing acute-inflammation dataset...
Supervised techniques:
acute-inflammation SVM accuracy: 1.0 and F1 score: 1.0
acute-inflammation Random Forest accuracy: 1.0 and F1 score: 1.0
Unsupervised techniques
acute-inflammation Agglomerative Clustering accuracy: 0.6111111111111112 and F1 score: 0.5925925925925927
acute-inflammation K-Means Clustering accuracy: 0.3888888888888889 and F1 score: 0.3582311408398365
Cross validation:
acute-inflammation Mean S



Supervised techniques:
arrhythmia SVM accuracy: 0.6029411764705882 and F1 score: 0.5357347433355364
arrhythmia Random Forest accuracy: 0.6617647058823529 and F1 score: 0.5682503770739065
Unsupervised techniques
arrhythmia Agglomerative Clustering accuracy: 0.1323529411764706 and F1 score: 0.19117647058823528
arrhythmia K-Means Clustering accuracy: 0.23529411764705882 and F1 score: 0.2536620906511236
Cross validation:
arrhythmia Mean SVM cross-validation score: 0.66
arrhythmia Mean Random Forest cross-validation score: 0.73

Processing balance-scale dataset...
Supervised techniques:
balance-scale SVM accuracy: 0.8936170212765957 and F1 score: 0.9000751979475384
balance-scale Random Forest accuracy: 0.7978723404255319 and F1 score: 0.7791666666666668
Unsupervised techniques
balance-scale Agglomerative Clustering accuracy: 0.46808510638297873 and F1 score: 0.5183851609383524
balance-scale K-Means Clustering accuracy: 0.40425531914893614 and F1 score: 0.4440116678105696
Cross validation:
b



Supervised techniques:
ecoli SVM accuracy: 0.92 and F1 score: 0.9188957159111958
ecoli Random Forest accuracy: 0.92 and F1 score: 0.9194035087719299
Unsupervised techniques
ecoli Agglomerative Clustering accuracy: 0.0 and F1 score: 0.0
ecoli K-Means Clustering accuracy: 0.3 and F1 score: 0.32799999999999996
Cross validation:
ecoli Mean SVM cross-validation score: 0.86
ecoli Mean Random Forest cross-validation score: 0.85

Processing energy-y1 dataset...
Supervised techniques:
energy-y1 SVM accuracy: 0.8347826086956521 and F1 score: 0.8323520198913572
energy-y1 Random Forest accuracy: 0.9652173913043478 and F1 score: 0.9652173913043478
Unsupervised techniques
energy-y1 Agglomerative Clustering accuracy: 0.6521739130434783 and F1 score: 0.6484606503126507
energy-y1 K-Means Clustering accuracy: 0.2608695652173913 and F1 score: 0.2630244327148066
Cross validation:
energy-y1 Mean SVM cross-validation score: 0.86
energy-y1 Mean Random Forest cross-validation score: 0.97

Processing energy-y2



Supervised techniques:
flags SVM accuracy: 0.3793103448275862 and F1 score: 0.37710727969348656
flags Random Forest accuracy: 0.6896551724137931 and F1 score: 0.6660098522167487
Unsupervised techniques
flags Agglomerative Clustering accuracy: 0.10344827586206896 and F1 score: 0.11637931034482758
flags K-Means Clustering accuracy: 0.20689655172413793 and F1 score: 0.1853448275862069
Cross validation:
flags Mean SVM cross-validation score: 0.45
flags Mean Random Forest cross-validation score: 0.68

Processing glass dataset...
Supervised techniques:
glass SVM accuracy: 0.6875 and F1 score: 0.6510416666666667
glass Random Forest accuracy: 0.78125 and F1 score: 0.7626183712121213
Unsupervised techniques
glass Agglomerative Clustering accuracy: 0.0625 and F1 score: 0.08035714285714285
glass K-Means Clustering accuracy: 0.1875 and F1 score: 0.15808823529411764
Cross validation:
glass Mean SVM cross-validation score: 0.60
glass Mean Random Forest cross-validation score: 0.73

Processing haberm



Supervised techniques:
heart-switzerland SVM accuracy: 0.5 and F1 score: 0.4677871148459384
heart-switzerland Random Forest accuracy: 0.3333333333333333 and F1 score: 0.3151169817836485
Unsupervised techniques
heart-switzerland Agglomerative Clustering accuracy: 0.3333333333333333 and F1 score: 0.2758169934640523
heart-switzerland K-Means Clustering accuracy: 0.2777777777777778 and F1 score: 0.22954822954822957
Cross validation:
heart-switzerland Mean SVM cross-validation score: 0.37
heart-switzerland Mean Random Forest cross-validation score: 0.47

Processing heart-va dataset...
Supervised techniques:
heart-va SVM accuracy: 0.36666666666666664 and F1 score: 0.3199447895100069
heart-va Random Forest accuracy: 0.3333333333333333 and F1 score: 0.30277777777777776
Unsupervised techniques
heart-va Agglomerative Clustering accuracy: 0.26666666666666666 and F1 score: 0.29271664008506115
heart-va K-Means Clustering accuracy: 0.13333333333333333 and F1 score: 0.13489278752436648
Cross validati



Supervised techniques:
lenses SVM accuracy: 1.0 and F1 score: 1.0
lenses Random Forest accuracy: 1.0 and F1 score: 1.0
Unsupervised techniques
lenses Agglomerative Clustering accuracy: 0.75 and F1 score: 0.75
lenses K-Means Clustering accuracy: 0.0 and F1 score: 0.0
Cross validation:
lenses Mean SVM cross-validation score: 0.68
lenses Mean Random Forest cross-validation score: 0.62

Processing letter dataset...
Supervised techniques:
letter SVM accuracy: 0.8516666666666667 and F1 score: 0.8521098815605431
letter Random Forest accuracy: 0.957 and F1 score: 0.9571590508862862
Unsupervised techniques
letter Agglomerative Clustering accuracy: 0.02033333333333333 and F1 score: 0.01678212847692108
letter K-Means Clustering accuracy: 0.035 and F1 score: 0.03143078321467211
Cross validation:
letter Mean SVM cross-validation score: 0.85
letter Mean Random Forest cross-validation score: 0.95

Processing libras dataset...
Supervised techniques:
libras SVM accuracy: 0.8518518518518519 and F1 score



Supervised techniques:
low-res-spect SVM accuracy: 0.8625 and F1 score: 0.8505785643943538
low-res-spect Random Forest accuracy: 0.85 and F1 score: 0.839613300492611
Unsupervised techniques
low-res-spect Agglomerative Clustering accuracy: 0.2375 and F1 score: 0.30364906832298133
low-res-spect K-Means Clustering accuracy: 0.075 and F1 score: 0.0819078947368421
Cross validation:
low-res-spect Mean SVM cross-validation score: 0.90
low-res-spect Mean Random Forest cross-validation score: 0.89

Processing lung-cancer dataset...
Supervised techniques:
lung-cancer SVM accuracy: 0.2 and F1 score: 0.06666666666666668
lung-cancer Random Forest accuracy: 0.6 and F1 score: 0.6333333333333333
Unsupervised techniques
lung-cancer Agglomerative Clustering accuracy: 0.4 and F1 score: 0.32
lung-cancer K-Means Clustering accuracy: 0.2 and F1 score: 0.06666666666666668
Cross validation:
lung-cancer Mean SVM cross-validation score: 0.45
lung-cancer Mean Random Forest cross-validation score: 0.54

Processin



Supervised techniques:
lymphography SVM accuracy: 0.9090909090909091 and F1 score: 0.8875598086124402
lymphography Random Forest accuracy: 0.7727272727272727 and F1 score: 0.7193675889328063
Unsupervised techniques
lymphography Agglomerative Clustering accuracy: 0.0 and F1 score: 0.0
lymphography K-Means Clustering accuracy: 0.5909090909090909 and F1 score: 0.5772727272727273
Cross validation:
lymphography Mean SVM cross-validation score: 0.79
lymphography Mean Random Forest cross-validation score: 0.86

Processing magic dataset...
Supervised techniques:
magic SVM accuracy: 0.784086926042762 and F1 score: 0.776939543387801
magic Random Forest accuracy: 0.8783736417805819 and F1 score: 0.8760738578064705
Unsupervised techniques
magic Agglomerative Clustering accuracy: 0.5958640028040659 and F1 score: 0.586161257187111
magic K-Means Clustering accuracy: 0.5513494567122328 and F1 score: 0.5583896744456954
Cross validation:
magic Mean SVM cross-validation score: 0.79
magic Mean Random Fore



Supervised techniques:
post-operative SVM accuracy: 0.7692307692307693 and F1 score: 0.7357859531772575
post-operative Random Forest accuracy: 0.6923076923076923 and F1 score: 0.6923076923076923
Unsupervised techniques
post-operative Agglomerative Clustering accuracy: 0.46153846153846156 and F1 score: 0.5128205128205128
post-operative K-Means Clustering accuracy: 0.15384615384615385 and F1 score: 0.1794871794871795
Cross validation:
post-operative Mean SVM cross-validation score: 0.53
post-operative Mean Random Forest cross-validation score: 0.51

Processing primary-tumor dataset...




Supervised techniques:
primary-tumor SVM accuracy: 0.42857142857142855 and F1 score: 0.4550578673027652
primary-tumor Random Forest accuracy: 0.5306122448979592 and F1 score: 0.5254494655004859
Unsupervised techniques
primary-tumor Agglomerative Clustering accuracy: 0.10204081632653061 and F1 score: 0.1267080745341615
primary-tumor K-Means Clustering accuracy: 0.02040816326530612 and F1 score: 0.02782931354359926
Cross validation:
primary-tumor Mean SVM cross-validation score: 0.39
primary-tumor Mean Random Forest cross-validation score: 0.43

Processing ringnorm dataset...
Supervised techniques:
ringnorm SVM accuracy: 0.7720720720720721 and F1 score: 0.7698965300292001
ringnorm Random Forest accuracy: 0.9531531531531532 and F1 score: 0.9531068642098856
Unsupervised techniques
ringnorm Agglomerative Clustering accuracy: 0.31801801801801804 and F1 score: 0.2454287085948222
ringnorm K-Means Clustering accuracy: 0.7558558558558559 and F1 score: 0.7469820832323196
Cross validation:
ringnor



Supervised techniques:
zoo SVM accuracy: 0.9333333333333333 and F1 score: 0.9066666666666666
zoo Random Forest accuracy: 0.9333333333333333 and F1 score: 0.9066666666666666
Unsupervised techniques
zoo Agglomerative Clustering accuracy: 0.0 and F1 score: 0.0
zoo K-Means Clustering accuracy: 0.2 and F1 score: 0.22333333333333333
Cross validation:
zoo Mean SVM cross-validation score: 0.99
zoo Mean Random Forest cross-validation score: 0.99


In [17]:
# Calculate the average accuracy and F1 score for each model
def compute_average_metrics(results):
    avg_metrics = {
        'SVM': {'accuracy': [], 'f1_score': []},
        'Random Forest': {'accuracy': [], 'f1_score': []},
        'Agglomerative Clustering': {'accuracy': [], 'f1_score': []},
        'K-Means Clustering': {'accuracy': [], 'f1_score': []}
    }
    
    # Collect accuracy and F1 scores
    for key, scores in results.items():
        if 'SVM' in key:
            avg_metrics['SVM']['accuracy'].append(scores['accuracy'])
            avg_metrics['SVM']['f1_score'].append(scores['f1_score'])
        elif 'Random Forest' in key:
            avg_metrics['Random Forest']['accuracy'].append(scores['accuracy'])
            avg_metrics['Random Forest']['f1_score'].append(scores['f1_score'])
        elif 'Agglomerative Clustering' in key:
            avg_metrics['Agglomerative Clustering']['accuracy'].append(scores['accuracy'])
            avg_metrics['Agglomerative Clustering']['f1_score'].append(scores['f1_score'])
        elif 'K-Means Clustering' in key:
            avg_metrics['K-Means Clustering']['accuracy'].append(scores['accuracy'])
            avg_metrics['K-Means Clustering']['f1_score'].append(scores['f1_score'])
    
    # Calculate the mean accuracy and F1 score
    for model in avg_metrics:
        avg_metrics[model]['accuracy'] = sum(avg_metrics[model]['accuracy']) / len(avg_metrics[model]['accuracy'])
        avg_metrics[model]['f1_score'] = sum(avg_metrics[model]['f1_score']) / len(avg_metrics[model]['f1_score'])
    
    return avg_metrics

avg_metrics = compute_average_metrics(results)
print("Average Metrics for Each Classifier:")
for model, metrics in avg_metrics.items():
    print(f"{model} - Average Accuracy: {metrics['accuracy']:.4f}, Average F1 Score: {metrics['f1_score']:.4f}")

Average Metrics for Each Classifier:
SVM - Average Accuracy: 0.7865, Average F1 Score: 0.7701
Random Forest - Average Accuracy: 0.8193, Average F1 Score: 0.8088
Agglomerative Clustering - Average Accuracy: 0.3741, Average F1 Score: 0.3687
K-Means Clustering - Average Accuracy: 0.3403, Average F1 Score: 0.3338


The previous warnings come from the fact that the dataset contains too few data.