In [2]:
# Import required library
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

In [3]:
# Load in csv with labels
file_path = "data/train.csv"
df = pd.read_csv(file_path, usecols=[1,2], header=0, names=["uid", "class"])

print(df)

           uid  class
0    ZYURRE527      4
1    ZWNWBP435      0
2    ZVHEZA963      4
3    ZSFNU1100      4
4    ZRXUB1049      0
..         ...    ...
422  AGHXWX765      0
423  AFEOPC672      3
424  AEEEIG737      3
425  ADQRPH513      3
426  ABNTSS552      4

[427 rows x 2 columns]


In [4]:
# Explore class distribution
class_dist = df["class"].value_counts()
print(class_dist)

# Calculate class frequencies
class_freq = (np.bincount(df["class"])) / len(df) * 100
print(class_freq)

0    159
4    109
3     78
1     54
2     27
Name: class, dtype: int64
[37.23653396 12.64637002  6.32318501 18.26697892 25.52693208]


In [5]:
def get_features(uid):
    # Set file path using uid
    file_path = f"data/img_details/{uid}.csv"
    
    # Open csv
    csv_df = pd.read_csv(file_path, header=None)

    # Return as a Python list
    return csv_df.iloc[0].values.tolist()

In [6]:
# Get img features in a single df
features = df[["uid"]].apply(lambda row: get_features(row[0]), axis=1, result_type="expand")

In [7]:
# Get number of cols
num_features = len(features.columns)

# Set column names (features): f0000, f0001, f0002, ... f1024
col_names = [f"f{index:04d}" for index in range(num_features)]

# Rename columns to identify features
features.columns=col_names

# Create final df with all img features
features_df = pd.concat([df, features], axis=1)

display(features_df)

Unnamed: 0,uid,class,f0000,f0001,f0002,f0003,f0004,f0005,f0006,f0007,...,f1014,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023
0,ZYURRE527,4,0.000462,0.005583,-0.001031,0.002307,-0.113097,-0.284965,0.001069,-0.000092,...,0.680631,-1.153061,0.111816,0.162622,-1.085265,-0.657002,-1.406191,2.240085,0.118616,-0.728013
1,ZWNWBP435,0,0.000220,0.006780,-0.000547,0.002183,-0.045820,-0.216762,0.000987,-0.001331,...,-1.241972,-0.115316,-0.411191,0.431461,0.442649,1.243681,-0.151721,0.458508,1.931918,-0.241081
2,ZVHEZA963,4,0.000405,0.007183,-0.000137,0.002612,-0.083430,-0.292385,0.001094,-0.000112,...,0.659314,-0.792833,-0.471358,0.514799,-0.846220,0.479314,-0.730218,1.352716,0.040223,-0.163302
3,ZSFNU1100,4,0.000388,0.003802,0.002121,0.001513,-0.109248,-0.183284,0.000813,-0.001447,...,-0.047666,-0.201043,-0.565545,0.999009,-0.332314,-0.066972,-1.263785,3.876905,-0.397950,-0.693763
4,ZRXUB1049,0,0.000425,0.006544,0.001630,0.001549,-0.068301,-0.283487,0.001004,-0.001800,...,-1.221178,-0.253239,-0.046740,0.242367,-0.379724,-0.893249,-0.957397,1.118245,0.181925,-0.024197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,AGHXWX765,0,0.000305,0.003671,-0.004093,0.003010,-0.093583,0.133018,0.000627,0.001443,...,-0.260746,-0.741712,-0.887129,0.190525,0.216271,0.490549,-1.047399,1.875185,0.345561,-0.874318
423,AFEOPC672,3,0.000441,0.006178,-0.000811,0.003572,-0.108863,-0.302020,0.000761,0.001851,...,0.457373,-0.782917,-1.072765,1.180279,-0.111142,1.897755,-0.902370,0.552967,-0.314270,-1.198762
424,AEEEIG737,3,0.000464,0.006611,0.000842,0.001412,-0.152744,-0.355706,0.000906,-0.001229,...,0.411773,0.232481,-0.527885,-0.305296,-0.189008,-0.592684,-1.144780,3.459698,-0.199579,-0.999165
425,ADQRPH513,3,0.000233,0.003029,0.001606,0.001224,-0.092386,-0.434045,0.000668,-0.000410,...,-0.147889,1.168724,-0.486698,1.134707,-0.029372,0.092189,-0.791921,1.786787,2.089036,-0.690614


- Use interpretability / feature importance packages from Azure machine learning
- Use linear discriminant analysis to convert multi-dimensional feature space into 2d
- Plot boxplots

In [8]:
# Make box plots
def make_boxplots(feature_name, output_folder="plots/boxplot"):
    # Create plot
    plt.boxplot([
        features_df[feature_name].loc[features_df["class"]==0], 
        features_df[feature_name].loc[features_df["class"]==1],
        features_df[feature_name].loc[features_df["class"]==2], 
        features_df[feature_name].loc[features_df["class"]==3],
        features_df[feature_name].loc[features_df["class"]==4]
        ])
    plt.title(feature_name)
    plt.xlabel("Class")
    plt.ylabel("Feature Value")

    # Save figure
    plt.savefig(f"{output_folder}/{feature_name}.png")
    plt.close()

In [9]:
# Loop over each feature and create boxplot
for feature in col_names:
    make_boxplots(feature)

In [8]:
# Use a stratified split
# Use sklearn train_test_split function
X_train, X_test, y_train, y_test = train_test_split(
    features_df[col_names], 
    features_df["class"],
    test_size=0.30, random_state=10, stratify=features_df["class"]
)

# Get number of samples from each class in train and test subsets
train_count = y_train.value_counts()
print(f"Training subset:\n{train_count}")

test_count = y_test.value_counts()
print(f"Testing subset:\n{test_count}")

Training subset:
0    111
4     76
3     54
1     38
2     19
Name: class, dtype: int64
Testing subset:
0    48
4    33
3    24
1    16
2     8
Name: class, dtype: int64


In [9]:
# Create linear SVM classifier
svm_clf = SVC(kernel="linear", C=1)

# Train the SVM model using the clf
svm_model = svm_clf.fit(X_train, y_train)

# Calculate accuracy
y_hat = svm_model.predict(X_test)
acc = np.average(y_hat == y_test)
print(f"Accuracy for linear SVM is: {acc:.3f}")

# Create RBF SVM classifier
svm_clf_rbf = SVC(kernel="rbf", C=0.5, gamma=1, probability=True)

# Train the SVM model using the clf
svm_model_rbf = svm_clf_rbf.fit(X_train, y_train)

# Calculate accuracy
y_hat_rbf = svm_model_rbf.predict(X_test)
acc_rbf = np.average(y_hat_rbf == y_test)
print(f"Accuracy for RBF SVM is: {acc_rbf:.3f}")

Accuracy for linear SVM is: 0.527
Accuracy for RBF SVM is: 0.372


In [11]:
# Try training classifier with cross-fold validation
# Use cross_validate from sklearn to obtain accuracies for 5-fold cross validation
cv_results = cross_validate(svm_clf, X_train, y_train, cv=15,
    scoring='accuracy')

# Print accuracy for each fold
print(f"The accuracy for fold 1 is: {cv_results['test_score'][0]:.3f}")
print(f"The accuracy for fold 2 is: {cv_results['test_score'][1]:.3f}")
print(f"The accuracy for fold 3 is: {cv_results['test_score'][2]:.3f}")
print(f"The accuracy for fold 4 is: {cv_results['test_score'][3]:.3f}")
print(f"The accuracy for fold 5 is: {cv_results['test_score'][4]:.3f}")

# Print average accuracy across all five folds
svm_avg = np.average(cv_results['test_score'])
print(f"The averge accuracy across all five folds is: {svm_avg:.3f}")

# Print standard deviation across five accuracy measurements
svm_std = np.std(cv_results['test_score'])
print(f"The standard deviation across the five accuracy measurements is: {svm_std:.3f}")

The accuracy for fold 1 is: 0.450
The accuracy for fold 2 is: 0.450
The accuracy for fold 3 is: 0.400
The accuracy for fold 4 is: 0.700
The accuracy for fold 5 is: 0.350
The averge accuracy across all five folds is: 0.452
The standard deviation across the five accuracy measurements is: 0.135
