In [280]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel

# Sklearn ML
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [281]:
# Initialize variable for first test/train split
test_percent = 0.25

# Use same random seed to ensure same results across runs
rand_seed = 100

In [282]:
# Load the data
filepath = "data/train.csv"
class_df = pd.read_csv(
    filepath, usecols=[1, 2], header=0, names=["uid", "class"]
)

display(class_df)

Unnamed: 0,uid,class
0,ZYURRE527,4
1,ZWNWBP435,0
2,ZVHEZA963,4
3,ZSFNU1100,4
4,ZRXUB1049,0
...,...,...
422,AGHXWX765,0
423,AFEOPC672,3
424,AEEEIG737,3
425,ADQRPH513,3


In [283]:
# Load features from individual CSVs into a single dataframe
def get_features(uid):
    feature_filepath = f"data/img_details/{uid}.csv"
    feature_df = pd.read_csv(feature_filepath, header=None)
    return feature_df.iloc[0].values.tolist()


features_df = class_df[["uid"]].apply(
    lambda row: get_features(row[0]), axis=1, result_type="expand"
)
display(features_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.000462,0.005583,-0.001031,0.002307,-0.113097,-0.284965,0.001069,-0.000092,-0.271864,0.000503,...,0.680631,-1.153061,0.111816,0.162622,-1.085265,-0.657002,-1.406191,2.240085,0.118616,-0.728013
1,0.000220,0.006780,-0.000547,0.002183,-0.045820,-0.216762,0.000987,-0.001331,-0.465898,0.000515,...,-1.241972,-0.115316,-0.411191,0.431461,0.442649,1.243681,-0.151721,0.458508,1.931918,-0.241081
2,0.000405,0.007183,-0.000137,0.002612,-0.083430,-0.292385,0.001094,-0.000112,-0.236576,0.000466,...,0.659314,-0.792833,-0.471358,0.514799,-0.846220,0.479314,-0.730218,1.352716,0.040223,-0.163302
3,0.000388,0.003802,0.002121,0.001513,-0.109248,-0.183284,0.000813,-0.001447,-0.066267,0.000654,...,-0.047666,-0.201043,-0.565545,0.999009,-0.332314,-0.066972,-1.263785,3.876905,-0.397950,-0.693763
4,0.000425,0.006544,0.001630,0.001549,-0.068301,-0.283487,0.001004,-0.001800,-0.251112,0.000428,...,-1.221178,-0.253239,-0.046740,0.242367,-0.379724,-0.893249,-0.957397,1.118245,0.181925,-0.024197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,0.000305,0.003671,-0.004093,0.003010,-0.093583,0.133018,0.000627,0.001443,-0.367352,0.000462,...,-0.260746,-0.741712,-0.887129,0.190525,0.216271,0.490549,-1.047399,1.875185,0.345561,-0.874318
423,0.000441,0.006178,-0.000811,0.003572,-0.108863,-0.302020,0.000761,0.001851,-0.197981,0.000310,...,0.457373,-0.782917,-1.072765,1.180279,-0.111142,1.897755,-0.902370,0.552967,-0.314270,-1.198762
424,0.000464,0.006611,0.000842,0.001412,-0.152744,-0.355706,0.000906,-0.001229,-0.320724,0.000493,...,0.411773,0.232481,-0.527885,-0.305296,-0.189008,-0.592684,-1.144780,3.459698,-0.199579,-0.999165
425,0.000233,0.003029,0.001606,0.001224,-0.092386,-0.434045,0.000668,-0.000410,-0.228858,0.000444,...,-0.147889,1.168724,-0.486698,1.134707,-0.029372,0.092189,-0.791921,1.786787,2.089036,-0.690614


In [284]:
# Merge dataframes and fix column names
num_features = features_df.shape[1]
feature_names = [f"f{i}" for i in range(num_features)]

features_df.columns = feature_names

df = pd.concat([class_df, features_df], axis=1)
display(df)

Unnamed: 0,uid,class,f0,f1,f2,f3,f4,f5,f6,f7,...,f1014,f1015,f1016,f1017,f1018,f1019,f1020,f1021,f1022,f1023
0,ZYURRE527,4,0.000462,0.005583,-0.001031,0.002307,-0.113097,-0.284965,0.001069,-0.000092,...,0.680631,-1.153061,0.111816,0.162622,-1.085265,-0.657002,-1.406191,2.240085,0.118616,-0.728013
1,ZWNWBP435,0,0.000220,0.006780,-0.000547,0.002183,-0.045820,-0.216762,0.000987,-0.001331,...,-1.241972,-0.115316,-0.411191,0.431461,0.442649,1.243681,-0.151721,0.458508,1.931918,-0.241081
2,ZVHEZA963,4,0.000405,0.007183,-0.000137,0.002612,-0.083430,-0.292385,0.001094,-0.000112,...,0.659314,-0.792833,-0.471358,0.514799,-0.846220,0.479314,-0.730218,1.352716,0.040223,-0.163302
3,ZSFNU1100,4,0.000388,0.003802,0.002121,0.001513,-0.109248,-0.183284,0.000813,-0.001447,...,-0.047666,-0.201043,-0.565545,0.999009,-0.332314,-0.066972,-1.263785,3.876905,-0.397950,-0.693763
4,ZRXUB1049,0,0.000425,0.006544,0.001630,0.001549,-0.068301,-0.283487,0.001004,-0.001800,...,-1.221178,-0.253239,-0.046740,0.242367,-0.379724,-0.893249,-0.957397,1.118245,0.181925,-0.024197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,AGHXWX765,0,0.000305,0.003671,-0.004093,0.003010,-0.093583,0.133018,0.000627,0.001443,...,-0.260746,-0.741712,-0.887129,0.190525,0.216271,0.490549,-1.047399,1.875185,0.345561,-0.874318
423,AFEOPC672,3,0.000441,0.006178,-0.000811,0.003572,-0.108863,-0.302020,0.000761,0.001851,...,0.457373,-0.782917,-1.072765,1.180279,-0.111142,1.897755,-0.902370,0.552967,-0.314270,-1.198762
424,AEEEIG737,3,0.000464,0.006611,0.000842,0.001412,-0.152744,-0.355706,0.000906,-0.001229,...,0.411773,0.232481,-0.527885,-0.305296,-0.189008,-0.592684,-1.144780,3.459698,-0.199579,-0.999165
425,ADQRPH513,3,0.000233,0.003029,0.001606,0.001224,-0.092386,-0.434045,0.000668,-0.000410,...,-0.147889,1.168724,-0.486698,1.134707,-0.029372,0.092189,-0.791921,1.786787,2.089036,-0.690614


### Split Data into Train/Test Sets

In [285]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_all = df[feature_names]
y_all = df["class"]

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=test_percent, random_state=rand_seed, stratify=y_all
)

# Reset X_train index
y_train.reset_index(inplace=True, drop=True)
X_train.reset_index(inplace=True, drop=True)

print("Training set by class:")
display(y_train.value_counts())
print("Test set by class:")
display(y_test.value_counts())

Training set by class:


0    119
4     82
3     58
1     41
2     20
Name: class, dtype: int64

Test set by class:


0    40
4    27
3    20
1    13
2     7
Name: class, dtype: int64

### Standardize Data

Use StandardScaler from sklearn. Standardize both X_train and X_test data.

In [286]:
# Create function that will standardize the dataset
def norm_data(df_to_scale):
    # Setup scaler
    scaler_std = StandardScaler()
    #scaler_abs = MaxAbsScaler()
    #scaler_minmax = MinMaxScaler()
    
    # Apply scaling to training data
    scaled_df = scaler_std.fit_transform(df_to_scale)
    
    return scaled_df

In [287]:
# Test norm_data function
X_train = norm_data(X_train)
X_test = norm_data(X_test)

In [288]:
# # Get single training DataFrame
# norm_x_df = pd.DataFrame(X_train, columns=feature_names)
# norm_df = pd.concat([y_train, norm_x_df], axis=1)
# # Standardize X_train values
# X_train = norm_data(norm_df[feature_names])

# # Get single testing DataFrame
# norm_xtest_df = pd.DataFrame(X_test, columns=feature_names)
# norm_test_df = pd.concat([y_test, norm_x_df], axis=1)
# # Standardize X_test values (for later)
# X_test = norm_data(norm_test_df[feature_names])

### Remove Outliers

Since we are in multi-dimensional space, we will use the mean and covariance matrices. This will be computed using Mahalanobis distance which is well-suited for multi-dimensional space: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.mahalanobis.html.

In [289]:
# Create function that computes mean, cov matrix, and inv cov matrix
def get_mean_cov(X_train):
    # Merge dfs
    norm_x_df = pd.DataFrame(X_train, columns=feature_names)
    norm_df = pd.concat([y_train, norm_x_df], axis=1)
    # Compute mean and cov per class per feature
    avg_list = []
    cov_list = []
    inv_cov_list = []
    for i in range(5):
        # Compute mean
        avg = np.mean(norm_df[norm_df["class"]==i][feature_names], axis=0)
        avg_list.append(avg)
        # Compute cov matrix
        cov = np.cov(norm_df[norm_df["class"]==i][feature_names], rowvar=False)
        cov_list.append(cov)
        # Compute inverse of cov matrix
        inv_cov = np.linalg.inv(cov)
        inv_cov_list.append(inv_cov)
    return norm_df, avg_list, inv_cov_list


In [290]:
# Test get_mean_cov function
norm_df, avg_list, inv_cov_list = get_mean_cov(X_train)

In [291]:
# Determine which features should be removed (identify outliers based on Mahalanobis dist)
# Create function that computes Mahalanobis distance and adds it to norm_df
def get_mahalanobis_dist(label, features):
    u = avg_list[label]
    v = features
    vi = inv_cov_list[label]
    delta = u - v
    m = np.dot(np.dot(delta, vi), delta)
    #dist = distance.mahalanobis(u, features, vi)
    return np.sqrt(np.abs(m))

# Call function for each feature
norm_df["mahalanobis_dist"] = norm_df.apply(lambda row: get_mahalanobis_dist(int(row["class"]), row[feature_names]), axis=1)
norm_df["mahalanobis_dist"].describe()

count    320.000000
mean      25.970715
std       14.806621
min        0.730243
25%       15.870690
50%       23.222919
75%       33.974090
max       76.599838
Name: mahalanobis_dist, dtype: float64

In [292]:
# Drop outliers
def drop_outliers(norm_df, threshold):
    thresh = threshold
    norm_df.sort_values(by="mahalanobis_dist", ascending=False, inplace=True)
    norm_df.reset_index(inplace=True, drop=True)
    norm_df.drop(norm_df.index[:int(norm_df.shape[0]*thresh)], inplace=True)
    norm_df.reset_index(inplace=True, drop=True)

    return norm_df

In [293]:
# Test drop_outliers function
norm_df = drop_outliers(norm_df, 0.2)

# Print updated descriptive stats
norm_df["mahalanobis_dist"].describe()
#print(len(norm_df["class"]))

count    256.000000
mean      20.030034
std        8.641128
min        0.730243
25%       14.240340
50%       20.179466
75%       25.927477
max       37.240983
Name: mahalanobis_dist, dtype: float64

### Oversample Data

Use ADASYN technique.

In [294]:
# Create function that oversamples or undersamples data
def resample(sampler, X_train, y_train, name):
    X_train, y_train = sampler.fit_resample(X_train, y_train)
    # Observe number of classes after resample
    #print(f"Number of samples per class after {name}:\n{y_train.value_counts()}")
    return X_train, y_train

In [295]:
# Test resample function
# Setup ADASYN (oversampling)
ada = ADASYN(random_state=rand_seed)

# Call resample function
X_train, y_train = resample(ada, X_train, y_train, "ADASYN Oversampling")

In [296]:
X_train.shape

(595, 1024)

### Feature Selection

In [297]:
# Try univariate feature selection
X_train_univar = SelectKBest(chi2, k=50).fit_transform(abs(X_train), y_train)
X_train_univar.shape


(595, 50)

### Hyperparameter Tuning

In [298]:
# Setup parameters to try
c_range = np.arange(5,4000)
params = {
    'C': c_range,
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Setup classifier
svm = SVC(random_state=10)
svm_clf = RandomizedSearchCV(svm, params, scoring='f1_macro', random_state=10)
search = svm_clf.fit(X_train, y_train)

# Get best params
best_params = search.best_params_

In [299]:
print(best_params)

{'kernel': 'rbf', 'gamma': 'scale', 'C': 327}


### Get Final Model

In [300]:
# Fit model using best hyperparameters found in previous search
svm_final_clf = SVC(kernel="rbf", C=327, gamma='auto', random_state=10)
svm_model_final = svm_final_clf.fit(X_train, y_train)

y_pred = svm_model_final.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")

print(f"Accuracy: {acc:.4f}")
print(f"F1 score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 0.5607
F1 score: 0.5484
Precision: 0.5758
Recall: 0.5607
