Data Preprocessing

In [None]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Read the dataset from the CSV file and preview its first rows.
file_path = 'train_sortiert_new_format.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

In [None]:
# Separate the features (every column from position 3 onward) from the
# class labels stored in the 'label_id' column.
data = df.iloc[:, 3:]
target = df.loc[:, 'label_id']
print("data: ", data.shape)
print("label:", target.shape)

In [None]:
# Hold out 20% of the shuffled samples as a test set; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)

In [None]:
# Report the shape of each split; a blank line separates consecutive entries.
split_report = [
    ('X_train : ', X_train),
    ('X_test : ', X_test),
    ('y_train : ', y_train),
    ('y_test : ', y_test),
]
for position, (heading, split) in enumerate(split_report):
    if position > 0:
        print('')
    print(heading)
    print(split.shape)

In [None]:
# Class balance of the training labels: bar chart plus a textual summary.

# Distinct labels and how often each one occurs in y_train
unique_labels, counts = np.unique(y_train, return_counts=True)

# Bar chart drawn through the explicit figure/axes interface
fig, ax = plt.subplots()
ax.bar(unique_labels, counts)
ax.set_xlabel('Label')
ax.set_ylabel('Number of Samples')
ax.set_title('Number of Samples per Label')
plt.show()

# Same information as text, one line per label
for label_value, n_samples in zip(unique_labels, counts):
    print(f"Label '{label_value}': {n_samples} samples")

Model Training

In [None]:
#import training libraries
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb

In [None]:
# Baseline model: logistic regression fitted on the training split.
clr = LogisticRegression(random_state=1, max_iter=2000)
clr.fit(X_train, y_train)

# Predict on the data the model was fitted on and on the held-out data
y_train_pred, y_test_pred = clr.predict(X_train), clr.predict(X_test)

# Accuracy on both splits; a large gap between them hints at overfitting
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Training Set Accuracy:", train_accuracy)
print("Test Set Accuracy:", test_accuracy)

In [None]:
# Confusion matrix of the logistic-regression test predictions, with rows and
# columns ordered by the classes the fitted model knows.
class_labels = clr.classes_
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

First impression:
There seems to be an issue with label 20 ("Thumb Up"). Although samples whose true label is 20 are often predicted correctly, many samples that belong to other classes are also predicted as label 20. Idea: leave out gesture 20, since it's not that relevant for our project.
Additionally, our training accuracy is too low. We should try to improve it.

In [None]:
# Drop every sample of class 20 and rebuild the feature/label split from the
# remaining rows. Note that this overwrites target, data and the four
# train/test variables created earlier in the notebook.
keep_rows = df['label_id'].ne(20)
filtered_df = df.loc[keep_rows]

# Features start at column position 3; labels live in 'label_id'
data = filtered_df.iloc[:, 3:]
target = filtered_df.loc[:, 'label_id']

# Same 80/20 shuffled split and seed as for the unfiltered data
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)



In [None]:
# Logistic regression again, this time on the data without class 20.
clr_filtered = LogisticRegression(random_state=1, max_iter=2000)
clr_filtered.fit(X_train, y_train)

# Predictions for the fitted-on and the held-out samples
y_train_pred, y_test_pred = clr_filtered.predict(X_train), clr_filtered.predict(X_test)

# Accuracy on both splits
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Training Set Accuracy:", train_accuracy)
print("Test Set Accuracy:", test_accuracy)

In [None]:
# Confusion matrix of the filtered logistic-regression model on the test split.
filtered_class_labels = clr_filtered.classes_
cm_filtered = confusion_matrix(y_test, y_test_pred, labels=filtered_class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_filtered, display_labels=filtered_class_labels)
disp.plot()

Result: The filtered version didn't increase the training accuracy, but it improved the test accuracy. I think it's worth continuing with the filtered version.

Continue with a Neural Network (MLPClassifier)

In [None]:
# Standardize the features. The scaler statistics are learned from the
# training split only, so no information from the test split leaks in.
scaler = StandardScaler().fit(X_train)

# Apply the training-set statistics to both splits.
# NOTE(review): this overwrites X_train/X_test in place, so re-running the
# cell scales already-scaled data; re-run the split cell first.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Multi-layer perceptron with three hidden layers.
hidden_layer_sizes = (600, 100, 50)  # change sizes if necessary

# Build and fit in one step (fit returns the estimator itself)
mlp = MLPClassifier(
    hidden_layer_sizes=hidden_layer_sizes,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Predictions on both splits
y_pred_train = mlp.predict(X_train)
y_pred = mlp.predict(X_test)

# Accuracy on both splits
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred)
print("Training Set Accuracy", train_accuracy)
print("Test Set Accuracy:", test_accuracy)


In [None]:
# Confusion matrix of the MLP test predictions.
mlp_class_labels = mlp.classes_
cm_mlp = confusion_matrix(y_test, y_pred, labels=mlp_class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_mlp, display_labels=mlp_class_labels)
disp.plot()

Try to improve with hyperparameter tuning

In [None]:
# Hyperparameter search for the MLP: every combination in the grid is scored
# with 3-fold cross-validation on the training split.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
}

# Fresh, unfitted base estimator for the search; the grid supplies the
# remaining hyperparameters (including hidden_layer_sizes).
mlp2 = MLPClassifier(max_iter=1000, random_state=42)

# Run the search on mlp2 rather than on the previously fitted `mlp`, so the
# search does not depend on the configuration of the earlier experiment.
grid_search = GridSearchCV(mlp2, param_grid, cv=3, n_jobs=-1, verbose=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the cross-validation results for each parameter setting
results_df = pd.DataFrame(grid_search.cv_results_)
print(results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']])

# Print the best parameters found by GridSearchCV
print("\nBest parameters found: ", grid_search.best_params_)

# best_estimator_ is the model refit with the best parameter combination;
# use it to predict the held-out test split
best_mlp = grid_search.best_estimator_
y_test_pred = best_mlp.predict(X_test)

# Evaluation of the prediction
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Set Accuracy:", test_accuracy)

The MLP classifier doesn't seem to work very well, or its parameters are still not tuned correctly. So let's try another classifier.

Use XGBoost Classifier

In [None]:
# Encode the class labels as consecutive integers starting at 0.
label_encoder = LabelEncoder()

# Learn the label -> integer mapping from the training labels only ...
y_train_encoded = label_encoder.fit_transform(y_train)

# ... and reuse exactly that mapping for the test labels. Refitting the
# encoder on y_test would yield a different mapping whenever a class is
# missing from the test split, silently corrupting the evaluation.
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# XGBoost classifier (scikit-learn style API) trained on the encoded labels.
# Further parameters that could be tuned later: max_depth, learning_rate,
# subsample, colsample_bytree, n_estimators.
cl_xgb3 = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=8,  # presumably the number of classes left after dropping label 20 - TODO confirm
    n_estimators=100,
    random_state=42,
    use_label_encoder=False,
)

# Fit on the training split
cl_xgb3.fit(X_train, y_train_encoded)

# Predictions on both splits (in the encoded label space)
y_train_pred, y_test_pred = cl_xgb3.predict(X_train), cl_xgb3.predict(X_test)

# Accuracy against the encoded ground truth
train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
print("Training Set Accuracy:", train_accuracy)
print("Test Set Accuracy:", test_accuracy)



In [None]:
# Confusion matrix of the XGBoost test predictions (encoded label space).
xgb_class_labels = cl_xgb3.classes_
cm_xgb = confusion_matrix(y_test_encoded, y_test_pred, labels=xgb_class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_xgb, display_labels=xgb_class_labels)
disp.plot()

Accuracy seems to increase. These are the best accuracy values so far. We should continue with techniques to reduce the dimensionality, since we have a lot of columns.

In [None]:
# Same task with XGBoost's native training API: the data is wrapped in
# DMatrix objects and the booster is trained through xgb.train.

# Wrap both splits together with their encoded labels
dtrain = xgb.DMatrix(X_train, label=y_train_encoded, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test_encoded, enable_categorical=True)

# Booster configuration
params = {
    'objective': 'multi:softmax',
    'num_class': 8,
    'learning_rate': 0.1,
    'max_depth': 6,
    'eval_metric': 'mlogloss'
}

# Train for at most 100 rounds; stop once the monitored metric has not
# improved for 10 rounds.
# NOTE(review): the test split doubles as the early-stopping evaluation set,
# so the test accuracy reported below is likely optimistic - consider a
# separate validation split.
watchlist = [(dtest, 'test')]
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
)

# Predictions on both splits
train_preds, test_preds = bst.predict(dtrain), bst.predict(dtest)

# Accuracy against the encoded ground truth, rounded to two decimals
train_accuracy = accuracy_score(y_train_encoded, train_preds)
test_accuracy = accuracy_score(y_test_encoded, test_preds)
print(f"Training set accuracy: {train_accuracy:.2f}")
print(f"Test set accuracy: {test_accuracy:.2f}")

In [None]:
#feature selection technique one

# Keep the k features with the highest ANOVA F-value with respect to the
# class label. k is capped at the number of available columns because
# SelectKBest raises a ValueError when k exceeds the feature count.
n_selected = min(1000, X_train.shape[1])  # Adjust the upper bound as needed
selector = SelectKBest(score_func=f_classif, k=n_selected)

# Fit the selector on the training split only, then apply the same column
# selection to the test split
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Evaluate the reduced feature set with XGBoost
cl_xgb2 = xgb.XGBClassifier(objective='multi:softmax', num_class=8, random_state=42, use_label_encoder=False)

# Train the model
cl_xgb2.fit(X_train_selected, y_train_encoded)

# Predictions on both splits (in the encoded label space)
y_train_pred = cl_xgb2.predict(X_train_selected)
y_test_pred = cl_xgb2.predict(X_test_selected)

# Accuracy against the encoded ground truth
train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
print("Training Set Accuracy:", train_accuracy)
print("Test Set Accuracy:", test_accuracy)

This method decreases accuracy, so we should proceed with another technique. 

Try to improve LogisticRegression again:

In [None]:
# Tune the regularisation strength (C) and penalty type of a logistic
# regression with 5-fold cross-validation.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1],
    'penalty': ['l1', 'l2'],  # SAGA supports both L1 and L2 regularization
}
clf_log = LogisticRegression(solver='saga', max_iter=2000, random_state=1)

grid_search = GridSearchCV(
    estimator=clf_log,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# best_estimator_ has already been refit with the best parameters, so no
# additional training step is needed here
best_model = grid_search.best_estimator_

# Predictions on both splits
y_pred_train = best_model.predict(X_train)
y_pred = best_model.predict(X_test)

# Accuracy on both splits
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Training Accuracy: {accuracy_train}")
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy}")

#best: C:1 , penalty: l2, Accuracy: 0.774

#best: C:1 , penalty: l2, Accuracy: 0.774

In [None]:
# Refit the plain logistic regression on the current training split and look
# at the per-class probabilities behind each test prediction.
clr = LogisticRegression(random_state=1, max_iter=2000)
clr.fit(X_train, y_train)

# Hard predictions and class probabilities for the test split
y_test_prob = clr.predict_proba(X_test)
y_test_pred = clr.predict(X_test)

# Accuracy on the test split
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Set Accuracy:", test_accuracy)

# One line per test sample: predicted label followed by its probability vector
for pred, prob in zip(y_test_pred, y_test_prob):
    print("pred:",pred," prob:",prob)

In [None]:
# Show the distinct values of the 'label' and 'label_id' columns of the
# full (unfiltered) dataframe.
for column_name in ('label', 'label_id'):
    print(df[column_name].unique())

In [22]:
# Inspect the first entry of the most recently computed test predictions
y_test_pred[0]

11

We should proceed now with tuning the xgb model.