Import Libraries

In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

Load and Display Data

In [2]:
# Read the StrongPassword CSV from its location relative to this notebook.
df = pd.read_csv(filepath_or_buffer='./../Dataset/StrongPassword.csv')
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


Feature Extraction Functions

In [3]:
def get_feature_columns(df):
    """
    Group the DataFrame's column names by their timing prefix.

    Returns three lists, in this order: names starting with 'H.' (dwell),
    names starting with 'DD.' (latency) and names starting with 'UD.'
    (flight). Each list keeps the column order of ``df``.
    """
    grouped = {prefix: [] for prefix in ('H.', 'DD.', 'UD.')}
    for name in df.columns:
        for prefix, bucket in grouped.items():
            if name.startswith(prefix):
                bucket.append(name)
    return grouped['H.'], grouped['DD.'], grouped['UD.']

def _second_order_features(first_order_features, dwell_columns, latency_columns, flight_columns):
    """
    Build the per-row mean and standard deviation of each timing group.

    The resulting DataFrame has the columns mean_dwell_time, std_dwell_time,
    mean_latency, std_latency, mean_flight_time and std_flight_time, in that
    order.
    """
    groups = (
        ('dwell_time', dwell_columns),
        ('latency', latency_columns),
        ('flight_time', flight_columns),
    )
    summary = {}
    for label, columns in groups:
        block = first_order_features[columns]
        summary[f'mean_{label}'] = block.mean(axis=1)
        summary[f'std_{label}'] = block.std(axis=1)
    return pd.DataFrame(summary)


def extract_features(df, feature_set):
    """
    Extract features based on the specified feature set.

    Parameters:
        df: DataFrame whose timing columns are named with the 'H.', 'DD.'
            and 'UD.' prefixes recognised by get_feature_columns.
        feature_set: 'first_order', 'second_order', or 'combined'.

    Returns:
        A DataFrame with
        - 'first_order': the dwell, latency and flight columns, in that order;
        - 'second_order': the row-wise mean/std of each of the three groups;
        - 'combined': the first-order columns followed by the second-order ones.

    Raises:
        ValueError: if feature_set is not one of the three supported names.
    """
    # Reject an unknown feature set before doing any work.
    if feature_set not in ('first_order', 'second_order', 'combined'):
        raise ValueError("Invalid feature set specified. Choose from 'first_order', 'second_order', or 'combined'.")

    dwell_columns, latency_columns, flight_columns = get_feature_columns(df)

    # First-order features: the raw timing columns, grouped by kind.
    first_order_features = pd.concat([
        df[dwell_columns],
        df[latency_columns],
        df[flight_columns]
    ], axis=1)

    if feature_set == 'first_order':
        return first_order_features

    # Both remaining feature sets need the same summary statistics, so they
    # are computed in one place instead of once per branch.
    second_order_df = _second_order_features(
        first_order_features, dwell_columns, latency_columns, flight_columns
    )
    if feature_set == 'second_order':
        return second_order_df
    return pd.concat([first_order_features, second_order_df], axis=1)

Define Model Training and Evaluation Function

In [None]:
def train_and_evaluate_user(user, df, feature_set, feature_set_name):
    """
    Train and evaluate a one-vs-rest model for a specific user.

    Parameters:
        user: value of the 'subject' column treated as the genuine user.
        df: DataFrame holding a 'subject' column plus the timing columns
            consumed by extract_features. It is not modified.
        feature_set: feature-set key passed to extract_features
            ('first_order', 'second_order', or 'combined').
        feature_set_name: human-readable name used only in the printed report.

    Returns:
        Tuple (accuracy, precision, recall, f1, training_time). The metrics
        are computed on a stratified 20% hold-out split; training_time is the
        duration in seconds of the model.fit call only.
    """
    # Create binary labels: 1 for genuine user, 0 for imposters.
    # Built as a standalone Series so the full frame is not copied per user.
    y = (df['subject'] == user).astype(int).rename('label')

    # Extract features
    X = extract_features(df, feature_set)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Initialize the model
    model = xgb.XGBClassifier(eval_metric='logloss')

    # Train the model. Only the fit call is timed, so the reported training
    # time excludes feature extraction, splitting, prediction and scoring.
    # perf_counter is monotonic, unlike time.time().
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_start

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics; zero_division=0 makes the score 0 instead of
    # warning when a denominator is zero (e.g. no positive predictions).
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Print metrics
    print(f"Metrics for user {user} using {feature_set_name}:")
    print(f" - Accuracy: {accuracy * 100:.2f}%")
    print(f" - Precision: {precision:.2f}")
    print(f" - Recall: {recall:.2f}")
    print(f" - F1-Score: {f1:.2f}")
    print(f" - Training time: {training_time:.2f} seconds\n")

    return accuracy, precision, recall, f1, training_time

Train Models and Evaluate Performance

In [5]:
# Every distinct subject gets its own one-vs-rest model.
users = df['subject'].unique()

# Display name -> key understood by extract_features.
feature_sets = {
    'First-Order Features': 'first_order',
    'Second-Order Features': 'second_order',
    'Combined Features': 'combined'
}

for feature_set_name, feature_set_key in feature_sets.items():
    print(f"\n=== Training models using {feature_set_name} ===\n")

    # One list per value returned by train_and_evaluate_user, in return order.
    accuracies, precisions, recalls, f1_scores, training_times = [], [], [], [], []
    metric_lists = (accuracies, precisions, recalls, f1_scores, training_times)

    for user in users:
        outcome = train_and_evaluate_user(user, df, feature_set_key, feature_set_name)
        for bucket, value in zip(metric_lists, outcome):
            bucket.append(value)

    # Average each metric across all users for this feature set.
    (average_accuracy,
     average_precision,
     average_recall,
     average_f1_score,
     average_training_time) = [np.mean(bucket) for bucket in metric_lists]

    print(f"Average Metrics Across All Users for {feature_set_name}:")
    print(f" - Average Accuracy: {average_accuracy * 100:.2f}%")
    print(f" - Average Precision: {average_precision:.2f}")
    print(f" - Average Recall: {average_recall:.2f}")
    print(f" - Average F1-Score: {average_f1_score:.2f}")
    print(f" - Average Training Time per User: {average_training_time:.2f} seconds\n")


=== Training models using First-Order Features ===



Parameters: { "use_label_encoder" } are not used.



Metrics for user s002 using First-Order Features:
 - Accuracy: 99.26%
 - Precision: 0.95
 - Recall: 0.66
 - F1-Score: 0.78
 - Training time: 0.75 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s003 using First-Order Features:
 - Accuracy: 99.88%
 - Precision: 1.00
 - Recall: 0.94
 - F1-Score: 0.97
 - Training time: 0.41 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s004 using First-Order Features:
 - Accuracy: 99.41%
 - Precision: 0.90
 - Recall: 0.79
 - F1-Score: 0.84
 - Training time: 0.40 seconds

Metrics for user s005 using First-Order Features:
 - Accuracy: 99.83%
 - Precision: 1.00
 - Recall: 0.91
 - F1-Score: 0.95
 - Training time: 0.20 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Metrics for user s007 using First-Order Features:
 - Accuracy: 99.19%
 - Precision: 0.94
 - Recall: 0.62
 - F1-Score: 0.75
 - Training time: 0.39 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s008 using First-Order Features:
 - Accuracy: 99.34%
 - Precision: 0.92
 - Recall: 0.72
 - F1-Score: 0.81
 - Training time: 0.32 seconds

Metrics for user s010 using First-Order Features:
 - Accuracy: 99.93%
 - Precision: 0.99
 - Recall: 0.97
 - F1-Score: 0.98
 - Training time: 0.18 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Metrics for user s011 using First-Order Features:
 - Accuracy: 99.78%
 - Precision: 0.99
 - Recall: 0.90
 - F1-Score: 0.94
 - Training time: 0.38 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s012 using First-Order Features:
 - Accuracy: 99.73%
 - Precision: 0.96
 - Recall: 0.90
 - F1-Score: 0.93
 - Training time: 0.62 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s013 using First-Order Features:
 - Accuracy: 99.71%
 - Precision: 0.96
 - Recall: 0.89
 - F1-Score: 0.92
 - Training time: 0.29 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s015 using First-Order Features:
 - Accuracy: 99.56%
 - Precision: 0.94
 - Recall: 0.82
 - F1-Score: 0.88
 - Training time: 0.25 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s016 using First-Order Features:
 - Accuracy: 99.78%
 - Precision: 0.97
 - Recall: 0.91
 - F1-Score: 0.94
 - Training time: 0.28 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s017 using First-Order Features:
 - Accuracy: 99.90%
 - Precision: 1.00
 - Recall: 0.95
 - F1-Score: 0.97
 - Training time: 0.22 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s018 using First-Order Features:
 - Accuracy: 99.73%
 - Precision: 0.99
 - Recall: 0.88
 - F1-Score: 0.93
 - Training time: 0.38 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s019 using First-Order Features:
 - Accuracy: 99.93%
 - Precision: 1.00
 - Recall: 0.96
 - F1-Score: 0.98
 - Training time: 0.25 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s020 using First-Order Features:
 - Accuracy: 99.51%
 - Precision: 0.93
 - Recall: 0.81
 - F1-Score: 0.87
 - Training time: 0.47 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s021 using First-Order Features:
 - Accuracy: 99.66%
 - Precision: 0.96
 - Recall: 0.86
 - F1-Score: 0.91
 - Training time: 0.36 seconds

Metrics for user s022 using First-Order Features:
 - Accuracy: 99.95%
 - Precision: 1.00
 - Recall: 0.97
 - F1-Score: 0.99
 - Training time: 0.16 seconds



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Metrics for user s024 using First-Order Features:
 - Accuracy: 99.88%
 - Precision: 1.00
 - Recall: 0.94
 - F1-Score: 0.97
 - Training time: 0.27 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s025 using First-Order Features:
 - Accuracy: 99.88%
 - Precision: 0.96
 - Recall: 0.97
 - F1-Score: 0.97
 - Training time: 0.32 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s026 using First-Order Features:
 - Accuracy: 99.73%
 - Precision: 0.99
 - Recall: 0.88
 - F1-Score: 0.93
 - Training time: 0.49 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s027 using First-Order Features:
 - Accuracy: 99.75%
 - Precision: 0.97
 - Recall: 0.90
 - F1-Score: 0.94
 - Training time: 0.20 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s028 using First-Order Features:
 - Accuracy: 99.88%
 - Precision: 0.99
 - Recall: 0.95
 - F1-Score: 0.97
 - Training time: 0.20 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s029 using First-Order Features:
 - Accuracy: 99.46%
 - Precision: 0.88
 - Recall: 0.84
 - F1-Score: 0.86
 - Training time: 0.53 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s030 using First-Order Features:
 - Accuracy: 99.78%
 - Precision: 0.97
 - Recall: 0.91
 - F1-Score: 0.94
 - Training time: 0.29 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s031 using First-Order Features:
 - Accuracy: 99.26%
 - Precision: 0.95
 - Recall: 0.66
 - F1-Score: 0.78
 - Training time: 0.47 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s032 using First-Order Features:
 - Accuracy: 99.14%
 - Precision: 1.00
 - Recall: 0.56
 - F1-Score: 0.72
 - Training time: 0.44 seconds



Parameters: { "use_label_encoder" } are not used.



Metrics for user s033 using First-Order Features:
 - Accuracy: 99.88%
 - Precision: 0.97
 - Recall: 0.96
 - F1-Score: 0.97
 - Training time: 0.21 seconds



Parameters: { "use_label_encoder" } are not used.



KeyboardInterrupt: 