Import Libraries

In [1]:
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestClassifier

Load and Display Data

In [2]:
# Read the keystroke-timing dataset; the path is relative, so it depends on
# the directory the notebook kernel was started from.
# NOTE(review): the preview below shows an 'Unnamed: 0' column — presumably an
# index that was written into the CSV; confirm whether it should be dropped.
df = pd.read_csv('./../Dataset/StrongPassword.csv')
# Last expression of the cell: renders the first five rows as the cell output.
df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


Feature Extraction Functions

In [3]:
# Define Feature Extraction Functions
def get_feature_columns(df):
    """
    Group the timing column names of ``df`` by their prefix.

    Returns a 3-tuple of lists, each preserving the frame's column order:
    names starting with 'H.' (dwell), 'DD.' (latency) and 'UD.' (flight).
    Columns matching none of the prefixes are ignored.
    """
    # One bucket per prefix; the prefixes are mutually exclusive, so a column
    # can land in at most one bucket.
    grouped = {'H.': [], 'DD.': [], 'UD.': []}
    for name in df.columns:
        for prefix, bucket in grouped.items():
            if name.startswith(prefix):
                bucket.append(name)
    return grouped['H.'], grouped['DD.'], grouped['UD.']

def extract_features(df, feature_set):
    """
    Build the feature matrix for the requested feature set.

    feature_set selects what is returned:
      - 'first_order':  the 'H.*', 'DD.*' and 'UD.*' columns of df, grouped
                        in that order
      - 'second_order': the per-row statistics produced by
                        extract_second_order_features
      - 'combined':     the first-order columns followed by the second-order
                        columns
    Any other value raises ValueError.
    """
    dwell_columns, latency_columns, flight_columns = get_feature_columns(df)

    # Raw timings, laid out dwell -> latency -> flight
    first_order_features = pd.concat(
        [df[group] for group in (dwell_columns, latency_columns, flight_columns)],
        axis=1,
    )

    if feature_set == 'first_order':
        return first_order_features

    if feature_set not in ('second_order', 'combined'):
        raise ValueError("Invalid feature set specified. Choose from 'first_order', 'second_order', or 'combined'.")

    # Both remaining modes need the derived statistics
    second_order_features = extract_second_order_features(
        first_order_features, dwell_columns, latency_columns, flight_columns
    )
    if feature_set == 'second_order':
        return second_order_features

    # 'combined': raw timings side by side with their statistics
    return pd.concat([first_order_features, second_order_features], axis=1)

def extract_second_order_features(first_order_df, dwell_columns, latency_columns, flight_columns):
    """
    Summarise each row of first-order timings with per-group statistics.

    For each of the three column groups (dwell, latency, flight) the result
    holds the row-wise mean, std, min and max, plus the mean and std of the
    "slopes" (differences between consecutive columns of the group).

    Returns a DataFrame with 18 columns, indexed like first_order_df.
    """
    dwell = first_order_df[dwell_columns]
    latency = first_order_df[latency_columns]
    flight = first_order_df[flight_columns]

    # Column-to-column differences within each row; the first column of a
    # diff has no predecessor (all NaN), so it is dropped.
    dwell_delta = dwell.diff(axis=1).iloc[:, 1:]
    latency_delta = latency.diff(axis=1).iloc[:, 1:]
    flight_delta = flight.diff(axis=1).iloc[:, 1:]

    # Key order defines the output column order: mean/std, then min/max,
    # then the slope statistics.
    return pd.DataFrame({
        'mean_dwell_time': dwell.mean(axis=1),
        'std_dwell_time': dwell.std(axis=1),
        'mean_latency': latency.mean(axis=1),
        'std_latency': latency.std(axis=1),
        'mean_flight_time': flight.mean(axis=1),
        'std_flight_time': flight.std(axis=1),
        'min_dwell_time': dwell.min(axis=1),
        'max_dwell_time': dwell.max(axis=1),
        'min_latency': latency.min(axis=1),
        'max_latency': latency.max(axis=1),
        'min_flight_time': flight.min(axis=1),
        'max_flight_time': flight.max(axis=1),
        'mean_dwell_slope': dwell_delta.mean(axis=1),
        'std_dwell_slope': dwell_delta.std(axis=1),
        'mean_latency_slope': latency_delta.mean(axis=1),
        'std_latency_slope': latency_delta.std(axis=1),
        'mean_flight_slope': flight_delta.mean(axis=1),
        'std_flight_slope': flight_delta.std(axis=1),
    })


Define Model Training and Evaluation Function

In [4]:
def train_and_evaluate_user(user, df, feature_set, feature_set_name):
    """
    Train and evaluate a one-vs-rest Random Forest for a single subject.

    Rows whose 'subject' equals ``user`` are labelled 1 (genuine user); every
    other row is labelled 0 (impostor). The data is split 80/20 with
    stratification on that label, a 100-tree Random Forest is fitted on the
    training part, and the metrics are computed on the held-out part and
    printed.

    Parameters
    ----------
    user : value compared against df['subject'] to build the labels.
    df : DataFrame with a 'subject' column plus the timing columns consumed
        by extract_features. It is not modified.
    feature_set : key forwarded to extract_features
        ('first_order', 'second_order' or 'combined').
    feature_set_name : human-readable name used only in the printed report.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1, training_time), where
        training_time is the wall-clock time in seconds spent inside
        model.fit only.

    Raises
    ------
    ValueError
        If no row of df belongs to ``user`` (a model without positive
        samples would be meaningless). extract_features also raises
        ValueError for an unknown feature_set.
    """
    # Binary labels: 1 for the genuine user, 0 for impostors. Built as a
    # standalone Series so the whole frame is not copied once per user.
    y = (df['subject'] == user).astype(int).rename('label')
    if not y.any():
        raise ValueError(f"No samples found for user {user!r}.")

    # Feature extraction (columns are selected by prefix, so 'subject' and
    # other bookkeeping columns never reach the model)
    X = extract_features(df, feature_set)

    # Hold out 20% for testing; stratify keeps the genuine/impostor ratio
    # the same in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Random Forest with a fixed seed for reproducibility
    model = RandomForestClassifier(n_estimators=100, random_state=42)

    # Time the fit alone: the reported "training time" should not include
    # feature extraction, splitting, prediction or scoring. perf_counter is
    # monotonic, unlike time.time().
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_start

    # Predict on the held-out part
    y_pred = model.predict(X_test)

    # Metrics; zero_division=0 avoids warnings when no positives are predicted.
    # NOTE(review): impostor rows presumably far outnumber genuine ones, so
    # accuracy alone can look high even when recall is poor.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Report
    print(f"Metrics for user {user} using {feature_set_name} with Random Forest:")
    print(f" - Accuracy: {accuracy * 100:.2f}%")
    print(f" - Precision: {precision:.2f}")
    print(f" - Recall: {recall:.2f}")
    print(f" - F1-Score: {f1:.2f}")
    print(f" - Training time: {training_time:.2f} seconds\n")

    return accuracy, precision, recall, f1, training_time

Train Models and Evaluate Performance

In [5]:
# Unique subject identifiers; one model is trained per subject
users = df['subject'].unique()

# Display name -> key understood by extract_features
feature_sets = {
    'First-Order Features': 'first_order',
    'Second-Order Features': 'second_order',
    'Combined Features': 'combined'
}

# Train and evaluate every subject once per feature set
for feature_set_name, feature_set_key in feature_sets.items():
    print(f"\n=== Training models using {feature_set_name} ===\n")

    # Fresh accumulators for this feature set, one list per reported metric
    accuracies, precisions, recalls, f1_scores, training_times = [], [], [], [], []
    metric_lists = (accuracies, precisions, recalls, f1_scores, training_times)

    for user in users:
        accuracy, precision, recall, f1, training_time = train_and_evaluate_user(
            user, df, feature_set_key, feature_set_name
        )
        # Route each returned metric into its matching accumulator
        for bucket, value in zip(metric_lists, (accuracy, precision, recall, f1, training_time)):
            bucket.append(value)

    # Average each metric over all subjects for this feature set
    (average_accuracy, average_precision, average_recall,
     average_f1_score, average_training_time) = map(np.mean, metric_lists)

    print(f"Average Metrics Across All Users for {feature_set_name}:")
    print(f" - Average Accuracy: {average_accuracy * 100:.2f}%")
    print(f" - Average Precision: {average_precision:.2f}")
    print(f" - Average Recall: {average_recall:.2f}")
    print(f" - Average F1-Score: {average_f1_score:.2f}")
    print(f" - Average Training Time per User: {average_training_time:.2f} seconds\n")


=== Training models using First-Order Features ===

Metrics for user s002 using First-Order Features with Random Forest:
 - Accuracy: 98.58%
 - Precision: 1.00
 - Recall: 0.28
 - F1-Score: 0.43
 - Training time: 4.29 seconds

Metrics for user s003 using First-Order Features with Random Forest:
 - Accuracy: 99.68%
 - Precision: 1.00
 - Recall: 0.84
 - F1-Score: 0.91
 - Training time: 4.50 seconds

Metrics for user s004 using First-Order Features with Random Forest:
 - Accuracy: 99.09%
 - Precision: 0.98
 - Recall: 0.55
 - F1-Score: 0.70
 - Training time: 3.96 seconds

Metrics for user s005 using First-Order Features with Random Forest:
 - Accuracy: 99.58%
 - Precision: 1.00
 - Recall: 0.79
 - F1-Score: 0.88
 - Training time: 2.83 seconds

Metrics for user s007 using First-Order Features with Random Forest:
 - Accuracy: 98.85%
 - Precision: 1.00
 - Recall: 0.41
 - F1-Score: 0.58
 - Training time: 4.16 seconds

Metrics for user s008 using First-Order Features with Random Forest:
 - Accur