In [1]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('./../Dataset/StrongPassword.csv')
df.head()

Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [5]:
def extract_first_order_features(df):

    # Dwell time (H.<key>)
    dwell_columns = [col for col in df.columns if col.startswith('H.')]
    dwell_times_df = df[dwell_columns]

    # Latency (DD.<key1>.<key2>)
    latency_columns = [col for col in df.columns if col.startswith('DD.')]
    latency_df = df[latency_columns]

    # Flight time (UD.<key1>.<key2>)
    flight_columns = [col for col in df.columns if col.startswith('UD.')]
    flight_times_df = df[flight_columns]

    # Gabungkan semua fitur orde pertama
    first_order_df = pd.concat([dwell_times_df, latency_df, flight_times_df], axis=1)

    return first_order_df

In [6]:
# Ekstraksi fitur orde pertama dari dataset
first_order_df = extract_first_order_features(df)

# Menampilkan hasil ekstraksi fitur orde pertama
first_order_df.head()

Unnamed: 0,H.period,H.t,H.i,H.e,H.five,H.Shift.r,H.o,H.a,H.n,H.l,...,UD.period.t,UD.t.i,UD.i.e,UD.e.five,UD.five.Shift.r,UD.Shift.r.o,UD.o.a,UD.a.n,UD.n.l,UD.l.Return
0,0.1491,0.1069,0.1169,0.1417,0.1146,0.1067,0.1016,0.1349,0.0932,0.1338,...,0.2488,0.0605,0.1043,1.0468,1.4909,0.6523,0.112,0.0135,0.2583,0.2171
1,0.1111,0.0694,0.0908,0.0829,0.0689,0.157,0.1066,0.1412,0.1146,0.0839,...,0.234,0.0589,0.0449,1.1141,0.7133,0.6307,0.0618,0.1146,0.1496,0.1917
2,0.1328,0.0731,0.0821,0.0808,0.0892,0.1454,0.1365,0.1621,0.1172,0.1085,...,0.0744,0.056,0.0721,0.96,0.5311,0.5741,0.1566,0.0711,0.1533,0.1762
3,0.1291,0.1059,0.104,0.09,0.0913,0.1454,0.0956,0.1457,0.0866,0.0845,...,0.1224,0.1436,0.0998,0.9656,1.1651,0.6096,0.0574,0.0172,0.1475,0.2387
4,0.1249,0.0895,0.0903,0.0805,0.0742,0.1243,0.043,0.1312,0.0884,0.0903,...,0.1068,0.0781,0.0686,0.7824,0.8213,0.6389,0.1545,0.027,0.1633,0.1614


In [7]:
# Fungsi untuk menghitung mean dan standard deviation dari fitur orde pertama
def extract_second_order_features(first_order_df, dwell_columns, latency_columns, flight_columns):
    second_order_features = {}

    # Hitung mean dan standard deviation untuk setiap baris dari fitur dwell time
    second_order_features['mean_dwell_time'] = first_order_df[dwell_columns].mean(axis=1)
    second_order_features['std_dwell_time'] = first_order_df[dwell_columns].std(axis=1)

    # Hitung mean dan standard deviation untuk setiap baris dari fitur latency
    second_order_features['mean_latency'] = first_order_df[latency_columns].mean(axis=1)
    second_order_features['std_latency'] = first_order_df[latency_columns].std(axis=1)

    # Hitung mean dan standard deviation untuk setiap baris dari fitur flight time
    second_order_features['mean_flight_time'] = first_order_df[flight_columns].mean(axis=1)
    second_order_features['std_flight_time'] = first_order_df[flight_columns].std(axis=1)

    # Buat DataFrame fitur orde kedua
    second_order_df = pd.DataFrame(second_order_features)

    return second_order_df


# Tentukan kembali kolom untuk dwell time, latency, dan flight time
dwell_columns = [col for col in df.columns if col.startswith('H.')]
latency_columns = [col for col in df.columns if col.startswith('DD.')]
flight_columns = [col for col in df.columns if col.startswith('UD.')]

# Ekstraksi fitur orde kedua
second_order_df = extract_second_order_features(first_order_df, dwell_columns, latency_columns, flight_columns)

# Menampilkan beberapa baris pertama fitur orde kedua
second_order_df.head(10)


Unnamed: 0,mean_dwell_time,std_dwell_time,mean_latency,std_latency,mean_flight_time,std_flight_time
0,0.115782,0.02253,0.54039,0.493923,0.42045,0.492082
1,0.1001,0.029165,0.434,0.361336,0.33136,0.363031
2,0.111109,0.029826,0.39526,0.298385,0.28249,0.304275
3,0.1054,0.023856,0.4645,0.405723,0.35669,0.410564
4,0.092582,0.025764,0.39389,0.311851,0.30023,0.315
5,0.100945,0.032696,0.37359,0.34835,0.26846,0.353991
6,0.094764,0.018161,0.35409,0.358556,0.25875,0.366239
7,0.094845,0.02267,0.33112,0.340749,0.23358,0.33115
8,0.089427,0.018226,0.31207,0.285961,0.22004,0.294137
9,0.102218,0.023471,0.31652,0.254738,0.21274,0.26512


In [8]:
test_size = 0.2

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Mengonversi target (user) menjadi nilai numerik menggunakan LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['subject'])

# Pisahkan data menjadi latih dan uji
X_train, X_test, y_train, y_test = train_test_split(first_order_df, y, test_size=test_size, random_state=42)

# Inisialisasi model XGBoost
model_first_order = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Melatih model pada data latih
model_first_order.fit(X_train, y_train)

# Prediksi pada data uji
y_pred_first_order = model_first_order.predict(X_test)

# Menghitung akurasi
accuracy_first_order = accuracy_score(y_test, y_pred_first_order)
print(f"Accuracy using first-order features: {accuracy_first_order * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.



Accuracy using first-order features: 93.16%


In [11]:
# Pisahkan data menjadi latih dan uji dengan fitur orde kedua
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(second_order_df, y, test_size=test_size, random_state=42)

# Inisialisasi model XGBoost
model_second_order = xgb.XGBClassifier(eval_metric='mlogloss')

# Melatih model pada data latih dengan fitur orde kedua
model_second_order.fit(X_train_2, y_train_2)

# Prediksi pada data uji
y_pred_second_order = model_second_order.predict(X_test_2)

# Menghitung akurasi
accuracy_second_order = accuracy_score(y_test_2, y_pred_second_order)
print(f"Accuracy using second-order features: {accuracy_second_order * 100:.2f}%")

Accuracy using second-order features: 44.71%


In [12]:
print(f"Accuracy comparison:")
print(f" - First-order features: {accuracy_first_order * 100:.2f}%")
print(f" - Second-order features: {accuracy_second_order * 100:.2f}%")

Accuracy comparison:
 - First-order features: 93.16%
 - Second-order features: 44.71%


In [13]:
# Menggabungkan fitur orde pertama dan orde kedua
combined_df = pd.concat([first_order_df, second_order_df], axis=1)

# Menampilkan beberapa baris pertama dari data gabungan
combined_df.head()

Unnamed: 0,H.period,H.t,H.i,H.e,H.five,H.Shift.r,H.o,H.a,H.n,H.l,...,UD.o.a,UD.a.n,UD.n.l,UD.l.Return,mean_dwell_time,std_dwell_time,mean_latency,std_latency,mean_flight_time,std_flight_time
0,0.1491,0.1069,0.1169,0.1417,0.1146,0.1067,0.1016,0.1349,0.0932,0.1338,...,0.112,0.0135,0.2583,0.2171,0.115782,0.02253,0.54039,0.493923,0.42045,0.492082
1,0.1111,0.0694,0.0908,0.0829,0.0689,0.157,0.1066,0.1412,0.1146,0.0839,...,0.0618,0.1146,0.1496,0.1917,0.1001,0.029165,0.434,0.361336,0.33136,0.363031
2,0.1328,0.0731,0.0821,0.0808,0.0892,0.1454,0.1365,0.1621,0.1172,0.1085,...,0.1566,0.0711,0.1533,0.1762,0.111109,0.029826,0.39526,0.298385,0.28249,0.304275
3,0.1291,0.1059,0.104,0.09,0.0913,0.1454,0.0956,0.1457,0.0866,0.0845,...,0.0574,0.0172,0.1475,0.2387,0.1054,0.023856,0.4645,0.405723,0.35669,0.410564
4,0.1249,0.0895,0.0903,0.0805,0.0742,0.1243,0.043,0.1312,0.0884,0.0903,...,0.1545,0.027,0.1633,0.1614,0.092582,0.025764,0.39389,0.311851,0.30023,0.315


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Mengonversi target (user) menjadi nilai numerik menggunakan LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['subject'])  # Target (kelas pengguna dari DataFrame asli)

# Pisahkan data menjadi latih dan uji menggunakan fitur gabungan
X_train_combined, X_test_combined, y_train_combined, y_test_combined = train_test_split(combined_df, y, test_size=test_size, random_state=42)

# Inisialisasi model XGBoost untuk fitur gabungan
model_combined = xgb.XGBClassifier(eval_metric='mlogloss')

# Melatih model pada data latih gabungan
model_combined.fit(X_train_combined, y_train_combined)

# Prediksi pada data uji gabungan
y_pred_combined = model_combined.predict(X_test_combined)

# Menghitung akurasi untuk fitur gabungan
accuracy_combined = accuracy_score(y_test_combined, y_pred_combined)
print(f"Accuracy using combined first-order and second-order features: {accuracy_combined * 100:.2f}%")

# Menghitung precision, recall, dan F1-score
precision = precision_score(y_test, y_pred_combined, average='macro')
recall = recall_score(y_test, y_pred_combined, average='macro')
f1 = f1_score(y_test, y_pred_combined, average='macro')

print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')

Accuracy using combined first-order and second-order features: 93.21%
Precision: 0.93
Recall: 0.93
F1-Score: 0.93


In [15]:
# Menampilkan perbandingan akurasi
print(f"Accuracy comparison:")
print(f" - First-order features: {accuracy_first_order * 100:.2f}%")
print(f" - Second-order features: {accuracy_second_order * 100:.2f}%")
print(f" - Combined first-order and second-order features: {accuracy_combined * 100:.2f}%")

Accuracy comparison:
 - First-order features: 93.16%
 - Second-order features: 44.71%
 - Combined first-order and second-order features: 93.21%


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Mengonversi target (user) menjadi nilai numerik menggunakan LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['subject'])

# Pisahkan data menjadi fitur (X) dan target (y)
X = combined_df  # Menggunakan fitur gabungan

# Inisialisasi model estimator untuk RFE (bisa menggunakan XGBoost atau model sederhana lainnya)
estimator = xgb.XGBClassifier(eval_metric='mlogloss')
selector = RFE(estimator, n_features_to_select=15, step=1)  # Pilih 10 fitur terbaik
X_selected = selector.fit_transform(X, y)  # Melakukan feature selection

# Pisahkan data menjadi latih dan uji setelah feature selection
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Melatih model XGBoost dengan fitur terpilih
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# Prediksi dan evaluasi
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy after RFE feature selection: {accuracy * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.



Accuracy after RFE feature selection: 87.99%


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Mengonversi target (user) menjadi nilai numerik menggunakan LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['subject'])

# Pisahkan data menjadi fitur (X) dan target (y)
X = combined_df  # Menggunakan fitur gabungan

# Inisialisasi model estimator untuk RFE (bisa menggunakan XGBoost atau model sederhana lainnya)
estimator = xgb.XGBClassifier(eval_metric='mlogloss')
selector = RFE(estimator, n_features_to_select=20, step=1)  # Pilih 10 fitur terbaik
X_selected = selector.fit_transform(X, y)  # Melakukan feature selection

# Pisahkan data menjadi latih dan uji setelah feature selection
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Melatih model XGBoost dengan fitur terpilih
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# Prediksi dan evaluasi
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy after RFE feature selection: {accuracy * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.



Accuracy after RFE feature selection: 90.69%


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
import xgboost as xgb

# Mengonversi target (user) menjadi nilai numerik menggunakan LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['subject'])

# Pisahkan data menjadi fitur (X) dan target (y)
X = combined_df  # Menggunakan fitur gabungan

# Inisialisasi model estimator untuk RFE (bisa menggunakan XGBoost atau model sederhana lainnya)
estimator = xgb.XGBClassifier(eval_metric='mlogloss')
selector = RFE(estimator, n_features_to_select=30, step=1)  # Pilih 10 fitur terbaik
X_selected = selector.fit_transform(X, y)  # Melakukan feature selection

# Pisahkan data menjadi latih dan uji setelah feature selection
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Melatih model XGBoost dengan fitur terpilih
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# Prediksi dan evaluasi
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy after RFE feature selection: {accuracy * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.



Accuracy after RFE feature selection: 92.99%
