In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import warnings

# --- File locations -------------------------------------------------------
# All three are relative paths, so they resolve against the current working
# directory of the process, not against the location of this file.
MODEL_FILE = '../random_forest_model.pkl'   # pickled model that is loaded
NEW_DATA_FILE = '../Keystrokes.csv'         # CSV with the rows to score
OUTPUT_FILE = 'keystroke_predictions.csv'   # CSV the predictions go to

# --- Model input columns --------------------------------------------------
# Column names the CSV must provide, in the order they are handed to the
# model. One row per key: H.<key>, then DD.<key>.<next> and UD.<key>.<next>.
# NOTE(review): H/DD/UD presumably mean hold, down-down and up-down timings;
# confirm against the dataset documentation.
FEATURE_NAMES = [
    'H.period', 'DD.period.t', 'UD.period.t',
    'H.t', 'DD.t.i', 'UD.t.i',
    'H.i', 'DD.i.e', 'UD.i.e',
    'H.e', 'DD.e.five', 'UD.e.five',
    'H.five', 'DD.five.Shift.r', 'UD.five.Shift.r',
    'H.Shift.r', 'DD.Shift.r.o', 'UD.Shift.r.o',
    'H.o', 'DD.o.a', 'UD.o.a',
    'H.a', 'DD.a.n', 'UD.a.n',
    'H.n', 'DD.n.l', 'UD.n.l',
    'H.l', 'DD.l.Return', 'UD.l.Return',
    'H.Return',
]

def _genuine_imposter_probabilities(model, probabilities):
    """
    Pick the 'Genuine' and 'Imposter' columns out of a predict_proba result.

    The columns of predict_proba follow the order of ``model.classes_``, so
    the labels are looked up there (case-insensitively) instead of assuming
    a fixed position. If the model exposes neither label, the historical
    positional layout (column 0 = Genuine, column 1 = Imposter) is used.

    Returns:
        A ``(genuine, imposter)`` tuple. Each item is a 1-D array, or
        ``numpy.nan`` when the model has no column for that class.
    """
    probabilities = np.asarray(probabilities)
    classes = getattr(model, 'classes_', None)
    labels = [] if classes is None else [str(c).casefold() for c in classes]

    wanted = ('genuine', 'imposter')
    if any(name in labels for name in wanted):
        positions = [labels.index(name) if name in labels else None
                     for name in wanted]
    else:
        positions = [0, 1]

    columns = []
    for position in positions:
        available = (
            position is not None
            and probabilities.ndim == 2
            and position < probabilities.shape[1]
        )
        columns.append(probabilities[:, position] if available else np.nan)
    return tuple(columns)


def predict_new_data(model_file=None, data_file=None, output_file=None,
                     feature_names=None):
    """
    Loads the saved model and predicts on a new, unlabeled CSV file.

    Steps: load the pickled model, read the CSV, coerce the feature columns
    to numbers, drop rows where any feature is missing or non-numeric,
    predict, and write the surviving original rows plus the 'Prediction',
    'Probability_Genuine' and 'Probability_Imposter' columns to a CSV.

    Every failure is reported with print() and ends the function early; no
    exception is raised for missing files, unreadable input, missing
    columns, prediction errors or an unwritable output file.

    Args:
        model_file: Path of the pickled model. Defaults to MODEL_FILE.
        data_file: Path of the CSV to score. Defaults to NEW_DATA_FILE.
        output_file: Path of the CSV to write. Defaults to OUTPUT_FILE.
        feature_names: Feature column names, in model input order.
            Defaults to FEATURE_NAMES.

    Returns:
        None.
    """
    # Defaults are resolved at call time so the module constants can still
    # be edited after this function has been defined.
    model_file = MODEL_FILE if model_file is None else model_file
    data_file = NEW_DATA_FILE if data_file is None else data_file
    output_file = OUTPUT_FILE if output_file is None else output_file
    feature_names = list(
        FEATURE_NAMES if feature_names is None else feature_names
    )

    if not os.path.exists(model_file):
        print(f"Error: Model file '{model_file}' not found.")
        print(f"Please check that the path is correct (looked in '{os.path.abspath(model_file)}').")
        return

    if not os.path.exists(data_file):
        print(f"Error: New data file '{data_file}' not found.")
        print("Please update the 'NEW_DATA_FILE' variable in this script to match your file's name.")
        return

    print(f"Loading model from '{model_file}'...")
    try:
        # NOTE(security): unpickling can execute arbitrary code. Only load
        # model files that come from a trusted source.
        with open(model_file, 'rb') as f:
            model = pickle.load(f)
    except Exception as e:
        print(f"Error loading pickle file: {e}")
        return

    print(f"Loading new data from '{data_file}'...")
    try:
        raw_data = pd.read_csv(data_file, header=0)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    missing_cols = [col for col in feature_names if col not in raw_data.columns]
    if missing_cols:
        print("\nError: The new data file is missing required feature columns:")
        print(missing_cols)
        return

    print("Preprocessing data (applying to_numeric and dropping rows with errors)...")

    # Work on a separate numeric frame so raw_data keeps the values exactly
    # as they were read; unparseable cells become NaN and drop the row.
    features = raw_data[feature_names].apply(pd.to_numeric, errors='coerce')
    valid_features = features.dropna()

    dropped_rows = len(features) - len(valid_features)
    if dropped_rows > 0:
        print(f"Warning: Dropped {dropped_rows} rows due to missing/invalid data.")

    if valid_features.empty:
        print("Error: No valid data left to predict after preprocessing.")
        return

    print(f"Making {len(valid_features)} predictions...")
    try:
        predictions = model.predict(valid_features)
        genuine, imposter = _genuine_imposter_probabilities(
            model, model.predict_proba(valid_features)
        )
    except Exception as e:
        print(f"Error during prediction: {e}")
        return

    # --- 5. Save Results ---
    print("Saving results...")

    # Select the surviving rows by index label. This needs no temporary
    # helper column, so no input column can be overwritten or dropped, and
    # an existing 'Prediction' column is replaced instead of being suffixed.
    output_df = raw_data.loc[valid_features.index].copy()
    output_df['Prediction'] = predictions
    output_df['Probability_Genuine'] = genuine
    output_df['Probability_Imposter'] = imposter

    try:
        output_df.to_csv(output_file, index=False)
    except OSError as e:
        print(f"Error writing results to '{output_file}': {e}")
        return

    print("\n--- Prediction Complete ---")
    print(f"Results saved to '{output_file}'")
    print("\nPrediction counts:")
    print(output_df['Prediction'].value_counts(dropna=False))

if __name__ == "__main__":
    # Script entry point: silence every warning category for this run, then
    # execute the prediction pipeline with the module-level file settings.
    warnings.simplefilter('ignore')
    predict_new_data()

Loading model from '../random_forest_model.pkl'...
Loading new data from '../Keystrokes.csv'...
Preprocessing data (applying to_numeric and dropping rows with errors)...
Making 1 predictions...
Saving results...

--- Prediction Complete ---
Results saved to 'keystroke_predictions.csv'

Prediction counts:
Prediction
Imposter    1
Name: count, dtype: int64
