In [1]:
# Importing necessary libraries
import pandas as pd   
import numpy as np    
import random         

# Set pandas option to display a large number of rows (6000 in this case)
# pd.set_option('display.max_rows', None)  # Option to display all rows, commented out
import warnings
warnings.filterwarnings('ignore')  # Ignore warnings for cleaner output
pd.set_option('display.max_rows', 6000)  # Set the max number of rows to display to 6000

# Importing datetime for date manipulation
from datetime import timedelta

# Importing machine learning utilities from scikit-learn
from sklearn.utils import resample  # For resampling the dataset
from sklearn.preprocessing import StandardScaler  # For feature scaling
from sklearn.ensemble import RandomForestClassifier  # For the Random Forest model
from sklearn import metrics, preprocessing  # For evaluation metrics and preprocessing
from sklearn.model_selection import cross_val_score  # For cross-validation
from sklearn.metrics import classification_report  # For generating classification reports

# Importing tqdm for progress bars
from tqdm import tqdm

In [2]:
# Importing pickle for object serialization
import pickle  # For saving and loading model objects

In [3]:
# Loading data from a pickle file
with open('2023-07-04.pkl', 'rb') as file:  # Open the pickle file in read-binary mode
    data = pickle.load(file)  # Load the data from the pickle file into the variable 'data'

In [4]:
# Unpacking the data loaded from the pickle file
keystrokes, accs, gyrs = data  # Assuming 'data' is a tuple or list with three elements

In [5]:
#Selecting a random sample of indices for testing
test_sess_index = random.sample(range(len(keystrokes)), round(0.2 * len(keystrokes)))

# 'len(keystrokes)' gives the total number of sessions or data points
# 'round(0.2 * len(keystrokes))' calculates 20% of the total sessions and rounds to the nearest integer
# 'random.sample(range(len(keystrokes)), round(0.2 * len(keystrokes)))' selects random indices from the range
# of session indices, equivalent to choosing 20% of the sessions randomly for testing

In [6]:
# List of randomly selected indices
test_sess_index 

[35, 15, 1, 12, 24, 33, 26, 5]

In [7]:
def create_ids(x):
    final = pd.DataFrame()  # Initialize an empty DataFrame to store final data
    for i in range(len(keystrokes)):  # Iterate through all sessions or data points
        if i not in test_sess_index:  # Check if the session index 'i' is not in the test set
            x[i]['id'] = i  # Assign 'id' column with the session index 'i' to the dataframe x[i]
            final = pd.concat([final, x[i]])  # Concatenate dataframe x[i] to the final DataFrame
    return final  # Return the final concatenated DataFrame

def create_test_ids(x):
    final = pd.DataFrame()  # Initialize an empty DataFrame to store final test data
    for i in test_sess_index:  # Iterate through the indices selected for testing
        x[i]['id'] = i  # Assign 'id' column with the session index 'i' to the dataframe x[i]
        final = pd.concat([final, x[i]])  # Concatenate dataframe x[i] to the final test DataFrame
    return final  # Return the final concatenated test DataFrame

In [8]:
# Creating datasets with 'id' columns for training
key = create_ids(keystrokes)  # Create 'key' dataset with 'id' column from 'keystrokes'
acc = create_ids(accs)        # Create 'acc' dataset with 'id' column from 'accs'
gyr = create_ids(gyrs)        # Create 'gyr' dataset with 'id' column from 'gyrs'

# Creating datasets with 'id' columns for testing
t_key = create_test_ids(keystrokes)  # Create 't_key' dataset with 'id' column from 'keystrokes' for testing
t_acc = create_test_ids(accs)        # Create 't_acc' dataset with 'id' column from 'accs' for testing
t_gyr = create_test_ids(gyrs)        # Create 't_gyr' dataset with 'id' column from 'gyrs' for testing

In [9]:
acc.reset_index(drop=True, inplace=True)  # Resetting the index of 'acc' DataFrame
gyr.reset_index(drop=True, inplace=True)  # Resetting the index of 'gyr' DataFrame

In [10]:
result_df = pd.DataFrame()  # Initialize an empty DataFrame to store results
matched_rows = {}  # Initialize an empty dictionary to track matched rows

# Renaming columns in 'gyr' DataFrame
gyr = gyr.rename({'x': 'x_g', 'y': 'y_g', 'z': 'z_g'}, axis=1)

# Iterate over rows in 'gyr' DataFrame using tqdm for progress tracking
for index, row in tqdm(gyr.iterrows(), total=gyr.shape[0]):
    # Calculate the absolute time difference between 'acc' and current 'gyr' row
    time_diffs = abs(acc['t'] - row['t'])
    min_time_diff = time_diffs.min()  # Find the minimum time difference
    
    closest_index = time_diffs.idxmin()  # Find the index of the minimum time difference
    matched_rows[closest_index] = True  # Mark the index as matched in the dictionary

    # Rename and convert the current 'gyr' row to a DataFrame row
    row_adjusted = row.rename({'t': 't2', 'id': 'id2'}).to_frame().T.reset_index(drop=True)

    # Combine the closest 'acc' row and the adjusted 'gyr' row into a single DataFrame row
    combined_row = pd.concat([acc.iloc[[closest_index]].reset_index(drop=True), row_adjusted], axis=1)
    combined_row['TimeDiff'] = min_time_diff  # Add the 'TimeDiff' column to the combined row

    # Concatenate the combined row to the 'result_df' DataFrame
    result_df = pd.concat([result_df, combined_row], ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████████| 22362/22362 [03:10<00:00, 117.43it/s]


In [11]:
acc.set_index('t')  # Setting 't' column as index in 'acc' DataFrame (but not assigning the result to acc)
result_df.set_index('t')  # Setting 't' column as index in 'result_df' DataFrame (but not assigning the result to result_df)

# Merging 'acc' and 'result_df' DataFrames based on 't' and 'id' columns, using outer join
merged_df = pd.merge(acc, result_df, on=['t', 'id'], how='outer').sort_values(['id', 't'])

# Dropping unnecessary columns from merged DataFrame
merged_df.drop(['x_y', 'y_y', 'z_y', 'id2'], axis=1, inplace=True)

# Renaming columns in merged DataFrame
merged_df.rename({'x_x': 'x', 'y_x': 'y', 'z_x': 'z'}, axis=1, inplace=True)


In [12]:
merged_df2 = merged_df.copy()

# This line creates a deep copy of the DataFrame 'merged_df' and assigns it to 'merged_df2'.
# Changes made to 'merged_df2' will not affect 'merged_df', and vice versa.

In [13]:
# Convert 'x_g' column to numeric values, coercing errors to NaN
merged_df2['x_g'] = pd.to_numeric(merged_df2['x_g'], errors='coerce')

# Convert 'y_g' column to numeric values, coercing errors to NaN
merged_df2['y_g'] = pd.to_numeric(merged_df2['y_g'], errors='coerce')

# Convert 'z_g' column to numeric values, coercing errors to NaN
merged_df2['z_g'] = pd.to_numeric(merged_df2['z_g'], errors='coerce')

In [14]:
#merged_df2[['x_g', 'y_g', 'z_g']] = merged_df2.groupby('id')[['x_g', 'y_g', 'z_g']].apply(lambda group: group.interpolate(method='polynomial',order=5))
merged_df2[['x_g', 'y_g', 'z_g']] = merged_df2.groupby('id')[['x_g', 'y_g', 'z_g']].apply(lambda group: group.interpolate(method='linear'))
#merged_df2[['x_g', 'y_g', 'z_g']] = merged_df2.groupby('id')[['x_g', 'y_g', 'z_g']].apply(lambda group: group.interpolate(method='spline',order=3))

In [15]:
missing_counts = merged_df2.isnull().sum()

# This line calculates the number of missing values (NaN) in each column of the DataFrame 'merged_df2'.

In [16]:
# Assuming 'key' is a DataFrame

# Drop the 'ut' column from 'key'
# key.drop('ut', axis=1, inplace=True)

# Add a new column 'target' with all values set to 1 in 'key'
key['target'] = 1

In [17]:
# Rename the column 'dt' to 't' in the DataFrame 'key'
key = key.rename({'dt': 't'}, axis=1)

In [18]:
def replace_nans(df):
    # Sort the DataFrame 'df' by the column 't'
    df = df.sort_values(by='t')
    
    # Find the times where 'target' is 1.0
    times_with_1 = df[df['target'] == 1.0]['t']

    # Iterate over these times and replace NaNs within 0.3 seconds before and after
    for time in times_with_1:
        start_time = time - timedelta(seconds=0.6)
        end_time = time + timedelta(seconds=0.6)
        
        # Locate rows where 't' is within the specified time range and 'target' is NaN
        df.loc[(df['t'] >= start_time) & (df['t'] <= end_time) & (df['target'].isna()), 'target'] = 1.0

    # Replace remaining NaNs in 'target' with 0
    df['target'].fillna(0, inplace=True)

    return df

In [19]:
def preprocessing(acce, gyro, keys):
    # Combining acc and gyr
    keys = keys.rename({'dt':'t'},axis=1)
    gyro = gyro.rename({'x': 'x_g', 'y': 'y_g', 'z': 'z_g'}, axis=1)
    df = pd.concat([acce, gyro]).sort_values(['id','t'])
    df[['x_g', 'y_g', 'z_g']] = df.groupby('id')[['x_g', 'y_g', 'z_g']].apply(lambda group: group.interpolate(method='linear'))
    df.dropna(inplace=True)
    df.reset_index(inplace=True,drop=True)

    # Removing 'ut' and creating the target value
    keys.drop('ut', axis=1, inplace=True)
    keys['target'] = 1

    # Combining acc, gyr, and keys
    merged_df = pd.merge(df, keys, on=['t','id'], how='outer').sort_values('t')

    labeled_df = replace_nans(merged_df)

    # Convert 'target' column back to numeric
    labeled_df['target'] = pd.to_numeric(labeled_df['target'])

    labeled_df.dropna(inplace=True)

    return labeled_df

In [27]:
# Assuming acc is the DataFrame with accelerometer data
res_acc = acc  # Placeholder, replace with actual processing code

# Assuming gyr is the DataFrame with gyroscope data
res_gyr = gyr  # Placeholder, replace with actual processing code

res_acc_t = t_acc  # Placeholder, replace with actual test accelerometer data processing
res_gyr_t = t_gyr  # Placeholder, replace with actual test gyroscope data processing

In [28]:
train_sessions = preprocessing(res_acc,res_gyr,key)
test_sessions = preprocessing(res_acc_t,res_gyr_t,t_key) 

In [29]:
def add_fft_features(df, column, window_size):
    
    # Function to calculate energy of FFT
    def fft_energy(x):
        return np.sum(np.abs(np.fft.fft(x))**2) / len(x)

    # Function to calculate entropy of FFT
    def fft_entropy(x):
        fft_abs = np.abs(np.fft.fft(x))
        return -np.sum((fft_abs / np.sum(fft_abs)) * np.log(fft_abs / np.sum(fft_abs) + 1e-10))

    # Function to calculate spectral centroid
    def spectral_centroid(x):
        fft_abs = np.abs(np.fft.fft(x))
        return np.sum(np.arange(len(x)) * fft_abs) / np.sum(fft_abs)

    # Function to calculate spectral spread
    def spectral_spread(x):
        fft_abs = np.abs(np.fft.fft(x))
        centroid = spectral_centroid(x)
        return np.sqrt(np.sum(((np.arange(len(x)) - centroid)**2) * fft_abs) / np.sum(fft_abs))

    # Function to calculate spectral flatness
    def spectral_flatness(x):
        fft_abs = np.abs(np.fft.fft(x))
        return np.exp(np.mean(np.log(fft_abs + 1e-10))) / np.mean(fft_abs)

    # Apply each function separately to the specified column using rolling window
    df[f'{column}_fft_energy'] = df[column].rolling(window=window_size, center=True).apply(fft_energy, raw=True)
    df[f'{column}_fft_entropy'] = df[column].rolling(window=window_size, center=True).apply(fft_entropy, raw=True)
    df[f'{column}_spectral_centroid'] = df[column].rolling(window=window_size, center=True).apply(spectral_centroid, raw=True)
    df[f'{column}_spectral_spread'] = df[column].rolling(window=window_size, center=True).apply(spectral_spread, raw=True)
    df[f'{column}_spectral_flatness'] = df[column].rolling(window=window_size, center=True).apply(spectral_flatness, raw=True)

    return df

In [46]:
def rolling_window_features(df, window_size=60):
    # Initialize an empty DataFrame to store the final result
    result_df = pd.DataFrame()
    
    # Iterate over each unique 'id' in the input DataFrame 'df'
    for unique_id in df['id'].unique():
        # Filter the DataFrame for the current unique 'id'
        id_df = df[df['id'] == unique_id]
        
        # Create a temporary DataFrame and make a copy of the filtered DataFrame
        temp_df = pd.DataFrame()
        temp_df = id_df.copy()
        
        # Calculate accelerometer and gyroscope magnitudes
        id_df['acc_magnitude'] = np.sqrt(id_df['x']**2 + id_df['y']**2 + id_df['z']**2)
        id_df['gyr_magnitude'] = np.sqrt(id_df['x_g']**2 + id_df['y_g']**2 + id_df['z_g']**2)
        
        # Calculate rolling window statistics for accelerometer data ('x', 'y', 'z')
        for axis in ['x', 'y', 'z']:
            temp_df[f'acc_{axis}_mean'] = id_df[axis].rolling(window=window_size).mean()
            temp_df[f'acc_{axis}_std'] = id_df[axis].rolling(window=window_size).std()
            temp_df[f'acc_{axis}_max'] = id_df[axis].rolling(window=window_size).max()
            temp_df[f'acc_{axis}_min'] = id_df[axis].rolling(window=window_size).min()
            temp_df[f'acc_{axis}_min_max_diff'] = temp_df[f'acc_{axis}_max'] - temp_df[f'acc_{axis}_min']
            temp_df[f'acc_{axis}_skew'] = id_df[axis].rolling(window=window_size).skew()
            temp_df[f'acc_{axis}_kurtosis'] = id_df[axis].rolling(window=window_size).kurt()
            temp_df[f'acc_{axis}_energy'] = id_df[axis].rolling(window=window_size).apply(lambda x: np.sum(x**2), raw=True)
            temp_df[f'acc_{axis}_median'] = id_df[axis].rolling(window=window_size).median()
            temp_df[f'acc_{axis}_variance'] = id_df[axis].rolling(window=window_size).var()
            temp_df[f'acc_{axis}_sum'] = id_df[axis].rolling(window=window_size).sum()
        
        # Calculate rolling window statistics for gyroscope data ('x_g', 'y_g', 'z_g')
        for axis in ['x_g', 'y_g', 'z_g']:
            temp_df[f'gyr_{axis}_mean'] = id_df[axis].rolling(window=window_size).mean()
            temp_df[f'gyr_{axis}_std'] = id_df[axis].rolling(window=window_size).std()
            temp_df[f'gyr_{axis}_max'] = id_df[axis].rolling(window=window_size).max()
            temp_df[f'gyr_{axis}_min'] = id_df[axis].rolling(window=window_size).min()
            temp_df[f'gyr_{axis}_min_max_diff'] = temp_df[f'gyr_{axis}_max'] - temp_df[f'gyr_{axis}_min']
            temp_df[f'gyr_{axis}_skew'] = id_df[axis].rolling(window=window_size).skew()
            temp_df[f'gyr_{axis}_kurtosis'] = id_df[axis].rolling(window=window_size).kurt()
            temp_df[f'gyr_{axis}_energy'] = id_df[axis].rolling(window=window_size).apply(lambda x: np.sum(x**2), raw=True)
            temp_df[f'gyr_{axis}_median'] = id_df[axis].rolling(window=window_size).median()
            temp_df[f'gyr_{axis}_variance'] = id_df[axis].rolling(window=window_size).var()
            temp_df[f'gyr_{axis}_sum'] = id_df[axis].rolling(window=window_size).sum()    
        
        # Drop rows with NaN values resulting from rolling window calculations
        temp_df.dropna(inplace=True)    
        
        # Calculate FFT features for each axis ('x', 'y', 'z', 'x_g', 'y_g', 'z_g')
        for axis in ['x', 'y', 'z', 'x_g', 'y_g', 'z_g']:
            temp_df = add_fft_features(temp_df, axis, window_size)
        
        # Concatenate the temporary DataFrame 'temp_df' to the result DataFrame 'result_df'
        result_df = pd.concat([result_df, temp_df], ignore_index=True)
        
        # Drop rows with NaN values from the result DataFrame 'result_df'
        result_df.dropna(inplace=True)
    
    # Return the final DataFrame containing all rolling window features and FFT-based features
    return result_df

In [47]:
# Apply rolling window feature extraction to train_sessions DataFrame
rolled_train = rolling_window_features(train_sessions)

# Apply rolling window feature extraction to test_sessions DataFrame
rolled_test = rolling_window_features(test_sessions)

In [88]:
# Now, you can check the shape of the rolled_train DataFrame
rolled_train.shape

(232286, 105)

In [48]:
# Calculate value counts of the 'target' column
rolled_train['target'].value_counts()

0.0    122001
1.0    100220
Name: target, dtype: int64

In [49]:
rolled_train['target'].value_counts() 

0.0    122001
1.0    100220
Name: target, dtype: int64

In [50]:
# Downsample class 0 to balance the classes in the training data
class_0 = rolled_train[rolled_train['target'] == 0]
class_1 = rolled_train[rolled_train['target'] == 1]

# Determine the number of samples to keep from class 0 (e.g., same as class 1 or a desired ratio)
desired_class_0_samples = len(class_1)
downsampled_class_0 = resample(class_0, n_samples=desired_class_0_samples, random_state=42)

# Combine the downsampled class 0 with class 1 to create the balanced training dataset
balanced_train_sessions = pd.concat([downsampled_class_0, class_1])

# Extract features and labels for training
X_train = balanced_train_sessions.drop(['id', 't', 'target'], axis=1)
y_train = balanced_train_sessions['target']

# Extract features and labels for testing (assuming you have already loaded and preprocessed test data)
X_test = rolled_test.drop(['id', 't', 'target'], axis=1)
y_test = rolled_test['target']

In [51]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform the training data
X_train = scaler.fit_transform(X_train)

# Transform the test data using the fitted scaler
X_test = scaler.transform(X_test)

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(random_state=19, n_jobs=-1)

# Fit the Random Forest model on the training data
rf.fit(X_train, y_train)

# Predict the labels for the test data
y_pred = rf.predict(X_test)

# Compute the confusion matrix to evaluate the accuracy of the classification
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print(cnf_matrix)

# Print the accuracy score of the Random Forest model
print("Random Forest score is: %f" % rf.score(X_test, y_test))

# Print the classification report for detailed precision, recall, F1-score, and support metrics
print(classification_report(y_test, y_pred))

[[17151  5652]
 [ 8966 16355]]
Random Forest score is: 0.696243
              precision    recall  f1-score   support

         0.0       0.66      0.75      0.70     22803
         1.0       0.74      0.65      0.69     25321

    accuracy                           0.70     48124
   macro avg       0.70      0.70      0.70     48124
weighted avg       0.70      0.70      0.70     48124

