# Data preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import *
from tensorflow import keras
import tensorflow as tf
from numpy import *
import re
from tqdm import tqdm


In [None]:
# Compiled once so the pattern is not re-parsed / re-looked-up for every row.
_MOVE_PATTERN = re.compile(r'Move From \[(\d+), (\d+)\] To \[(\d+), (\d+)\]')


def set_movedis_movelength(row):
    """Return ``(dis, length)`` for a move event, or ``(0, 0)`` otherwise.

    ``row['activity']`` is searched for the text
    ``Move From [x1, y1] To [x2, y2]``. When it is found:

    * ``dis`` is ``abs(x2 - x1)``;
    * ``length`` is ``y1 - x1``.

    NOTE(review): presumably ``[x1, y1]`` is the source span and ``x2`` the
    start of the destination span - confirm against the log format.

    Args:
        row: Mapping (for example a DataFrame row) with a string under
            ``'activity'``.

    Returns:
        A ``(dis, length)`` tuple; both are 0 when the activity is not a move.
    """
    match = _MOVE_PATTERN.search(row['activity'])
    if match is None:
        return 0, 0

    # The fourth captured number (y2) is not needed for either feature.
    x1, y1, x2, _ = map(int, match.groups())
    dis = abs(x2 - x1)
    length = y1 - x1
    return dis, length

def set_pastelength(row):
    """Return the length of ``row['text_change']`` for a 'Paste' row, else 0."""
    is_paste = row['activity'] == 'Paste'
    return len(row['text_change']) if is_paste else 0

def set_replacelength(row):
    """Return how many characters shorter the text became in a 'Replace' row.

    ``row['text_change']`` is expected to look like ``'<before> => <after>'``.
    The result is ``len(before) - len(after)``, so it is negative when the
    replacement is longer than the text it replaced. Rows whose activity is
    not ``'Replace'`` yield 0.

    The text is divided at the FIRST ``' => '`` only, so a replacement whose
    text itself contains ``' => '`` no longer raises ``ValueError``. If the
    separator is missing altogether, the whole string counts as the "before"
    text and the "after" text is empty.

    Args:
        row: Mapping with ``'activity'`` and ``'text_change'`` entries.

    Returns:
        The integer length difference, or 0 for non-replace rows.
    """
    if row['activity'] != 'Replace':
        return 0

    before_text, _, after_text = row['text_change'].partition(' => ')
    return len(before_text) - len(after_text)

def set_RClength(row):
    """Return the length of the removed text for a 'Remove/Cut' row, else 0."""
    if row['activity'] != 'Remove/Cut':
        return 0
    return len(row['text_change'])

# Event names, grouped by the coarse label that convert_down_event_char
# assigns to them.
Keyboard_keywords = [
    'Leftclick', 'Shift', 'Backspace', 'Enter', 'ArrowLeft', 'ArrowRight',
    'Tab', 'ArrowUp', 'ArrowDown', 'Rightclick',
    'CapsLock', 'Control', 'Delete', 'Home', 'End', 'Insert', 'NumLock', 'Alt',
    'ContextMenu', 'PageDown', 'Middleclick', 'ScrollLock', 'Space',
    'Cancel', 'Escape', 'Clear', 'OS',
]
# Function keys F1 through F15 are appended after the named keys.
Keyboard_keywords.extend(f'F{number}' for number in range(1, 16))

Media_keywords = [
    'Meta', 'AudioVolumeMute', 'MediaPlayPause', 'AudioVolumeUp',
    'AudioVolumeDown', 'MediaTrackPrevious', 'MediaTrackNext', 'Pause',
]
Other_keywords = ['Dead', 'Process', 'AltGraph']
Unknown_keywords = ['Unidentified', 'Unknownclick']

def convert_down_event_char(value):
    """Map a raw event name to a coarse category label.

    A single alphabetic or digit character becomes ``'q'``. Otherwise the
    value is looked up in Keyboard_keywords, Media_keywords and
    Other_keywords, in that order, and the matching group's label is
    returned. Everything else, including the names in Unknown_keywords,
    is labelled ``"unknown_keyword"``.
    """
    if len(value) == 1 and (value.isalpha() or value.isdigit()):
        return 'q'

    labelled_groups = (
        (Keyboard_keywords, "keyboard_keyword"),
        (Media_keywords, "media_keyword"),
        (Other_keywords, "other_keyword"),
    )
    for names, label in labelled_groups:
        if value in names:
            return label

    # Known "unknown" names and unrecognised values share the same label.
    return "unknown_keyword"

# Lookup from integer activity code to activity name; the codes are the
# positions in this list (0 = 'Nonproduction' ... 5 = 'Move').
actype = dict(enumerate(
    ['Nonproduction', 'Input', 'Remove/Cut', 'Paste', 'Replace', 'Move']
))

def set_activity(row):
    """Return the integer code for ``row['activity']``.

    The five exactly-named activities map to 0-4; any other value (such as
    the free-form ``'Move From ... To ...'`` text) falls back to 5.
    """
    named_codes = {
        'Nonproduction': 0,
        'Input': 1,
        'Remove/Cut': 2,
        'Paste': 3,
        'Replace': 4,
    }
    return named_codes.get(row['activity'], 5)
def preprocess(df):
    """Add per-event feature columns to *df* in place.

    The frame is sorted by ``id`` then ``event_id`` and re-indexed, the
    derived columns below are added, and the raw columns ``up_event``,
    ``activity``, ``down_event``, ``text_change`` and ``cursor_position``
    are dropped.

    Added columns:
        delta_cursor: change in ``cursor_position`` from the previous event
            of the same ``id`` (0 for the first event of each ``id``).
        movedis, movelength: from ``set_movedis_movelength``.
        pastelength, replacelength, RClength: from the matching ``set_*``
            helpers.
        keyword: category from ``convert_down_event_char(down_event)``.
        actype: integer code from ``set_activity``.
        thinktime: ``down_time`` minus the previous event's ``up_time``
            within the same ``id``, floored at 0 (NaN for the first event).
        contype: 1 when ``down_time`` is not later than the previous event's
            ``up_time`` within the same ``id``, else 0.

    Args:
        df: Event log with at least the columns ``id``, ``event_id``,
            ``cursor_position``, ``activity``, ``text_change``,
            ``down_event``, ``up_event``, ``down_time`` and ``up_time``.

    Returns:
        None. *df* is modified in place.
    """
    # sort_values returns a new frame unless inplace=True; without it the
    # sort was silently discarded. Re-index so row order and labels agree.
    df.sort_values(by=['id', 'event_id'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Previous-event lookups are taken per id so that the first event of one
    # id is never compared with the last event of the preceding id.
    by_id = df.groupby('id', sort=False)
    previous_up_time = by_id['up_time'].shift()

    df['delta_cursor'] = by_id['cursor_position'].diff().fillna(0)

    df['movedis'], df['movelength'] = zip(*df.apply(set_movedis_movelength, axis=1))

    df['pastelength'] = df.apply(set_pastelength, axis=1)

    df['replacelength'] = df.apply(set_replacelength, axis=1)

    df['RClength'] = df.apply(set_RClength, axis=1)

    df['keyword'] = df['down_event'].apply(convert_down_event_char)

    df['actype'] = df.apply(set_activity, axis=1)

    df['thinktime'] = (df['down_time'] - previous_up_time).clip(lower=0)

    # Comparison with the NaN of a first event is False, giving contype 0.
    df['contype'] = (df['down_time'] <= previous_up_time).astype(int)

    df.drop(["up_event", "activity", "down_event", "text_change", "cursor_position"], axis=1, inplace=True)
# Inclusive (low, high) ranges used to bucket most per-event magnitudes.
_COMMON_BOUNDS = ((0, 1), (2, 10), (11, 30), (31, 80))
# movelength uses slightly different low-end ranges.
_MOVELENGTH_BOUNDS = ((0, 5), (6, 15), (16, 30), (31, 80))
# Negative cursor jumps, bucketed symmetrically to the positive ones.
_NEGATIVE_CURSOR_BOUNDS = ((-1, -1), (-10, -2), (-30, -11), (-80, -31))
# (source column, output prefix, ranges) for the non-negative bucketed columns.
_BINNED_COLUMNS = (
    ('movedis', 'md', _COMMON_BOUNDS),
    ('movelength', 'ml', _MOVELENGTH_BOUNDS),
    ('pastelength', 'pl', _COMMON_BOUNDS),
    ('replacelength', 'rl', _COMMON_BOUNDS),
    ('RClength', 'rc', _COMMON_BOUNDS),
)


def _range_counts(values, prefix, bounds, first_index=0):
    """Count entries of *values* inside each inclusive range in *bounds*."""
    return {
        f'{prefix}{first_index + offset}': values.between(low, high).sum()
        for offset, (low, high) in enumerate(bounds)
    }


def _magnitude_counts(values, prefix, bounds):
    """Range counts plus a final bucket for values above the last range."""
    counts = _range_counts(values, prefix, bounds)
    counts[f'{prefix}{len(bounds)}'] = (values > bounds[-1][1]).sum()
    return counts


def _input_run_lengths(actype):
    """Return the length of every maximal run of Input (code 1) events."""
    is_input = actype.eq(1).to_numpy().astype(np.int8)
    # Zero padding gives runs that touch either end a start and an end edge.
    edges = np.diff(np.concatenate(([0], is_input, [0])))
    run_starts = np.flatnonzero(edges == 1)
    run_ends = np.flatnonzero(edges == -1)
    return run_ends - run_starts


def getfeature(df, data):
    """Aggregate a preprocessed event log into one feature row per ``id``.

    For every ``id`` group in *df* the row contains:

    * ``word_count`` of the last event, number of events (``action_count``),
      sums of ``thinktime`` and ``action_time``, the count of events with
      ``contype == 1``, and ``total_time`` (last ``up_time`` minus first
      ``down_time``);
    * counts of each ``keyword`` category;
    * bucket counts ``dc0``-``dc9`` for ``delta_cursor`` and five buckets
      each (``md``, ``ml``, ``pl``, ``rl``, ``rc``) for ``movedis``,
      ``movelength``, ``pastelength``, ``replacelength`` and ``RClength``.
      Negative ``replacelength`` values fall in no bucket;
    * counts ``ac0``-``ac5`` of each ``actype`` code;
    * statistics over the lengths of maximal runs of consecutive Input
      events (``actype == 1``): how many runs there are, and the max, mean,
      median and variance of their lengths (all 0 when there is no run).

    Args:
        df: Event log with the columns listed above, as produced by
            ``preprocess``.
        data: DataFrame of earlier feature rows to extend. Pass an empty
            ``pd.DataFrame()`` to start fresh.

    Returns:
        A new DataFrame holding the rows of *data* followed by one row per
        ``id`` in *df*. *data* itself is not modified.
    """
    rows = []
    for id_value, group in tqdm(df.groupby('id')):
        keyword_counts = group['keyword'].value_counts()
        actype_counts = group['actype'].value_counts()
        delta_cursor = group['delta_cursor']

        run_lengths = _input_run_lengths(group['actype'])
        # Keep the statistics defined (as zeros) when nothing was typed.
        run_stats = run_lengths if run_lengths.size else np.zeros(1, dtype=int)

        row = {
            'id': id_value,
            'word_count': group['word_count'].iloc[-1],
            'contype_count': group['contype'].eq(1).sum(),
            'total_think': group['thinktime'].sum(),
            'total_time': group['up_time'].iloc[-1] - group['down_time'].iloc[0],
            'action_count': group.shape[0],
            'total_actiontime': group['action_time'].sum(),
            'keyboard_count': keyword_counts.get('keyboard_keyword', 0),
            'media_count': keyword_counts.get('media_keyword', 0),
            'other_count': keyword_counts.get('other_keyword', 0),
            'unknown_count': keyword_counts.get('unknown_keyword', 0),
        }

        # dc0-dc4: forward jumps, dc5-dc8: backward jumps, dc9: below -80.
        row.update(_magnitude_counts(delta_cursor, 'dc', _COMMON_BOUNDS))
        row.update(_range_counts(delta_cursor, 'dc', _NEGATIVE_CURSOR_BOUNDS, first_index=5))
        row['dc9'] = (delta_cursor < -80).sum()

        for column, prefix, bounds in _BINNED_COLUMNS:
            row.update(_magnitude_counts(group[column], prefix, bounds))

        row.update({f'ac{code}': actype_counts.get(code, 0) for code in range(6)})

        row.update({
            'consecutive_input_count': int(run_lengths.size),
            'consecutive_input_max': np.max(run_stats),
            'consecutive_input_mean': np.mean(run_stats),
            'consecutive_input_mid': np.median(run_stats),
            'consecutive_input_var': np.var(run_stats),
        })
        rows.append(row)

    # Build the frame once instead of appending row by row, which copied the
    # whole frame on every iteration and relied on the private _append.
    features = pd.DataFrame(rows)
    if data.shape == (0, 0):
        return features
    return pd.concat([data, features], ignore_index=True)


In [None]:
# Load the raw event logs; both paths are relative to the working directory.
train_logs = pd.read_csv('train_logs.csv')
test_logs = pd.read_csv('test_logs.csv')

In [None]:


# preprocess() modifies each frame in place and returns None, so the
# results are intentionally not assigned.
preprocess(train_logs)
preprocess(test_logs)


In [None]:

# Start from empty frames; getfeature() returns them extended with one
# feature row per id.
trainX = pd.DataFrame()
testX = pd.DataFrame()

trainX = getfeature(train_logs,trainX)
testX = getfeature(test_logs,testX)

# NOTE(review): trainY keeps the row order of train_scores.csv, while the
# rows of trainX follow the df.groupby('id') iteration order. Confirm the
# two line up id for id (or join on the id column if the file has one).
train_scores = pd.read_csv('train_scores.csv')
trainY = train_scores['score']

In [None]:
# Remove the 'id' column so only feature columns go into scaling and fitting.
trainX = trainX.drop(['id'],axis=1)


In [None]:
print(trainX.shape)
# Rescale every feature column with a MinMaxScaler fitted on the training
# features. fit_transform returns a NumPy array, so trainX is no longer a
# DataFrame after this cell.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
trainX = scaler.fit_transform(trainX)
# Replace any NaN left in the scaled array with 0.
trainX = np.nan_to_num(trainX)


In [None]:
# Ten log-spaced candidates between 1e-6 and 1, searched for both the ridge
# penalty (alpha) and the RBF kernel coefficient (gamma).
search_values = np.logspace(-6, 0, 10)
paramgrid = {'alpha': search_values, 'gamma': search_values}

krr = kernel_ridge.KernelRidge(kernel='rbf')
krrcv = model_selection.GridSearchCV(estimator=krr, param_grid=paramgrid, cv=5)
krrcv.fit(trainX, trainY)

# Report the selected hyper-parameters, then the RMSE of the refit model on
# the same data it was trained on (an in-sample figure).
print("Best alpha value for Kernel Ridge Regression: ", krrcv.best_params_)
krr_train_pred = krrcv.predict(trainX)
krr_rmse = np.sqrt(metrics.mean_squared_error(trainY, krr_train_pred))
print("RMSE for Kernel Ridge Regression: ", krr_rmse)

In [None]:
from lightgbm import LGBMRegressor

# Fit LightGBM with its default settings and report the RMSE on the training
# data itself (an in-sample figure).
lgbm = LGBMRegressor()
lgbm.fit(trainX, trainY)
lgbm_train_pred = lgbm.predict(trainX)
lgbm_rmse = np.sqrt(metrics.mean_squared_error(trainY, lgbm_train_pred))
print("RMSE for LightGBM: ", lgbm_rmse)