# Phase 1: Setup and Data Preparation

## Import Libraries and Define File Paths

In [6]:
import pandas as pd
import numpy as np

# Data visualisation libraries (to be used later)
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries (to be used later)
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Path to your data files
data_path = '../data/'
logon_file = data_path + 'logon.csv'
file_file = data_path + 'file.csv'
email_file = data_path + 'email.csv'
device_file = data_path + 'device.csv'
psychometric_file = data_path + 'psychometric.csv'

print("Libraries imported and file paths defined.")

Libraries imported and file paths defined.


## Load Datasets

In [7]:
# Load the datasets
logon = pd.read_csv(logon_file)
file= pd.read_csv(file_file)
email = pd.read_csv(email_file)
device= pd.read_csv(device_file)
psychometric = pd.read_csv(psychometric_file)

print("Datasets loaded successfully.")
print("Logon DataFrame shape:", logon.shape)
print("File DataFrame shape:", file.shape)
print("Email DataFrame shape:", email.shape)
print("Device DataFrame shape:", device.shape)
print("Psychometric DataFrame shape:", psychometric.shape)

Datasets loaded successfully.
Logon DataFrame shape: (3530285, 5)
File DataFrame shape: (2014883, 9)
Email DataFrame shape: (10994957, 12)
Device DataFrame shape: (1551828, 6)
Psychometric DataFrame shape: (4000, 7)


## Timestamps Convertion

In [8]:
# Convert timestamps to datetime format
logon['date'] = pd.to_datetime(logon['date'])
file['date'] = pd.to_datetime(file['date'])
email['date'] = pd.to_datetime(email['date'])
device['date'] = pd.to_datetime(device['date'])

print("Timestamp columns converted to datetime.")
print("Example of converted timestamp:", logon['date'].iloc[0])

Timestamp columns converted to datetime.
Example of converted timestamp: 2010-01-02 02:19:18


## Datasets Merge

In [9]:
# Create a unique master index of all user-date pairs
all_user_dates = pd.concat([logon[['user', 'date']], 
                           file[['user', 'date']], 
                           email[['user', 'date']],
                           device[['user', 'date']]]).drop_duplicates()

# Use this master index to create a new, empty DataFrame
master_df = all_user_dates.copy()

# Iteratively merge the features from each log file
master_df = pd.merge(master_df, logon.drop_duplicates(subset=['user', 'date']), on=['user', 'date'], how='left')
master_df = pd.merge(master_df, file.drop_duplicates(subset=['user', 'date']), on=['user', 'date'], how='left')
merged = pd.merge(master_df, email.drop_duplicates(subset=['user', 'date']), on=['user', 'date'], how='left')

print("All datasets merged using the master index.")
print("Merged DataFrame shape:", merged.shape)

All datasets merged using the master index.
Merged DataFrame shape: (17701957, 22)


## Handle Missing Values and Duplicates

In [10]:
# Fill missing numerical values with 0, as NaN often means no activity occurred
# You can select specific columns or apply to all numerical ones.
merged = merged.fillna(0)

# Drop any potential duplicate rows
merged.drop_duplicates(inplace=True)

print("Missing values filled with 0.")
print("Duplicate rows dropped.")
print("Final Merged DataFrame shape:", merged.shape)

Missing values filled with 0.
Duplicate rows dropped.
Final Merged DataFrame shape: (17701957, 22)


## Save the Merged Dataset

In [11]:
# Create an 'outputs' directory if it doesn't exist
import os
if not os.path.exists('../outputs'):
    os.makedirs('../outputs')

# Save the final dataset
merged.to_csv('../outputs/merged_sessions.csv', index=False)

print("Final merged dataset saved to '../outputs/merged_sessions.csv'.")

Final merged dataset saved to './outputs/merged_sessions.csv'.
