# 2 Feature Engineering

## Step 1: Create a Sample Dataset

In [None]:
import pandas as pd
import numpy as np
import warnings

# Suppress DtypeWarnings for a cleaner output
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

# Load the full, merged dataset from Phase 1
merged_sessions = pd.read_csv('../outputs/merged_sessions.csv')

# --- Create a smaller sample of the data for faster processing ---
sample_size = 400000 # You can adjust this value as needed
sampled_merged_sessions = merged_sessions.sample(n=sample_size, random_state=42)

print(f"Created a sample of {sample_size} rows.")
print("Sampled DataFrame shape:", sampled_merged_sessions.shape)

# Save this sampled DataFrame for Phase 2
sampled_merged_sessions.to_csv('../outputs/sampled_merged_sessions.csv', index=False)

## Step 2:  Engineer the Core Features

In [4]:
# Load the sampled dataset for feature engineering
sampled_merged_sessions = pd.read_csv('../outputs/sampled_merged_sessions.csv')

# Ensure the 'date' column is in datetime format
sampled_merged_sessions['date'] = pd.to_datetime(sampled_merged_sessions['date'])
sampled_merged_sessions['hour'] = sampled_merged_sessions['date'].dt.hour

# Create 'after_hours_login' feature
after_hours_start = 17
after_hours_end = 9
sampled_merged_sessions['after_hours_login'] = sampled_merged_sessions['hour'].apply(
    lambda h: 1 if (h >= after_hours_start or h < after_hours_end) else 0
)

# Group by user and date to count activities
activity_counts = sampled_merged_sessions.groupby(['user', 'date']).agg(
    num_emails_sent=('from', 'count'),
    num_files_accessed=('filename', 'count'),
    num_device_events=('id', 'count')
).reset_index()

# Merge the after_hours_login flag
after_hours = sampled_merged_sessions.groupby(['user', 'date'])['after_hours_login'].max().reset_index()
uba_features = pd.merge(activity_counts, after_hours, on=['user', 'date'], how='left')

print("All core features have been engineered and saved to 'uba_features.csv'.")

All core features have been engineered and saved to 'uba_features.csv'.


## Step 3: Adding optional features - External Email Ratio and Psychometric Data

### 3A: External Mail Ratio

In [14]:
# Assuming 'sampled_merged_sessions' is your current DataFrame
# and 'uba_features' is the DataFrame with core features

# Create a copy of the sampled data to work with email-specific columns
email_data_sample = sampled_merged_sessions.copy()

# Replace with your actual internal domain
internal_domain = 'dtaa.com' 

# Create a boolean column for external emails
email_data_sample['is_external'] = email_data_sample['to'].apply(
    lambda recipients: any(internal_domain not in str(r) for r in str(recipients).split(';'))
)

# Group by user and date to get email counts
email_ratio = email_data_sample.groupby(['user', 'date']).agg(
    total_emails=('from', 'count'),
    external_emails=('is_external', 'sum')
).reset_index()

# --- REVISED CODE START ---

# Ensure the 'date' column is a datetime object before the merge
email_ratio['date'] = pd.to_datetime(email_ratio['date'])

# Calculate the external email ratio
email_ratio['external_email_ratio'] = (
    email_ratio['external_emails'] / email_ratio['total_emails']
).fillna(0) # Fill NaN with 0 if no emails were sent

# Merge this new feature into your main features DataFrame
# Use suffixes to handle potential column name conflicts
uba_features = pd.merge(uba_features, email_ratio[['user', 'date', 'external_email_ratio']], on=['user', 'date'], how='left', suffixes=('', '_email'))

# --- REVISED CODE END ---

print("External email ratio feature added from sampled data.")
uba_features.head()

External email ratio feature added from sampled data.


Unnamed: 0,user,date,num_emails_sent,num_files_accessed,num_device_events,after_hours_login,employee_name_x,O_x,C_x,E_x,...,E,A,N,employee_name_psych,O_psych,C_psych,E_psych,A_psych,N_psych,external_email_ratio
0,AAB0162,2010-01-22 11:16:55,1,1,1,0,Amos Ahmed Burch,45,36,33,...,33,15,33,Amos Ahmed Burch,45,36,33,15,33,1.0
1,AAB0162,2010-02-05 08:14:23,1,1,1,1,Amos Ahmed Burch,45,36,33,...,33,15,33,Amos Ahmed Burch,45,36,33,15,33,0.0
2,AAB0162,2010-02-08 07:53:56,1,1,1,1,Amos Ahmed Burch,45,36,33,...,33,15,33,Amos Ahmed Burch,45,36,33,15,33,0.0
3,AAB0162,2010-02-08 08:40:56,1,1,1,1,Amos Ahmed Burch,45,36,33,...,33,15,33,Amos Ahmed Burch,45,36,33,15,33,0.0
4,AAB0162,2010-02-15 09:50:21,1,1,1,0,Amos Ahmed Burch,45,36,33,...,33,15,33,Amos Ahmed Burch,45,36,33,15,33,0.0


### 3B: Psychometric Traits

In [13]:
# Load the psychometric dataset
psychometric = pd.read_csv('../data/psychometric.csv')

# --- Check the column names ---
print("Psychometric DataFrame columns:")
print(psychometric.columns)

# You may need to rename the column that contains the user ID
# For example, if the column is named 'employee_id', you would use:
psychometric.rename(columns={'user_id': 'user'}, inplace=True)

# Merge psychometric traits into the main features DataFrame
# We'll use a 'left' merge and specify suffixes
uba_features = pd.merge(uba_features, psychometric, on='user', how='left', suffixes=('', '_psych'))

# Fill any new missing values with a suitable value (e.g., 0)
uba_features.fillna(0, inplace=True)

print("Psychometric traits added.")
uba_features.head()

Psychometric DataFrame columns:
Index(['employee_name', 'user_id', 'O', 'C', 'E', 'A', 'N'], dtype='object')
Psychometric traits added.


Unnamed: 0,user,date,num_emails_sent,num_files_accessed,num_device_events,after_hours_login,employee_name_x,O_x,C_x,E_x,...,C,E,A,N,employee_name_psych,O_psych,C_psych,E_psych,A_psych,N_psych
0,AAB0162,2010-01-22 11:16:55,1,1,1,0,Amos Ahmed Burch,45,36,33,...,36,33,15,33,Amos Ahmed Burch,45,36,33,15,33
1,AAB0162,2010-02-05 08:14:23,1,1,1,1,Amos Ahmed Burch,45,36,33,...,36,33,15,33,Amos Ahmed Burch,45,36,33,15,33
2,AAB0162,2010-02-08 07:53:56,1,1,1,1,Amos Ahmed Burch,45,36,33,...,36,33,15,33,Amos Ahmed Burch,45,36,33,15,33
3,AAB0162,2010-02-08 08:40:56,1,1,1,1,Amos Ahmed Burch,45,36,33,...,36,33,15,33,Amos Ahmed Burch,45,36,33,15,33
4,AAB0162,2010-02-15 09:50:21,1,1,1,0,Amos Ahmed Burch,45,36,33,...,36,33,15,33,Amos Ahmed Burch,45,36,33,15,33


## Step 4: Save the Final Features

In [15]:
import os

# Create an 'outputs' directory if it doesn't exist
if not os.path.exists('../outputs'):
    os.makedirs('../outputs')

# Save the final features
uba_features.to_csv('../outputs/uba_features.csv', index=False)

print("Engineered features saved to 'outputs/uba_features.csv'.")

Engineered features saved to 'outputs/uba_features.csv'.
