# Settings

## Constants

In [19]:
import os

# Folder that holds the dataset CSV files (a path relative to the working directory).
PATH_DATA = './Dataset'

# CSV files to read from PATH_DATA; each one is loaded as its own table.
sensor_data = [
    'UserInfo.csv',
    'Service.csv',
    'ContextualFactor.csv',
    'Interruptibility.csv',
]

# Utility Functions

In [20]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score


# Load the Dataset into Dataframe

In [21]:
# Read every CSV named in sensor_data into its own DataFrame, keyed by file name.
dataframes = {}
for filename in sensor_data:
    csv_path = os.path.join(PATH_DATA, filename)
    dataframes[filename] = pd.read_csv(csv_path).reset_index(drop=True)

# Short aliases for the four tables used by the rest of the notebook.
dfUserInfo = dataframes['UserInfo.csv']
dfService = dataframes['Service.csv']
dfContextualFactor = dataframes['ContextualFactor.csv']
dfInterruptibility = dataframes['Interruptibility.csv']

# Preprocessing

In [22]:
# Columns kept from each source table; side by side they form the unified dataset.
contextual_columns = [
    'uid', 'sid',
    'activity1', 'activity2', 'activity3',
    'userLocation', 'userPosition',
]
service_columns = [
    'weekOfExperiment', 'dayOfWeek', 'startTime',
    'activityInquiry', 'availabilityInquiry',
    'speechShadowing1', 'speechShadowing2', 'speechShadowing3', 'speechShadowing4', 'speechShadowing5',
    'continue-to-nextInquiry1', 'continue-to-nextInquiry2', 'continue-to-nextInquiry3', 'continue-to-nextInquiry4',
    'endTime', 'endType',
]
interruptibility_columns = ['SHORT_INTERACTION_interruptibility', 'LONG_INTERACTION_interruptibility']

dfContextualFactor_selected_columns = dfContextualFactor[contextual_columns]
dfService_selected_columns = dfService[service_columns]
dfInterruptibility_selected_columns = dfInterruptibility[interruptibility_columns]

# The two interruptibility labels are taken from Interruptibility.csv above.
# They can alternatively be derived from the Service columns once dfCombinedAll exists:
## SHORT_INTERACTION: True if availabilityInquiry is not NaN
# dfCombinedAll['SHORT_INTERACTION_interruptibility'] = dfCombinedAll['availabilityInquiry'].notna()
## LONG_INTERACTION: True if continue-to-nextInquiry1 is not NaN
# dfCombinedAll['LONG_INTERACTION_interruptibility'] = dfCombinedAll['continue-to-nextInquiry1'].notna()

# ! To redefine LONG_INTERACTION with a threshold longer than 3 minutes,
# derive it from one of these columns instead:
# 5 minutes  => use continue-to-nextInquiry2
# 7 minutes  => use continue-to-nextInquiry3
# 9 minutes  => use continue-to-nextInquiry4

# Place the three selections side by side. pd.concat(axis=1) aligns rows on the index,
# so this assumes the three tables list the same sessions in the same row order -- TODO confirm.
dfCombinedAll = pd.concat(
    [
        dfContextualFactor_selected_columns,
        dfService_selected_columns,
        dfInterruptibility_selected_columns,
    ],
    axis=1,
)

In [23]:
# Attach each user's home type and speaker placement (selected columns of UserInfo.csv)
# to every row of dfCombinedAll, matching on uid.
dfUserInfo_selected_columns = dfUserInfo[['uid', 'homeType', 'speakerLocation', 'speakerPosition']]

# validate='many_to_one' makes pandas raise a MergeError if a uid appears more than once in
# the user table. Without it a duplicated uid would silently multiply rows, and the
# index-aligned column concatenation performed later on dfCombinedAll would no longer line up.
dfCombinedAll = pd.merge(
    dfCombinedAll,
    dfUserInfo_selected_columns,
    on='uid',
    how='left',
    validate='many_to_one',
)

## Position Processing

In [24]:
# Proximity level between the user and the speaker for a single row
def calculate_proximity(row):
    """Return a proximity level (0, 1 or 2) for one row.

    The row must provide 'userLocation', 'speakerLocation', 'userPosition'
    and 'speakerPosition'.

    Returns:
        0 -- userLocation and speakerLocation differ
        1 -- same location, but the positions do not compare equal
             (a NaN position never compares equal, so it ends up here)
        2 -- same location and same position
    """
    locations_differ = row['userLocation'] != row['speakerLocation']
    if locations_differ:
        return 0

    # Same location from here on; only the position decides between 1 and 2.
    positions_match = row['userPosition'] == row['speakerPosition']
    return 2 if positions_match else 1

# Score every row with calculate_proximity and store the result as the 'proximity' column
proximity_scores = dfCombinedAll.apply(calculate_proximity, axis=1)
dfCombinedAll['proximity'] = proximity_scores

# Show the first ten rows of the inputs next to the resulting score
preview_columns = ['userLocation', 'userPosition', 'speakerLocation', 'speakerPosition', 'proximity']
print(dfCombinedAll[preview_columns].head(10))


  userLocation userPosition speakerLocation speakerPosition  proximity
0     Bed Room          Bed        Bed Room            Desk          1
1    Rest Room          NaN        Bed Room            Desk          0
2  Living Room          NaN        Bed Room            Desk          0
3  Living Room          NaN        Bed Room            Desk          0
4     Bed Room          Bed        Bed Room            Desk          1
5     Bed Room          Bed        Bed Room            Desk          1
6     Bed Room          Bed        Bed Room            Desk          1
7     Bed Room         Desk        Bed Room            Desk          2
8     Bed Room          Bed        Bed Room            Desk          1
9     Bed Room          Bed        Bed Room            Desk          1


## Activity and Time Processing

In [25]:
# Multi-hot encode the activities: a row may list up to three activities
# (activity1..activity3), and each distinct activity becomes one 0/1 column 'act_<name>'.
activity_cols = ['activity1', 'activity2', 'activity3']
df_activity = dfContextualFactor[activity_cols].copy()

# Distinct activities across all three columns, in order of first appearance, NaN excluded
all_unique_activities = [
    activity
    for activity in pd.unique(df_activity.values.ravel())
    if pd.notna(activity)
]

# A flag is 1 when the activity appears in ANY of the three activity columns of that row.
# NaN cells never compare equal, so they contribute 0.
dfActivity_one_hot_encoding = pd.DataFrame(
    {
        'act_' + str(activity): df_activity.eq(activity).any(axis=1).astype('int64')
        for activity in all_unique_activities
    },
    index=df_activity.index,
)

# Append the flags to dfCombinedAll (rows are matched on the index).
# Any 'act_*' columns left over from an earlier run of this cell are dropped first, so
# re-running the cell does not create duplicated column names.
dfCombinedAll = pd.concat(
    [
        dfCombinedAll.drop(columns=dfActivity_one_hot_encoding.columns, errors='ignore'),
        dfActivity_one_hot_encoding,
    ],
    axis=1,
)

In [26]:
# Parse startTime (HH:MM:SS) and derive the number of minutes since midnight.
start_time = pd.to_datetime(dfCombinedAll['startTime'], format='%H:%M:%S', errors='coerce')

# errors='coerce' turns unparseable values into NaT. Those would make the integer cast of
# minute_bin below fail with an obscure message, so report them explicitly instead.
unparsed_count = int(start_time.isna().sum())
if unparsed_count:
    raise ValueError(f"{unparsed_count} startTime value(s) could not be parsed as HH:MM:SS")

dfCombinedAll['startTime'] = start_time
dfCombinedAll['minute'] = start_time.dt.hour * 60 + start_time.dt.minute

# Map days of the week to numerical values (MON=0, TUE=1, ..., SUN=6)
day_map = {'MON': 0, 'TUE': 1, 'WED': 2, 'THU': 3, 'FRI': 4, 'SAT': 5, 'SUN': 6}
# Map only while the column still holds the day abbreviations: mapping the already-numeric
# column again (e.g. when this cell is re-run) would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(dfCombinedAll['dayOfWeek']):
    dfCombinedAll['dayOfWeek'] = dfCombinedAll['dayOfWeek'].map(day_map)

# Bin minutes into 30-minute intervals for temporal analysis
dfCombinedAll['minute_bin'] = (dfCombinedAll['minute'] // 30).astype(int)

In [27]:
# Print the column names, non-null counts and dtypes of the combined frame after preprocessing
dfCombinedAll.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830 entries, 0 to 2829
Data columns (total 43 columns):
 #   Column                                    Non-Null Count  Dtype         
---  ------                                    --------------  -----         
 0   uid                                       2830 non-null   int64         
 1   sid                                       2830 non-null   int64         
 2   activity1                                 2830 non-null   object        
 3   activity2                                 110 non-null    object        
 4   activity3                                 3 non-null      object        
 5   userLocation                              2830 non-null   object        
 6   userPosition                              2343 non-null   object        
 7   weekOfExperiment                          2830 non-null   int64         
 8   dayOfWeek                                 2830 non-null   int64         
 9   startTime                     

# Feature Extraction

In [28]:
# Columns used for response prediction, grouped by kind. 'uid' is kept so that the
# cross-validation cells can group rows by user; it is dropped from the feature matrix there.
activity_feature_columns = [
    'act_Taking a Nap / Sleeping', 'act_Hygiene', 'act_Eating', 'act_Using Media',
    'act_Social Interaction', 'act_Returning from Outside / Other Rooms',
    'act_Studying / Working', 'act_Others', 'act_House Chores',
    'act_Self Caring', 'act_Visiting Outside / Other Rooms', 'act_Resting',
]
context_feature_columns = ['homeType', 'userLocation', 'userPosition', 'speakerLocation', 'speakerPosition']
time_feature_columns = ['minute_bin', 'dayOfWeek']
label_columns = ['SHORT_INTERACTION_interruptibility', 'LONG_INTERACTION_interruptibility']

dfFeatresForResponse = dfCombinedAll[
    ['uid']
    + activity_feature_columns
    + context_feature_columns
    + time_feature_columns
    + label_columns
].copy()

# Columns replaced by integer codes: the context columns, the time bin and both labels.
# One LabelEncoder per column is kept in label_encoders (created on first access).
categorical_columns = context_feature_columns + ['minute_bin'] + label_columns
label_encoders = defaultdict(LabelEncoder)

for col in categorical_columns:
    encoder = label_encoders[col]
    dfFeatresForResponse[col] = encoder.fit_transform(dfFeatresForResponse[col])

# Independent copy of the encoded frame; the modelling cells read from this one
encoded_data = dfFeatresForResponse.copy()


encoded_data.info()
encoded_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2830 entries, 0 to 2829
Data columns (total 22 columns):
 #   Column                                    Non-Null Count  Dtype
---  ------                                    --------------  -----
 0   uid                                       2830 non-null   int64
 1   act_Taking a Nap / Sleeping               2830 non-null   int64
 2   act_Hygiene                               2830 non-null   int64
 3   act_Eating                                2830 non-null   int64
 4   act_Using Media                           2830 non-null   int64
 5   act_Social Interaction                    2830 non-null   int64
 6   act_Returning from Outside / Other Rooms  2830 non-null   int64
 7   act_Studying / Working                    2830 non-null   int64
 8   act_Others                                2830 non-null   int64
 9   act_House Chores                          2830 non-null   int64
 10  act_Self Caring                           2830 non-null   in

Unnamed: 0,uid,act_Taking a Nap / Sleeping,act_Hygiene,act_Eating,act_Using Media,act_Social Interaction,act_Returning from Outside / Other Rooms,act_Studying / Working,act_Others,act_House Chores,...,act_Resting,homeType,userLocation,userPosition,speakerLocation,speakerPosition,minute_bin,dayOfWeek,SHORT_INTERACTION_interruptibility,LONG_INTERACTION_interruptibility
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,12,0,0,1
1,1,0,1,0,0,0,0,0,0,0,...,0,0,3,3,0,1,15,0,1,1
2,1,0,0,1,0,0,0,0,0,0,...,0,0,1,3,0,1,12,1,1,1
3,1,0,0,1,0,0,0,0,0,0,...,0,0,1,3,0,1,13,1,1,1
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,18,1,0,1


## Label: SHORT_INTERACTION

In [29]:
# Class counts of the label-encoded SHORT_INTERACTION target, before any SMOTE oversampling
print(encoded_data['SHORT_INTERACTION_interruptibility'].value_counts())

SHORT_INTERACTION_interruptibility
0    2088
1     742
Name: count, dtype: int64


### Model Building and LOSO CV

In [30]:
# Modelling inputs for SHORT_INTERACTION:
#   X      -- every encoded column except the two labels and the user id
#   y      -- the label-encoded SHORT_INTERACTION target
#   groups -- user id, so that each Leave-One-Group-Out fold holds out one user's rows
X = encoded_data.drop(columns=['SHORT_INTERACTION_interruptibility', 'LONG_INTERACTION_interruptibility', 'uid'])
y = encoded_data['SHORT_INTERACTION_interruptibility']
groups = encoded_data['uid']

# Leave-One-Group-Out splitter and the SMOTE oversampler used inside every fold
logo = LeaveOneGroupOut()
smote = SMOTE(random_state=42)

# Candidate classifiers, keyed by the name shown in the results table.
# NOTE(review): cat_features=[0] points CatBoost at X's first column
# ('act_Taking a Nap / Sleeping', a 0/1 flag) -- confirm that this is the intended
# categorical feature.
models1 = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100, max_depth=5, random_state=42,
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, verbose=0,
    ),
    'XGBoost': XGBClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42,
        verbosity=0, use_label_encoder=False,
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, verbose=-1,
    ),
    'CatBoost': CatBoostClassifier(
        iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss',
        cat_features=[0], random_seed=42, verbose=0,
    ),
    'SVM': SVC(random_state=42, verbose=False),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42),
}

results1_logo = {}

# Evaluate every model with one fold per held-out user
for model_name, model1 in models1.items():
    accuracies, f1_scores = [], []

    for train_index, test_index in logo.split(X, y, groups):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Oversample the training rows only; the held-out rows stay untouched
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        model1.fit(X_train_oversampled, y_train_oversampled.to_numpy())

        # Score the held-out rows
        y_pred = model1.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # Unweighted mean over folds
    results1_logo[model_name] = {
        'Average Accuracy': sum(accuracies) / len(accuracies),
        'Average F1-Score (macro)': sum(f1_scores) / len(f1_scores),
    }


### K-Fold CV

In [31]:
from sklearn.model_selection import KFold

# 5-fold splitter. With shuffle=False the rows are cut into five consecutive blocks
# in their current order.
kfold = KFold(n_splits=5, shuffle=False)

results1_kfold = {}

# Re-uses X, y, smote and models1 as defined by the SHORT_INTERACTION LOSO cell above;
# every model is refitted from scratch in each fold.
for model_name, model1 in models1.items():
    accuracies, f1_scores = [], []

    for train_index, test_index in kfold.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Oversample the training rows only; the test rows stay untouched
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        model1.fit(X_train_oversampled, y_train_oversampled.to_numpy())

        # Score the test rows
        y_pred = model1.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # Unweighted mean over the five folds
    results1_kfold[model_name] = {
        'Average Accuracy': sum(accuracies) / len(accuracies),
        'Average F1-Score (macro)': sum(f1_scores) / len(f1_scores),
    }


## Label: LONG_INTERACTION

In [32]:
# Class counts of the label-encoded LONG_INTERACTION target, before any SMOTE oversampling
print(encoded_data['LONG_INTERACTION_interruptibility'].value_counts())

LONG_INTERACTION_interruptibility
1    1443
0    1387
Name: count, dtype: int64


### Model Building and LOSO CV

In [33]:
# Modelling inputs for LONG_INTERACTION:
#   X      -- every encoded column except the two labels and the user id
#   y      -- the label-encoded LONG_INTERACTION target
#   groups -- user id, so that each Leave-One-Group-Out fold holds out one user's rows
X = encoded_data.drop(columns=['LONG_INTERACTION_interruptibility', 'SHORT_INTERACTION_interruptibility', 'uid'])
y = encoded_data['LONG_INTERACTION_interruptibility']
groups = encoded_data['uid']

# Leave-One-Group-Out splitter and the SMOTE oversampler used inside every fold
logo = LeaveOneGroupOut()
smote = SMOTE(random_state=42)

# Candidate classifiers, keyed by the name shown in the results table.
# NOTE(review): cat_features=[0] points CatBoost at X's first column
# ('act_Taking a Nap / Sleeping', a 0/1 flag) -- confirm that this is the intended
# categorical feature.
models2 = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100, max_depth=5, random_state=42,
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, verbose=0,
    ),
    'XGBoost': XGBClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42,
        verbosity=0, use_label_encoder=False,
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, verbose=-1,
    ),
    'CatBoost': CatBoostClassifier(
        iterations=100, depth=5, learning_rate=0.1, loss_function='Logloss',
        cat_features=[0], random_seed=42, verbose=0,
    ),
    'SVM': SVC(random_state=42, verbose=False),
    'Dummy': DummyClassifier(strategy="stratified", random_state=42),
}

results2_logo = {}

# Evaluate every model with one fold per held-out user
for model_name, model2 in models2.items():
    accuracies, f1_scores = [], []

    for train_index, test_index in logo.split(X, y, groups):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Oversample the training rows only; the held-out rows stay untouched
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        model2.fit(X_train_oversampled, y_train_oversampled.to_numpy())

        # Score the held-out rows
        y_pred = model2.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # Unweighted mean over folds
    results2_logo[model_name] = {
        'Average Accuracy': sum(accuracies) / len(accuracies),
        'Average F1-Score (macro)': sum(f1_scores) / len(f1_scores),
    }

### K-Fold CV

In [34]:
from sklearn.model_selection import KFold

# 5-fold splitter. With shuffle=False the rows are cut into five consecutive blocks
# in their current order.
kfold = KFold(n_splits=5, shuffle=False)

results2_kfold = {}

# Re-uses X, y, smote and models2 as defined by the LONG_INTERACTION LOSO cell above;
# every model is refitted from scratch in each fold.
for model_name, model2 in models2.items():
    accuracies, f1_scores = [], []

    for train_index, test_index in kfold.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # Oversample the training rows only; the test rows stay untouched
        X_train_oversampled, y_train_oversampled = smote.fit_resample(X_train, y_train)
        model2.fit(X_train_oversampled, y_train_oversampled.to_numpy())

        # Score the test rows
        y_pred = model2.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # Unweighted mean over the five folds
    results2_kfold[model_name] = {
        'Average Accuracy': sum(accuracies) / len(accuracies),
        'Average F1-Score (macro)': sum(f1_scores) / len(f1_scores),
    }

In [35]:
from tabulate import tabulate

# Models list (one table row per model, in this order)
models = [
    'Random Forest', 'Gradient Boosting', 'XGBoost',
    'LightGBM', 'CatBoost', 'SVM', 'Dummy'
]

# Result dictionaries in the column order of `headers` below:
# LOSO short, LOSO long, 5-fold short, 5-fold long -- each contributes (accuracy, F1).
result_sets = [results1_logo, results2_logo, results1_kfold, results2_kfold]
metric_keys = ['Average Accuracy', 'Average F1-Score (macro)']

# Prepare table data
table_data = []
for model in models:
    row = [model]
    for results in result_sets:
        model_scores = results.get(model, {})
        for metric_key in metric_keys:
            # A missing result is shown as NaN rather than 0.0, so that it cannot be
            # mistaken for a genuine score of zero.
            row.append(model_scores.get(metric_key, float('nan')))
    table_data.append(row)

# Define headers
headers = [
    'Model',
    'LOSO CV\nShort\nInteraction\nAccuracy', 'LOSO CV\nShort\nInteraction\nF1',
    'LOSO CV\nLong\nInteraction\nAccuracy', 'LOSO CV\nLong\nInteraction\nF1',
    '5-fold CV\nShort\nInteraction\nAccuracy', '5-fold CV\nShort\nInteraction\nF1',
    '5-fold CV\nLong\nInteraction\nAccuracy', '5-fold CV\nLong\nInteraction\nF1'
]

# Print the table
print("Machine learning model performance")
print(tabulate(table_data, headers=headers, tablefmt='fancy_grid', floatfmt='.3f', numalign="decimal"))


Machine learning model performance
╒═══════════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╤═══════════════╕
│ Model             │       LOSO CV │       LOSO CV │       LOSO CV │       LOSO CV │     5-fold CV │     5-fold CV │     5-fold CV │     5-fold CV │
│                   │         Short │         Short │          Long │          Long │         Short │         Short │          Long │          Long │
│                   │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │   Interaction │
│                   │      Accuracy │            F1 │      Accuracy │            F1 │      Accuracy │            F1 │      Accuracy │            F1 │
╞═══════════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╪═══════════════╡
│ Random Forest     │         0.828 │         0.746 │         0.6