Imports

In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

Loading The Dataset

In [13]:
file_path = 'CrossCheck_Daily_Data.xlsx'  
data = pd.read_excel(file_path)

print("Dataset Shape:", data.shape)
print("Column Names:", data.columns)
print(data.head())

Dataset Shape: (23573, 155)
Column Names: Index(['study_id', 'eureka_id', 'day', 'act_in_vehicle_ep_0',
       'act_in_vehicle_ep_1', 'act_in_vehicle_ep_2', 'act_in_vehicle_ep_3',
       'act_in_vehicle_ep_4', 'act_on_bike_ep_0', 'act_on_bike_ep_1',
       ...
       'unlock_duration_ep_0', 'unlock_duration_ep_1', 'unlock_duration_ep_2',
       'unlock_duration_ep_3', 'unlock_duration_ep_4', 'unlock_num_ep_0',
       'unlock_num_ep_1', 'unlock_num_ep_2', 'unlock_num_ep_3',
       'unlock_num_ep_4'],
      dtype='object', length=155)
   study_id eureka_id       day  act_in_vehicle_ep_0  act_in_vehicle_ep_1  \
0        -1      u004  20150122                    0                    0   
1        -1      u004  20150123                    0                    0   
2        -1      u004  20150124                    0                    0   
3        -1      u004  20150125                    0                    0   
4        -1      u004  20150126                    0                    0   

Data Aggregation

In [14]:
#Aggregate the eps to make it one full day
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns

new_columns = []
processed_base_columns = set()  
columns_to_drop = []  


epochs = ['ep_0', 'ep_1', 'ep_2', 'ep_3']  #only taking the first four eps as they indicates the whole day


for col in numerical_cols:
    if any(epoch in col for epoch in epochs):  # to see if the coulmn has ep_ in it 
        
        #removing ep for the bas coulmn name
        epoch_num = [epoch for epoch in epochs if epoch in col][0]
        base_col_name = col.replace(f'_{epoch_num}', '') 

        # Check if the coulmn has been aggregated before and not aggregate ep_4 
        if base_col_name not in processed_base_columns:
            columns_to_sum = [col for col in data.columns if base_col_name in col and 'ep_4' not in col]
            aggregated_col = data[columns_to_sum].sum(axis=1)
            new_columns.append(aggregated_col.rename(base_col_name + '_sum'))
            

            columns_to_drop.extend([col for col in columns_to_sum if 'ep_4' not in col]) 

            # Mark the base column as processed
            processed_base_columns.add(base_col_name)

# add the new coulmns to the data 
if new_columns:
    data = pd.concat([data] + new_columns, axis=1)

#drop the original coulmns 
data.drop(columns=columns_to_drop, inplace=True)

#data.to_csv("processed_data.csv", index=False)


In [15]:


# Identify rows with recorded EMA scores
data['is_target'] = data['ema_score'].notna()

# Find the indices where EMA scores are available
ema_indices = data[data['is_target']].index.tolist()

# List to store new dataset rows
new_data = []

# Extract feature columns (excluding 'ema_score', 'study_id', 'eureka_id', and 'day')
feature_cols = [col for col in data.columns if col not in ['ema_score', 'study_id', 'eureka_id', 'day', 'is_target']]

# Iterate over each EMA segment
for i in range(len(ema_indices) - 1):
    start_idx = ema_indices[i]  # Start from last recorded EMA
    end_idx = ema_indices[i + 1]  # Stop at next recorded EMA

    # Get segment data
    segment = data.iloc[start_idx:end_idx]

    # Compute average of feature columns
    avg_features = segment[feature_cols].mean().to_dict()

    # Assign the next recorded EMA score as the target
    avg_features['ema_score'] = data.loc[end_idx, 'ema_score']

    # Store metadata
    avg_features['study_id'] = data.loc[start_idx, 'study_id']  # Keep the participant ID
    avg_features['start_day'] = data.loc[start_idx, 'day']  # First day in the segment
    avg_features['end_day'] = data.loc[end_idx, 'day']  # Last day in the segment

    # Store the result
    new_data.append(avg_features)

# Convert new dataset to DataFrame
data = pd.DataFrame(new_data)



Data Cleaning

In [None]:
# Remove columns that have only one unique value
data = data.loc[:, data.nunique() > 1]

# Save the cleaned dataset
data.to_csv("processed_ema_dataset.csv", index=False)

#print(f"✅ Cleaned dataset saved with {data.shape[1]} features")
#print(data.head())

AttributeError: 'list' object has no attribute 'loc'

Train Model

In [None]:
# Features (X) and Target (y)
X = new_data.drop(['ema_score', 'study_id', 'start_day', 'end_day'], axis=1)  # Drop non-relevant columns
y = new_data['ema_score']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"✅ Data split complete: {X_train.shape[0]} training rows, {X_test.shape[0]} test rows")


KeyError: "['start_day', 'end_day'] not found in axis"

In [None]:
# Initialize and train the SVR model
svm_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # Default hyperparameters
svm_model.fit(X_train, y_train)

# Predictions
y_pred = svm_model.predict(X_test)


In [None]:
# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"✅ SVM Model Performance:")
print(f"📉 MSE: {mse:.2f}")
print(f"📈 R²: {r2:.2f}")
