In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
import re

# Load the dataset
data = pd.read_csv('dummy_npi_data.csv')

# Step 1: Extract the time portion ("HH:MM") from "YYYY-MM-DD HH:MM:SS"
def extract_time(time_str):
    if isinstance(time_str, str) and ' ' in time_str:
        return time_str.split(' ')[1][:5]  # Extract "HH:MM"
    return "00:00"  # Default for missing/invalid entries

data['Login Time'] = data['Login Time'].apply(extract_time)
data['Logout Time'] = data['Logout Time'].apply(extract_time)

# Step 2: Validate the time format
def is_valid_time(time_str):
    return bool(re.match(r'^\d{1,2}:\d{2}$', str(time_str)))

data = data[data['Login Time'].apply(is_valid_time)]
data = data[data['Logout Time'].apply(is_valid_time)]

# Step 3: Convert time to minutes since midnight
data['Login_Time_Minutes'] = data['Login Time'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]))
data['Logout_Time_Minutes'] = data['Logout Time'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]))

# Step 4: Feature Engineering
# Add dynamic features related to input time (e.g., input_time = 540 (9:00 AM))
def create_time_features(df, input_time):
    df['Active_Duration'] = df['Logout_Time_Minutes'] - df['Login_Time_Minutes']
    df['Overlap_With_Input_Time'] = ((df['Login_Time_Minutes'] <= input_time) & (df['Logout_Time_Minutes'] >= input_time)).astype(int)
    df['Time_Since_Last_Login'] = abs(df['Login_Time_Minutes'] - input_time)
    return df

# Simulate input_time for training (replace with actual logic if historical data exists)
# For simplicity, assume input_time is the average of Login and Logout times
data['Input_Time'] = (data['Login_Time_Minutes'] + data['Logout_Time_Minutes']) // 2
data = create_time_features(data, data['Input_Time'].iloc[0])  # Use a sample input_time for training

# Step 5: Define the target variable
# Use historical survey attempts around the input time (if available)
# For simplicity, assume the target is whether the doctor took a survey on the same day
data['Likely_to_Attend'] = (data['Count of Survey Attempts'] > 0).astype(int)

# Step 6: Features and target
features = [
    'Login_Time_Minutes',
    'Logout_Time_Minutes',
    'Usage Time (mins)',
    'Count of Survey Attempts',
    'Active_Duration',
    'Overlap_With_Input_Time',
    'Time_Since_Last_Login'
]
X = data[features]
y = data['Likely_to_Attend']

# Step 7: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 8: Train the model (use a better algorithm)
from xgboost import XGBClassifier

model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42
)
model.fit(X_train, y_train)

# Step 9: Evaluate the model
from sklearn.metrics import accuracy_score, precision_score, recall_score

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred):.2f}")
print(f"Recall: {recall_score(y_test, y_pred):.2f}")

# Step 10: Save the model
joblib.dump(model, 'survey_attendance_model.pkl')

Accuracy: 1.00
Precision: 1.00
Recall: 1.00


['survey_attendance_model.pkl']