In [25]:

import numpy as np
import pandas as pd
from google.colab import auth
from google.cloud import storage
auth.authenticate_user()
import io
import datetime
from openai import OpenAI
import openai
from google.colab import userdata
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score



In [2]:
client = storage.Client(project='cmse-381')
bucket = client.get_bucket('syncaleventdata')
blob = bucket.blob('events.json')
content = blob.download_as_string()
events = pd.read_json(io.BytesIO(content))

In [3]:
events.drop(columns = ['location', 'description'])

Unnamed: 0,summary,start,end
0,Playoff game arrive by 5:45,2018-10-26T21:45:00+00:00,2018-10-27T00:40:00+00:00
1,Microsoft event codepath zoom,2024-09-17T20:00:00+00:00,2024-09-17T21:00:00+00:00
2,"Physics Midterm Covers Chapters 21, 22, and 23",2022-09-27T16:40:00+00:00,2022-09-27T17:30:00+00:00
3,Dinner,2024-06-19T22:00:00+00:00,2024-06-19T23:00:00+00:00
4,ECE 201 Exam 2,2023-03-15T17:50:00+00:00,2023-03-15T18:40:00+00:00
...,...,...,...
211,Office Hours (Nolan Schroeder),2022-04-01T18:00:00+00:00,2022-04-01T18:15:00+00:00
212,CVS Vaccine Appointment,2021-12-19T18:00:00+00:00,2021-12-19T19:00:00+00:00
213,Practice Leetcode Work,2024-07-03T20:30:00-04:00,2024-07-03T22:00:00-04:00
214,Homework or Study,2023-10-09T09:00:00-04:00,2023-10-09T10:00:00-04:00


In [4]:
events['start'] = pd.to_datetime(events['start'], errors='coerce')
events['end'] = pd.to_datetime(events['end'], errors='coerce')

# Check for rows where the conversion failed
print(events[events['start'].isna()])

# Drop rows with invalid datetime values, if necessary
events = events.dropna(subset=['start', 'end'])

events['start'] = pd.to_datetime(events['start'], errors='coerce')
events['end'] = pd.to_datetime(events['end'], errors='coerce')

Empty DataFrame
Columns: [summary, start, end, location, description]
Index: []


  events['start'] = pd.to_datetime(events['start'], errors='coerce')
  events['end'] = pd.to_datetime(events['end'], errors='coerce')


In [5]:
# extract day of the week from start, create seperate date column, and convert start and end to begin and end times
events['date'] = events['start'].dt.normalize()
events['start_time'] = events['start'].dt.time
events['end_time'] = events['end'].dt.time
events.drop(columns = ['start', 'end'])
events['day_of_week'] = events['date'].dt.dayofweek

In [6]:
#drop rows with NaT or NaN
events = events.dropna(subset=['start_time', 'end_time'])

In [7]:
#drop start time and end time
events.drop(columns = ['start', 'end'], inplace = True)

In [8]:
#drop location descriptions
events.drop(columns = ['location', 'description'], inplace = True)

In [9]:
#remove time from date
events['date'] = events['date'].dt.date

In [10]:
#order by date\
events = events.sort_values(by='date')

In [11]:
#include only data from 2023 and 2024
events = events[(events['date'] >= datetime.date(2023, 1, 1)) & (events['date'] <= datetime.date(2024, 12, 31))]

In [12]:
api_key = userdata.get('openaikey')
openai.api_key = api_key

client = OpenAI(
    api_key=api_key
)

In [13]:
def classify_summary(x):
    summary = x['summary']
    prompt = f"""
    Categorize the following event description into one of the categories:
    School, Exercise, Social, Work, Other. Retrun only the category and no additional information

    Event: "{summary}"
    """

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that classifies summaries of events."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=100,
    )
    category = response.choices[0].message.content
    return category



In [14]:
events['categories'] = events.apply(classify_summary, axis=1)

In [15]:
events['completion'] = np.random.choice([0, 1], size=len(events), p=[0.2, 0.8])

In [16]:
#convert start_time and end_time to float using division
events['start_time'] = events['start_time'].apply(lambda x: x.hour + x.minute/60)
events['end_time'] = events['end_time'].apply(lambda x: x.hour + x.minute/60)


In [17]:
#convert categories to integer
events['categories'] = events['categories'].replace({'School': 0, 'Exercise': 1, 'Social': 2, 'Work': 3, 'Other': 4})

  events['categories'] = events['categories'].replace({'School': 0, 'Exercise': 1, 'Social': 2, 'Work': 3, 'Other': 4})


In [18]:
events

Unnamed: 0,summary,date,start_time,end_time,day_of_week,categories,completion
64,ECE 280 Exam,2023-02-01,15.333333,16.166667,2.0,0,1
13,MTH 235 Midterm Exam 1,2023-02-03,1.500000,4.500000,4.0,0,1
172,ECE 201 Exam,2023-02-08,18.833333,19.666667,2.0,0,1
109,MTH 235 Quiz 3,2023-02-09,13.000000,4.000000,3.0,0,0
36,ANP 264 Exam 1,2023-02-14,20.000000,21.333333,1.0,0,1
...,...,...,...,...,...,...,...
1,Microsoft event codepath zoom,2024-09-17,20.000000,21.000000,1.0,3,1
215,Friend hang out,2024-09-19,23.500000,0.500000,3.0,2,0
44,AI Club presentation meeting stem building 320...,2024-09-26,22.000000,23.000000,3.0,0,1
34,Open Source Hackfest,2024-10-11,15.000000,16.000000,4.0,3,0


In [19]:
#preprocess start and end time with sin
events['start_time'] = np.sin(2 * np.pi * events['start_time'] / 24)
events['end_time'] = np.sin(2 * np.pi * events['end_time'] / 24)

In [20]:
events

Unnamed: 0,summary,date,start_time,end_time,day_of_week,categories,completion
64,ECE 280 Exam,2023-02-01,-0.766044,-0.887011,2.0,0,1
13,MTH 235 Midterm Exam 1,2023-02-03,0.382683,0.923880,4.0,0,1
172,ECE 201 Exam,2023-02-08,-0.976296,-0.906308,2.0,0,1
109,MTH 235 Quiz 3,2023-02-09,-0.258819,0.866025,3.0,0,0
36,ANP 264 Exam 1,2023-02-14,-0.866025,-0.642788,1.0,0,1
...,...,...,...,...,...,...,...
1,Microsoft event codepath zoom,2024-09-17,-0.866025,-0.707107,1.0,3,1
215,Friend hang out,2024-09-19,-0.130526,0.130526,3.0,2,0
44,AI Club presentation meeting stem building 320...,2024-09-26,-0.500000,-0.258819,3.0,0,1
34,Open Source Hackfest,2024-10-11,-0.707107,-0.866025,4.0,3,0


In [21]:
X = events[['day_of_week', 'start_time', 'end_time', 'categories']]
y = events['completion']

In [22]:
#split data into training, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
#train boosting model

model = xgb.XGBClassifier()

param_grid = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [50, 100, 200],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid,
    scoring='accuracy', n_jobs=-1, cv=5, verbose=2
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Best Accuracy: 0.7666666666666667


In [26]:
#test model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.75
