INSERT REPORT TITLE AND INFO HERE

### Imports

In [1]:
# Imports
import importlib
# mypytable
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable
# myevaluation
import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as me
# myutils
import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as mu

from mysklearn.myutils import combine_multiple_files

## Step 1: Data Cleaning

### Preprocessing

The data came as a large number of JSON files grouped into folders. Many of the files and folders were empty, so we began by deleting those. Some of the files contained irrelevant or unchanging data, such as date of birth and device specs, so we deleted those as well. We were left with three sets of JSON files: aggregator, fitness, and wellness.

* Aggregator: This contains the most information. It has a list of metrics related to stress, calories, and heart rate, plus minor metrics related to activity.
* Fitness: This contains a list of activities and information about them. All of these activities are runs. It has attributes such as distance, speed, heart rate, and duration.
* Wellness: This mainly contains sleep data.

### Cleaning/Joining TODO

Many instances in the dataset are missing values, or are just instances with basically no data. TODO: Make copies of the data without these bad instances

Additionally, for this step we combine the many JSON files into one. First we join all files within each folder by simply appending them onto each other; then we join the resulting lists from each folder.

We also opened the CSVs in Excel to guide our decision-making process for this step.

In [2]:
# Cleaning/Joining...
# SLEEP

# 1. List the export files. Each file covers the window between two
#    consecutive boundary dates, so the names are derived from the boundaries.
sleep_window_bounds = [
    "2021-04-23",
    "2021-08-01",
    "2021-11-09",
    "2022-02-17",
    "2022-05-28",
    "2022-09-05",
    "2022-12-14",
    "2023-03-24",
    "2023-07-02",
    "2023-10-10",
    "2024-01-18",
    "2024-04-27",
    "2024-08-05",
    "2024-11-13",
]
sleep_file_names = [
    "{}_{}_96200873_sleepData.csv".format(window_start, window_end)
    for window_start, window_end in zip(sleep_window_bounds, sleep_window_bounds[1:])
]

# 2. Combine all of the listed files from the wellness folder into one table
sleep_source_dir = "csv_converted_data/connect_wellness"
full_sleep_table = combine_multiple_files(sleep_file_names, sleep_source_dir)

# 3. Basic cleaning: remove rows with missing values, then remove rows whose
#    sleepWindowConfirmationType column equals 'OFF_WRIST'
full_sleep_table.remove_rows_with_missing_values()
confirmation_type_index = full_sleep_table.column_names.index("sleepWindowConfirmationType")
full_sleep_table.remove_rows_where_col_equal_specified(confirmation_type_index, 'OFF_WRIST')

# 4. Save the joined, cleaned sleep data
full_sleep_table.save_to_file('joined_nullfree_subsets/full_sleep.csv')

In [3]:
# ACTIVITY
# The summarized-activities export is split across two files that differ only
# in the numeric part of the name.
activity_source_dir = "csv_converted_data/connect_fitness"
acitivity_files_names = [
    "jack-brandt@comcast.net_{}_summarizedActivities.csv".format(part_id)
    for part_id in ("0", "1001")
]

# Combine both files into one table and save it (no row cleaning is done here)
full_activity_table = combine_multiple_files(acitivity_files_names, activity_source_dir)
full_activity_table.save_to_file('joined_nullfree_subsets/full_activity.csv')

In [4]:
# AGGREGATOR
# Each UDS file covers the window between two consecutive boundary dates, so
# the file names are derived from the list of boundaries.
aggregator_window_bounds = [
    "2021-04-23",
    "2021-08-01",
    "2021-11-09",
    "2022-02-17",
    "2022-05-28",
    "2022-09-05",
    "2022-12-14",
    "2023-03-24",
    "2023-07-02",
    "2023-10-10",
    "2024-01-18",
    "2024-04-27",
    "2024-08-05",
    "2024-11-13",
]
aggregator_file_names = [
    "UDSFile_{}_{}.csv".format(window_start, window_end)
    for window_start, window_end in zip(aggregator_window_bounds, aggregator_window_bounds[1:])
]

# Combine all of the listed files into one table and save it (no row cleaning is done here)
aggregator_source_dir = "csv_converted_data/connect_aggregator_data"
full_aggregator_table = combine_multiple_files(aggregator_file_names, aggregator_source_dir)
full_aggregator_table.save_to_file('joined_nullfree_subsets/full_aggregator.csv')

In [5]:
from datetime import datetime, timedelta, timezone

# Format shared by every calendarDate value handled in this cell
DATE_FORMAT = "%Y-%m-%d"


def epoch_ms_to_calendar_date(epoch_ms):
    """Convert an epoch-millisecond timestamp to a "YYYY-MM-DD" string.

    The timestamp is decoded in UTC instead of the machine's local time zone.
    datetime.fromtimestamp() without a tz argument converts to the platform's
    local time, which makes the resulting date depend on where the notebook is
    run and can shift a run onto the neighbouring calendar day.

    NOTE(review): this assumes startTimeLocal already encodes the runner's
    local wall-clock time as an epoch value -- confirm against the export.

    Args:
        epoch_ms: timestamp in milliseconds (int or float).

    Returns:
        The date part of the timestamp formatted as "YYYY-MM-DD".
    """
    moment = datetime.fromtimestamp(epoch_ms / 1000, tz=timezone.utc)
    return moment.strftime(DATE_FORMAT)


# Reload the combined tables from disk so this cell can be re-run on its own
full_activity_table = MyPyTable().load_from_file(
    "joined_nullfree_subsets/full_activity.csv"
)
full_sleep_table = MyPyTable().load_from_file("joined_nullfree_subsets/full_sleep.csv")
full_aggregator_table = MyPyTable().load_from_file("joined_nullfree_subsets/full_aggregator.csv")

# full_activity_table doesn't have a calendarDate column, so we add one by
# converting the startTimeLocal column to a date
start_time_index = full_activity_table.column_names.index("startTimeLocal")
full_activity_table.column_names.append("calendarDate")
for activity_row in full_activity_table.data:
    activity_row.append(epoch_ms_to_calendar_date(activity_row[start_time_index]))

# Join activity -> sleep -> aggregator on the shared calendarDate column
fully_joined_table = (full_activity_table.perform_inner_join(
    full_sleep_table, ["calendarDate"]
)).perform_inner_join(
    full_aggregator_table, ["calendarDate"]
)

# Look-up table: calendarDate -> max stress level recorded for that date
calendar_date_index = full_aggregator_table.column_names.index("calendarDate")
stress_index = full_aggregator_table.column_names.index("allDayStress/aggregatorList/0/maxStressLevel")
stress_dict = {}
for aggregator_row in full_aggregator_table.data:
    stress_dict[aggregator_row[calendar_date_index]] = aggregator_row[stress_index]

# Add the previous day's max stress level to every joined row
fully_joined_table.column_names.append("prevDayMaxStressLevel")
joined_calendar_date_index = fully_joined_table.column_names.index("calendarDate")

for row in fully_joined_table.data:
    current_day = datetime.strptime(row[joined_calendar_date_index], DATE_FORMAT)
    prev_calendar_date = (current_day - timedelta(days=1)).strftime(DATE_FORMAT)

    # None marks rows whose previous day has no aggregator entry
    row.append(stress_dict.get(prev_calendar_date))

fully_joined_table.save_to_file("processed_data/fully_joined.csv")

In [6]:
# Attributes kept for modelling; class_column names the one used for y_data
columns = ["prevDayMaxStressLevel", "maxHeartRate", "duration", "avgSpeed"]
class_column = "avgSpeed"

# Position of each kept attribute inside the fully joined table
column_index = {name: fully_joined_table.column_names.index(name) for name in columns}

# Project every joined row down to just the kept attributes, in `columns` order
data = [
    [joined_row[column_index[name]] for name in columns]
    for joined_row in fully_joined_table.data
]

table = MyPyTable(column_names=columns, data=data)

# Remove rows with missing values
table.remove_rows_with_missing_values()

# Remove runs that are less than 10 minutes (threshold is 600000, i.e. 10 * 60 * 1000)
min_run_duration = 10 * 60 * 1000
table.remove_row_if(
    table.get_index("duration"),
    lambda duration: duration < min_run_duration,
)
# Remove rows whose avgSpeed is 0
table.remove_rows_where_col_equal_specified(table.get_index("avgSpeed"), 0)

# Every kept attribute except the class column
columns_to_extract = [name for name in columns if name != class_column]

table.save_to_file("processed_data/processed_data.csv")

X_data = table.normalize(columns_to_extract)
y_data = table.get_column(class_column)

table.compute_summary_statistics(table.column_names).pretty_print()

[92.0, 96.0, 92.0, 90.0, 94.0, 92.0, 97.0, 93.0, 90.0, 88.0, 93.0, 90.0, 88.0, 91.0, 88.0, 94.0, 90.0, 89.0, 87.0, 90.0, 95.0, 95.0, 98.0, 88.0, 93.0, 90.0, 94.0, 93.0, 93.0, 91.0, 90.0, 89.0, 91.0, 93.0, 95.0, 84.0, 90.0, 90.0, 90.0, 94.0, 95.0, 85.0, 88.0, 87.0, 91.0, 97.0, 89.0, 99.0, 84.0, 93.0, 96.0, 96.0, 93.0, 87.0, 84.0, 93.0, 90.0, 93.0, 93.0, 84.0, 93.0, 89.0, 97.0, 90.0, 96.0, 86.0, 90.0, 95.0, 90.0, 87.0, 95.0, 96.0, 91.0, 86.0, 93.0, 88.0, 90.0, 92.0, 89.0, 90.0, 91.0, 98.0, 96.0, 82.0, 93.0, 89.0, 96.0, 92.0, 96.0, 93.0, 91.0, 88.0, 87.0, 88.0, 81.0, 89.0, 84.0, 78.0, 83.0, 90.0, 86.0, 93.0, 92.0, 96.0, 94.0, 91.0, 93.0, 91.0, 95.0, 84.0, 81.0, 95.0, 96.0, 92.0, 73.0, 88.0, 94.0, 93.0, 93.0, 87.0, 90.0, 94.0, 94.0, 98.0, 98.0, 96.0, 96.0, 96.0, 89.0, 98.0, 94.0, 97.0, 96.0, 96.0, 96.0, 94.0, 95.0, 85.0, 88.0, 91.0, 96.0, 96.0, 89.0, 87.0, 87.0, 92.0, 87.0, 93.0, 91.0, 93.0, 93.0, 90.0, 81.0, 88.0, 89.0, 96.0, 95.0, 95.0, 92.0, 93.0, 98.0, 93.0, 84.0, 88.0, 91.0, 95.0, 86.

In [7]:
# TODO: Load the data and discretize it

# TODO: Decide whether to save the discretized data


In [None]:
# Set up models (add more from myclassifiers if time allows)
from mysklearn.myclassifiers import (
    MyDecisionTreeClassifier,
    MyDummyClassifier,
    MyKNeighborsClassifier,
    MyNaiveBayesClassifier,
    MyRandomForestClassifier,
)

# One default-constructed instance of each classifier imported above
dummy_model = MyDummyClassifier()
knn_model = MyKNeighborsClassifier()
bayes_model = MyNaiveBayesClassifier()
tree_model = MyDecisionTreeClassifier()
forest_model = MyRandomForestClassifier()

# Set up X and y
# TODO: the file path, attribute list and class-column name below are still
# placeholders and must be filled in before this cell can run.
final_table = MyPyTable()
final_table.load_from_file('?')
X = final_table.get_data_subset(['INSERT IMPORTANT ATTRIBUTES HERE'],False)
y = final_table.get_column('DISCRETIZED Y COLUMN HERE')

# Class labels and the label treated as positive
labels = ['slow','mild','fast']
pos_label='fast'

# Evaluate the dummy model; the cells below repeat these two calls per model
metrics, confusion, clas_repor = me.get_metrics_and_conf_matrix_and_report(
    dummy_model, 10, X, y, labels, pos_label, 'Running'
)
mu.report_metrics_and_confusion('Dummy', metrics, confusion, clas_repor)

KeyError: 0.23497595094102483

In [None]:
# Evaluate knn_model and hand the results to the reporter under the 'KNN' label
metrics, confusion, clas_repor = me.get_metrics_and_conf_matrix_and_report(
    knn_model, 10, X, y, labels, pos_label, 'Running'
)
mu.report_metrics_and_confusion('KNN', metrics, confusion, clas_repor)

In [None]:
# Evaluate tree_model and hand the results to the reporter under the 'Tree' label
metrics, confusion, clas_repor = me.get_metrics_and_conf_matrix_and_report(
    tree_model, 10, X, y, labels, pos_label, 'Running'
)
mu.report_metrics_and_confusion('Tree', metrics, confusion, clas_repor)

In [None]:
# Evaluate forest_model and hand the results to the reporter under the 'Forest' label
metrics, confusion, clas_repor = me.get_metrics_and_conf_matrix_and_report(
    forest_model, 10, X, y, labels, pos_label, 'Running'
)
mu.report_metrics_and_confusion('Forest', metrics, confusion, clas_repor)