In [2]:
import pandas as pd
import numpy as np

import pyarrow.parquet as pq

import os
import sys

from tqdm import tqdm

current_folder_path = os.path.abspath('')

## CSV data
#### Train events
Contains the target data, ie the *starting to sleep* and *waking up* events.
- **series_id** : unique subject id, primary key when linked to the **train_series** parquet data  
- **night** : counts the nth night of the serie  
- **event** : target of the studie, with to values : *onset* and *wakeup*  
- **step**: gives the index sample data corresponding to the event in the of the **train_series** parquet data. When multiplied by 5, gives the equivalent lasted time since the begining of the studie on the current test subject.  
- **timestamp** : datetime object, exact time of the event, presented in the form of yyyy-mm-ddThh-mm-ss-utc

#### Sample submission
An example of the format in which the aswer data should be submitted to be evaluated


In [176]:
df_train_events = pd.read_csv(current_folder_path + "/train_events.csv").dropna()
df_train_events.head()


Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


#### Timestamp conversion
The timestamps are converted to instant of the day, as the date doesn't matter when sstudying daily events such as going to bed and waking up.
Thus, the timestamps of every sleeping events, for every test subject (series_id) are converted in seconds//5.
To get the time in hour format, the following retreiving operation can easily be computed :
```python
real_seconds = seconds * 5 
h = real_seconds // 3600
m = (real_seconds - h*3600) // 60
s = (real_seconds - h*3600 - m*60)


In [177]:
#Eg: 01:01:01 ie, 720*5 + 12*5 + 0.2*5 = 3600 + 60 + 1 = 1h + 1m + 1s

seconds = 720 + 12 + 0.2
real_seconds = seconds * 5 
h = round(real_seconds // 3600)
m = round((real_seconds - h*3600) // 60)
s = round(real_seconds - h*3600 - m*60)

print(f"{h}:{m}:{s}")

1:1:1


## Target data editing

In [186]:
show_data = 0

# Converted timestamps storing df init
df_train_events_timestamp_extension = pd.DataFrame(columns=["timestamp (hh:mm:ss)", "timestamp (seconds/5)"])

# Loop over every test subject
# Isn't useful for time conversion, because it is independant of the subject id, but will come in handy in the next steps
series_id = pd.unique(df_train_events["series_id"])
print("Test subjects studied :", len(series_id), "unique id")
for serie_id in tqdm(series_id):
    """
    Timestamp conversion into daily elapsed instants (1 instant = 5 seconds)
    """
    # Extracting every sample "target" values, for each test subject (serie_id)  
    df_train_events_individual = df_train_events[(df_train_events["series_id"] == serie_id)]
    
    if show_data:
        display(df_train_events_individual.head())
    
    # Extracting the instant of the day when an event happened (dropping the year/month/day)
    df_day_time = pd.DataFrame([datetime.strptime(t, "%Y-%m-%dT%H:%M:%S%z").time()
                                for t in df_train_events_individual["timestamp"].values],
                               columns=["timestamp (hh:mm:ss)"])
    if show_data:
        display(df_day_time.head())
    
    # Converting the instant in seconds ranging from 00:00:00 to 23:59:59, 5 seconds step (00:00:00, 00:00:05, ...)
    midnight = datetime.combine(datetime.today(), time(0, 0, 0))
    df_day_time["timestamp (seconds/5)"] = pd.DataFrame([(datetime.combine(datetime.today(), t) - midnight).total_seconds()//5
                                                       for t in df_day_time["timestamp (hh:mm:ss)"].values])
    if show_data:
        display(df_day_time.head())
        
    # Saving the converted timestamps rows of the current test subject
    df_train_events_timestamp_extension = pd.concat([df_train_events_timestamp_extension, df_day_time], 
                                                    ignore_index=True)

# Concatenating the two new seconds columns to the train_events dataframe
df_train_events_updated = pd.concat([df_train_events, df_train_events_timestamp_extension], axis=1)

"""
Event conversion into targets
"""
# "Onset" event is labelled as target 0
# "Wakeup" event is labelled as target 1
event_replacement = {'onset': 0, 'wakeup': 1}
df_train_events_updated = df_train_events_updated.replace(event_replacement)

"""
Dataframe conversion to csv and saving
"""
display(df_train_events_updated.head())
df_train_events_updated.to_csv("train_events_updated.csv", index=False)
print("Updated data saved as 'train_events_updated.csv'")

Test subjects studied : 269 unique id


100%|██████████| 269/269 [00:00<00:00, 460.29it/s]


Unnamed: 0,series_id,night,event,step,timestamp,timestamp (hh:mm:ss),timestamp (seconds/5)
0,038441c925bb,1.0,0.0,4992.0,2018-08-14T22:26:00-0400,22:26:00,16152.0
1,038441c925bb,1.0,1.0,10932.0,2018-08-15T06:41:00-0400,06:41:00,4812.0
2,038441c925bb,2.0,0.0,20244.0,2018-08-15T19:37:00-0400,19:37:00,14124.0
3,038441c925bb,2.0,1.0,27492.0,2018-08-16T05:41:00-0400,05:41:00,4092.0
4,038441c925bb,3.0,0.0,39996.0,2018-08-16T23:03:00-0400,23:03:00,16596.0


Updated data saved as 'train_events_updated.csv'


#### Submission data structure
The submission file should be generated as:
- row_id : row number, found in the test_series parquet
- series_id : unique identifyer, found in the test_series parquet
- step : instant of the event daily-wise, according to the evaluation criteria, should be within [12, 36, 60, 90, 120, 150, 180, 240, 300, 360] steps
- event : predicted event, **target should be converted from label to string**
- score : confidence score, should return the value of the argmax, ie the probability of the event happening, according to the nn output 
```python
predicted_class = class_probabilities.argmax(dim=1)
confidence_scores = class_probabilities.max(dim=1).values  
#eg: 
class_probabilities = [0.03, 0.76, 0.28]
predicted_class = 1
#Thus, score=0.76 for event=1
```




In [187]:
df_sample_submission = pd.read_csv(current_folder_path + "/sample_submission.csv")
df_sample_submission.head()


Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,100,onset,0.0
1,1,038441c925bb,105,wakeup,0.0
2,2,03d92c9f6f8a,80,onset,0.5
3,3,03d92c9f6f8a,110,wakeup,0.5
4,4,0402a003dae9,90,onset,1.0


## Parquet data

In [162]:
df_test_series = pd.read_parquet(current_folder_path + "/test_series.parquet")
df_test_series.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215


In [154]:
batch_parquet = pq.ParquetFile(current_folder_path + "/train_series.parquet")

c=0
for batch in batch_parquet.iter_batches():
    c+=1
    df_batch_train_series = batch.to_pandas()
    display(df_batch_train_series.head(1))
    display(df_batch_train_series.tail(1))
    print(df_batch_train_series.shape)
    print(pd.unique(df_batch_train_series["series_id"]))
    
    condition = 
    target_array = 
    
    if c>=10:
        break
        
print(batch_parquet.metadata)


SyntaxError: invalid syntax (1336740548.py, line 12)

In [20]:
c=0
for batch in batch_parquet.iter_batches():
    c+=1
print(c)    

1953
