In [2]:
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import os

from utils import *


# Data Exploration

#### Phone Types

In [3]:
# Participant -> phone platform lookup table.
phone_data = "./data/phone_types.csv"
df_phones = pd.read_csv(phone_data)
# Tally of participants per phone platform, followed by the table itself.
print(df_phones["phonetype"].value_counts())
df_phones

iPhone     11
Android     2
Name: phonetype, dtype: int64


Unnamed: 0,pid,phonetype
0,BK7610,iPhone
1,BU4707,iPhone
2,CC6740,Android
3,DC6359,iPhone
4,DK3500,iPhone
5,HV0618,iPhone
6,JB3156,Android
7,JR8022,iPhone
8,MC7070,iPhone
9,MJ8002,iPhone


#### Accelerometer Data

In [4]:
# Load the combined accelerometer readings of all participants into memory.
acc_data = "./data/all_accelerometer_data_pids_13.csv"
acc_df = pd.read_csv(acc_data)
print(f"Number of datapoints = {len(acc_df)}")
# Preview the first rows (columns in the saved output: time, pid, x, y, z).
# NOTE(review): the saved preview shows rows with time == 0 and all-zero x/y/z
# for JB3156 and CC6740 — presumably placeholder rows; confirm they are
# filtered out before any analysis.
acc_df.head()

Number of datapoints = 14057567


Unnamed: 0,time,pid,x,y,z
0,0,JB3156,0.0,0.0,0.0
1,0,CC6740,0.0,0.0,0.0
2,1493733882409,SA0297,0.0758,0.0273,-0.0102
3,1493733882455,SA0297,-0.0359,0.0794,0.0037
4,1493733882500,SA0297,-0.2427,-0.0861,-0.0163


In [5]:
# Distinct participant ids present in the accelerometer data.
unique_users = acc_df["pid"].unique()
print(unique_users)
print(f"Number of unique users: {len(unique_users)}")

['JB3156' 'CC6740' 'SA0297' 'PC6771' 'BK7610' 'DC6359' 'MC7070' 'MJ8002'
 'BU4707' 'JR8022' 'HV0618' 'SF3079' 'DK3500']
Number of unique users: 13


In [6]:
# Number of accelerometer rows recorded for each participant.
readings = acc_df["pid"].value_counts()

# Bar chart: one bar per participant id, bar height = row count.
fig = px.bar(
    readings,
    x=readings.index,
    y=readings.values,
    height=600,
    width=1200,
)
fig.update_layout(
    xaxis_title="User",
    yaxis_title="Frequency",
    # Centre the title horizontally and anchor it near the top of the figure.
    title=dict(
        text="Number of readings per user",
        y=0.93,
        x=0.5,
        xanchor="center",
        yanchor="top",
    ),
    margin=dict(l=10, r=10, t=20, b=20),
)
fig.show()


In [7]:
# Column dtypes and memory footprint of the full accelerometer frame.
acc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14057567 entries, 0 to 14057566
Data columns (total 5 columns):
 #   Column  Dtype  
---  ------  -----  
 0   time    int64  
 1   pid     object 
 2   x       float64
 3   y       float64
 4   z       float64
dtypes: float64(3), int64(1), object(1)
memory usage: 536.3+ MB


In [8]:
# Summary statistics of the numeric columns (time, x, y, z).
# NOTE(review): in the saved output the y minimum and z maximum are on the
# order of 1.8e8 while the quartiles sit near zero, so the y/z mean and std
# are dominated by a few extreme readings — looks like corrupt samples;
# confirm and handle them before relying on these statistics.
acc_df.describe()

Unnamed: 0,time,x,y,z
count,14057570.0,14057570.0,14057570.0,14057570.0
mean,1493778000000.0,-0.009269848,-7168398.0,7168398.0
std,564045300.0,0.9540696,35289980.0,35289980.0
min,0.0,-43.33507,-180900800.0,-49.023
25%,1493755000000.0,-0.0057,-0.0045,-0.0042
50%,1493779000000.0,-0.0002,-0.0002,0.0061
75%,1493801000000.0,0.0076,0.0045,0.04605889
max,1493829000000.0,39.2254,27.31123,180900800.0


In [9]:
# Accelerometer readings of one participant (CC6740) with a human-readable
# timestamp column. `time` holds epoch milliseconds, hence unit='ms'.
# The column is added with .assign so the frame returned by get_acc_user is
# never modified in place: no SettingWithCopyWarning if the helper returns a
# slice of acc_df, and re-running the cell always gives the same result.
user = get_acc_user(acc_df, 'CC6740').assign(
    datetime=lambda frame: pd.to_datetime(frame['time'], unit='ms')
)
user.head()


Unnamed: 0,time,pid,x,y,z,datetime
0,1493740580198,CC6740,0.019649,-0.007789,0.168013,2017-05-02 15:56:20.198
1,1493740580217,CC6740,-0.018393,0.01212,0.129684,2017-05-02 15:56:20.217
2,1493740580243,CC6740,0.020991,0.012421,0.110526,2017-05-02 15:56:20.243
3,1493740580268,CC6740,-0.036003,-0.102264,0.148829,2017-05-02 15:56:20.268
4,1493740580293,CC6740,-0.015756,-0.043726,0.09134,2017-05-02 15:56:20.293


In [None]:
%run utils.py
for user in unique_users:
	plot_acc_reading(acc_df, user)


In [None]:
# Re-execute utils.py so the notebook namespace holds its current definitions.
%run utils.py
# Pre-process accelerometer data for each pid and save as pkl file.
# NOTE(review): `path` is presumably the input directory and `new_path` the
# output directory — inferred from the names only; confirm in utils.py.
path = "data/"
new_path = "preprocessed_data/"
preprocess_acc(path, new_path)

#### TAC (Transdermal Alcohol Content)

The cleaned TAC readings are stored in `clean_tac/*.csv`.  
Each user has their own TAC readings, i.e., a continuous time series.

In [10]:
# Load the cleaned TAC readings of a single participant (BK7610) and preview them.
tac_file = "./data/clean_tac/BK7610_clean_TAC.csv"
tac_df = pd.read_csv(tac_file)
tac_df.head()


Unnamed: 0,timestamp,TAC_Reading
0,1493718714,-0.000482
1,1493720697,0.001573
2,1493721027,0.002144
3,1493721357,0.000877
4,1493721686,-0.001145


In [11]:
# Column dtypes and non-null counts of the BK7610 TAC frame.
tac_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   timestamp    57 non-null     int64  
 1   TAC_Reading  57 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 1.0 KB


In [12]:
# Summary statistics of the BK7610 TAC readings.
# NOTE(review): the saved output shows a slightly negative minimum
# TAC_Reading — presumably sensor noise around zero; confirm.
tac_df.describe()

Unnamed: 0,timestamp,TAC_Reading
count,57.0,57.0
mean,1493758000.0,0.041313
std,28415.95,0.050705
min,1493719000.0,-0.002732
25%,1493729000.0,0.000744
50%,1493756000.0,0.012099
75%,1493782000.0,0.074544
max,1493808000.0,0.171758


In [25]:
# Re-execute utils.py so the notebook namespace holds its current definitions.
%run utils.py
# Build `tac_all` from the files under ./data/clean_tac/.
# NOTE(review): judging by the saved preview of tac_all this is one combined
# table across participants with added pid / intoxicated / datetime columns —
# confirm against preprocess_tac in utils.py.
tac_all = preprocess_tac("./data/clean_tac/")

In [26]:
# Preview the combined TAC table (columns in the saved output: timestamp,
# TAC_Reading, pid, intoxicated, datetime).
tac_all.head()

Unnamed: 0,timestamp,TAC_Reading,pid,intoxicated,datetime
0,1493716723,-0.010229,SA0297,0.0,2017-05-02 09:18:43
1,1493717859,4.3e-05,PC6771,0.0,2017-05-02 09:37:39
2,1493718546,-0.002512,SA0297,0.0,2017-05-02 09:49:06
3,1493718714,-0.000482,BK7610,0.0,2017-05-02 09:51:54
4,1493718714,-0.000482,BU4707,0.0,2017-05-02 09:51:54


In [None]:
# Re-execute utils.py so the notebook namespace holds its current definitions.
%run utils.py
# Pass the combined TAC table to the plotting helper from utils.py.
plot_tac_reading(tac_all)


In [27]:
# Preview tac_all again; this is the same expression as the earlier preview
# cell and the saved output is identical.
tac_all.head()

Unnamed: 0,timestamp,TAC_Reading,pid,intoxicated,datetime
0,1493716723,-0.010229,SA0297,0.0,2017-05-02 09:18:43
1,1493717859,4.3e-05,PC6771,0.0,2017-05-02 09:37:39
2,1493718546,-0.002512,SA0297,0.0,2017-05-02 09:49:06
3,1493718714,-0.000482,BK7610,0.0,2017-05-02 09:51:54
4,1493718714,-0.000482,BU4707,0.0,2017-05-02 09:51:54


In [28]:
# Import the module so its functions can be called as feature_extraction.<name>.
# The %run additionally executes the file and loads its top-level names
# directly into the notebook namespace, so they can also be called unqualified.
import feature_extraction
%run feature_extraction.py


In [2]:
# Directory handed to the feature-engineering step. The saved output lists one
# <pid>_preprocessed_acc.pkl file per participant, so this is presumably where
# the preprocessed pickles live.
# The default is the original hard-coded location; set the
# PREPROCESSED_ACC_DIR environment variable to run the notebook on a machine
# where the data lives elsewhere.
# NOTE(review): the preprocessing cell writes to the relative directory
# "preprocessed_data/" — confirm which location is the intended one.
acc_path = os.environ.get("PREPROCESSED_ACC_DIR", "D:/preprocessed_data/")
full_acc = feature_extraction.run_feature_engineering(acc_path)

BK7610_preprocessed_acc.pkl
BU4707_preprocessed_acc.pkl
CC6740_preprocessed_acc.pkl
DC6359_preprocessed_acc.pkl
DK3500_preprocessed_acc.pkl
HV0618_preprocessed_acc.pkl
JB3156_preprocessed_acc.pkl
JR8022_preprocessed_acc.pkl
MC7070_preprocessed_acc.pkl
MJ8002_preprocessed_acc.pkl
PC6771_preprocessed_acc.pkl
SA0297_preprocessed_acc.pkl
SF3079_preprocessed_acc.pkl


In [29]:
# Load the accelerometer feature table from disk.
# NOTE(review): full_acc.csv is presumably written by run_feature_engineering;
# nothing visible in this notebook creates it — confirm.
full_acc_df = pd.read_csv("full_acc.csv")

In [31]:
# Re-execute feature_extraction.py so its top-level names (reconcile_acc_tac is
# called unqualified below) are present in the notebook namespace.
%run feature_extraction.py
# Combine the accelerometer features with the TAC table.
# NOTE(review): how the two are aligned (presumably by pid and time) is not
# visible here — confirm in feature_extraction.py.
merged = reconcile_acc_tac(full_acc_df, tac_all)

In [33]:
# Persist the merged table. With the default arguments to_csv also writes the
# index as an unnamed first column.
merged.to_csv("extracted_features.csv")