# Using Cerebral Cortex with Machine Learning Tools

**Scenario:** Classify the type of motion from a smartphone's accelerometer and gyroscope sensors.

This is based on a kaggle competition and example: https://www.kaggle.com/morrisb/what-does-your-smartphone-know-about-you

**Reference:**
Davide Anguita, Alessandro Ghio, Luca Oneto, Xavier Parra and Jorge L. Reyes-Ortiz. _A Public Domain Dataset for Human Activity Recognition Using Smartphones_. 21st European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning, ESANN 2013. Bruges, Belgium 24-26 April 2013.

## Initialize Cerebral Cortex

In [1]:
%reload_ext autoreload
from util.dependencies import *
CC = Kernel("/home/md2k/cc_conf/")
from settings import USER_ID

## Get stream data
Once a stream is identifier by name, it needs to be loaded into a `DataStream` object by calling `get_stream`.  This pulls into a single object all the metadata associated with the stream as well as a reference to the data so that it can be accessed as needed.

In [3]:
both_datastream = CC.get_stream('Kaggle-Features')
label_datastream = CC.get_stream('Kaggle-ActivityLabels')

both_dataframe = both_datastream.to_pandas().data
label_dataframe = label_datastream.to_pandas().data

In [4]:
both_dataframe = both_dataframe.drop(['timestamp','localtime','version','user'], axis=1)
label_dataframe = label_dataframe.drop(['timestamp','localtime','version','user'], axis=1)

In [5]:
label_dataframe.groupby('Activity').size().reset_index(name='Counts')

Unnamed: 0,Activity,Counts
0,LAYING,1944
1,SITTING,1777
2,STANDING,1906
3,WALKING,1722
4,WALKING_DOWNSTAIRS,1406
5,WALKING_UPSTAIRS,1544


In [12]:
both_dataframe

Unnamed: 0,tBodyAcc-mean__-X,tBodyAcc-mean__-Y,tBodyAcc-mean__-Z,tBodyAcc-std__-X,tBodyAcc-std__-Y,tBodyAcc-std__-Z,tBodyAcc-mad__-X,tBodyAcc-mad__-Y,tBodyAcc-mad__-Z,tBodyAcc-max__-X,...,fBodyBodyGyroJerkMag-kurtosis__,angle_tBodyAccMean_gravity_,angle_tBodyAccJerkMean__gravityMean_,angle_tBodyGyroMean_gravityMean_,angle_tBodyGyroJerkMean_gravityMean_,angle_X_gravityMean_,angle_Y_gravityMean_,angle_Z_gravityMean_,subject,Data
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.030400,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,Train
6861,0.275630,-0.015740,-0.109464,-0.995216,-0.983728,-0.983884,-0.996171,-0.981873,-0.983060,-0.938252,...,-0.737707,0.241509,0.816861,-0.160214,-0.002834,-0.700654,-0.037651,-0.146436,29,Train
6862,0.279119,-0.018244,-0.108795,-0.994242,-0.977895,-0.985616,-0.995154,-0.973374,-0.984291,-0.936879,...,-0.817680,-0.143681,-0.287548,-0.599844,0.226497,-0.700674,-0.038028,-0.146082,29,Train
6863,0.278074,-0.018289,-0.108240,-0.994240,-0.977522,-0.982498,-0.994879,-0.972893,-0.981332,-0.936879,...,-0.856223,0.018276,-0.126989,-0.353334,0.040350,-0.701237,-0.037565,-0.145975,29,Train
6864,0.276255,-0.015158,-0.106558,-0.995511,-0.984609,-0.982028,-0.995915,-0.983177,-0.981564,-0.938005,...,-0.909723,-0.028103,0.255360,-0.225968,-0.384613,-0.701290,-0.037308,-0.146154,29,Train
6865,0.457573,-0.036804,-0.126458,-0.838042,-0.946988,-0.798421,-0.832798,-0.940544,-0.795383,-0.754088,...,-0.663755,0.170557,-0.241666,-0.189184,0.075098,0.536540,-0.620756,-0.379147,29,Train
6866,0.245427,-0.018049,-0.080685,-0.977475,-0.978023,-0.955569,-0.978019,-0.980328,-0.949267,-0.920211,...,-0.871589,-0.201961,0.056445,-0.254895,0.173408,0.490286,-0.620126,-0.373981,29,Train
6867,0.278964,-0.016177,-0.112617,-0.986395,-0.993402,-0.996253,-0.987809,-0.994375,-0.995559,-0.935219,...,-0.785954,0.126507,0.229589,-0.420273,0.306465,0.496486,-0.614039,-0.381354,29,Train
6860,0.277392,-0.017737,-0.112006,-0.994905,-0.985625,-0.982497,-0.995653,-0.983264,-0.982050,-0.938252,...,-0.959916,0.197410,-0.091665,0.083654,0.000630,-0.699567,-0.037690,-0.147404,29,Train
6868,0.289587,-0.019164,-0.112241,-0.993805,-0.995319,-0.996225,-0.994404,-0.994400,-0.994552,-0.935219,...,-0.691527,0.305468,-0.344160,-0.258986,0.264814,0.496738,-0.614920,-0.380466,29,Train


In [6]:
import pandas as pd
pd.options.display.max_rows=20

from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Create datasets
tsne_data = both_dataframe.copy()
data_data = tsne_data.pop('Data')
subject_data = tsne_data.pop('subject')

# Scale data
tsne_data = StandardScaler().fit_transform(tsne_data)

# Reduce dimensions (speed up)
tsne_data = PCA(n_components=0.95, random_state=3).fit_transform(tsne_data)

# Split the data
label_encoded = LabelEncoder().fit_transform(label_dataframe.Activity)
X_train, X_test, y_train, y_test = train_test_split(tsne_data, label_encoded, random_state=3)

In [13]:
number_of_estimators=50
random_states=3

# Create the model
lgbm = LGBMClassifier(n_estimators=number_of_estimators, random_state=random_states)
lgbm = lgbm.fit(X_train, y_train)

# Test the model
score = accuracy_score(y_true=y_test, y_pred=lgbm.predict(X_test))
print('Classification Accuracy:',score)

Classification Accuracy: 0.9366990291262136


In [None]:
# from cerebralcortex.core.metadata_manager.stream.metadata import Metadata, DataDescriptor, ModuleMetadata
# stream_metadata = Metadata()

# # Labels
# stream_metadata.set_name("Kaggle-ActivityLabels").set_description("Kaggle competition activity labels") \
#     .add_module(ModuleMetadata().set_name("Kaggle Competition Labels").set_version("1.0.0") \
#                 .set_author("Davide Anguita", "unknown_email"))

# for column in label_dataframe.columns:
#     dd = DataDescriptor().set_name(column)
#     stream_metadata.add_dataDescriptor(dd)

# stream_metadata.is_valid()
# stream_metadata.to_json()



# datastream_metadata = Metadata()

# # Labels
# datastream_metadata.set_name("Kaggle-Features").set_description("Kaggle competition features") \
#     .add_module(ModuleMetadata().set_name("Kaggle Competition Features").set_version("1.0.0") \
#                 .set_author("Davide Anguita", "unknown_email"))

# for column in both_df.columns:
#     column.replace
#     dd = DataDescriptor().set_name(column)
#     datastream_metadata.add_dataDescriptor(dd)

# # datastream_metadata.is_valid()
# # datastream_metadata.to_json()



# stream_metadata.to_json()




In [None]:
# import datetime
# both_df['timestamp'] = datetime.datetime.utcnow()
# both_df['localtime'] = datetime.datetime.now()
# both_df['user'] = USER_ID
# both_df['version'] = 1

# label_dataframe['timestamp'] = datetime.datetime.utcnow()
# label_dataframe['localtime'] = datetime.datetime.now()
# label_dataframe['user'] = USER_ID
# label_dataframe['version'] = 1


In [None]:
# label_dataframe

In [None]:
# ds_label = DataStream(both_df, datastream_metadata)

In [None]:
# CC.save_stream(ds_label)

In [None]:
# CC.get_stream('Kaggle-Features').summary()
# CC.get_stream('Kaggle-ActivityLabels').summary()