# Using Cerebral Cortex with Machine Learning Tools

Scenario: train a classifier that recognizes human activities from smartphone sensor data.

Based on this Kaggle competition and example: https://www.kaggle.com/morrisb/what-does-your-smartphone-know-about-you

**Reference:**
Davide Anguita, Alessandro Ghio, Luca Oneto, Xavier Parra and Jorge L. Reyes-Ortiz. _A Public Domain Dataset for Human Activity Recognition Using Smartphones_. 21st European Symposium on Artificial Neural Networks, Computational Intelligence and Machine Learning, ESANN 2013. Bruges, Belgium 24-26 April 2013.

## Initialize Cerebral Cortex

In [None]:
# (Re)load IPython's autoreload extension for this notebook session.
%reload_ext autoreload
# NOTE(review): star import — `Kernel` below presumably comes from here; confirm.
# Keep this import before the explicit ones so they win on any name clash.
from util.dependencies import *
# Instantiate the Cerebral Cortex kernel, passing the cc_conf directory path.
CC = Kernel("/home/md2k/cc_conf/")

import pandas as pd
# Show at most 20 rows before pandas switches to a truncated view.
pd.options.display.max_rows=20
import numpy as np

# To build models
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# LightGBM gradient-boosting classifier
from lightgbm import LGBMClassifier

## Load and combine data from files

In [None]:
# Read each split and tag every row with the split it came from.
train_df = pd.read_csv('ml_data/train.csv').assign(Data='Train')
test_df = pd.read_csv('ml_data/test.csv').assign(Data='Test')

# Stack train on top of test and renumber the rows from zero.
both_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Turn the subject id into a '#'-prefixed string label.
subject_as_text = both_df['subject'].astype(str)
both_df['subject'] = '#' + subject_as_text

## Get stream data
Once a stream is identified by name, it needs to be loaded into a `DataStream` object by calling `get_stream`.  This pulls into a single object all the metadata associated with the stream as well as a reference to the data so that it can be accessed as needed.

In [None]:
# Display the combined train/test frame as the cell output.
both_df

In [None]:
# Detach the target column; both_df keeps only the features plus the
# 'Data' and 'subject' columns afterwards.
label = both_df.pop('Activity')

# Number of samples per activity, shown as the cell output.
(
    label.to_frame()
    .groupby(['Activity'])
    .size()
    .reset_index(name='Counts')
)

In [None]:
# Keep the bookkeeping columns on the side; everything else is a feature.
data_data = both_df['Data'].copy()
subject_data = both_df['subject'].copy()
tsne_data = both_df.drop(columns=['Data', 'subject'])

# Integer-encode the activity labels.
label_encoded = LabelEncoder().fit_transform(label)

# Standardize the features, then keep the principal components that explain
# 95% of the variance (fewer dimensions speeds up the later steps).
# NOTE(review): both transformers are fitted on ALL rows before the split
# below, so held-out rows influence the scaling/PCA (data leakage) — confirm
# this is acceptable for the reported score.
tsne_data = PCA(n_components=0.95, random_state=3).fit_transform(
    StandardScaler().fit_transform(tsne_data)
)

# Random train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    tsne_data,
    label_encoded,
    random_state=3,
)


In [None]:
# Hyper-parameters: number of boosting rounds and the RNG seed.
number_of_estimators, random_states = 50, 3

# Build the classifier and train it on the training split.
lgbm = LGBMClassifier(
    n_estimators=number_of_estimators,
    random_state=random_states,
).fit(X_train, y_train)

# Accuracy on the held-out split, shown as the cell output.
score = accuracy_score(y_test, lgbm.predict(X_test))
score