**Importing modules and dependencies. Reading CSV data**
.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, r2_score
from datetime import datetime
start_time = datetime.now()


TRAIN_LABELS_FILE = 'https://courses.edx.org/assets/courseware/v1/d64e74647423e525bbeb13f2884e9cfa/asset-v1:HarvardX+PH526x+2T2020+type@asset+block/train_labels.csv'
TRAIN_TIME_SERIES_FILE = 'https://courses.edx.org/assets/courseware/v1/b98039c3648763aae4f153a6ed32f38b/asset-v1:HarvardX+PH526x+2T2020+type@asset+block/train_time_series.csv'
TEST_LABELS_FILE = 'https://courses.edx.org/assets/courseware/v1/72d5933c310cf5eac3fa3f28b26d9c39/asset-v1:HarvardX+PH526x+2T2020+type@asset+block/test_labels.csv'
TEST_TIME_SERIES_FILE = 'https://courses.edx.org/assets/courseware/v1/1ca4f3d4976f07b8c4ecf99cf8f7bdbc/asset-v1:HarvardX+PH526x+2T2020+type@asset+block/test_time_series.csv'

train_time_series_df = pd.read_csv(TRAIN_TIME_SERIES_FILE, index_col=0)
train_labels_df = pd.read_csv(TRAIN_LABELS_FILE, index_col=0)
print(train_time_series_df.tail())
print(train_labels_df.tail())

           timestamp                 UTC time accuracy         x         y  \
24325  1565110305638  2019-08-06T16:51:45.638  unknown  0.024384 -0.710709   
24326  1565110305738  2019-08-06T16:51:45.738  unknown  0.487228 -1.099136   
24327  1565110305838  2019-08-06T16:51:45.838  unknown  0.369446 -0.968506   
24328  1565110305939  2019-08-06T16:51:45.939  unknown  0.167877 -0.802826   
24329  1565110306039  2019-08-06T16:51:46.039  unknown  0.689346 -0.991043   

              z  
24325  0.030304  
24326 -0.015213  
24327  0.036713  
24328  0.049805  
24329  0.034973  
           timestamp                 UTC time  label
24289  1565110302030  2019-08-06T16:51:42.030      4
24299  1565110303032  2019-08-06T16:51:43.032      4
24309  1565110304034  2019-08-06T16:51:44.034      4
24319  1565110305037  2019-08-06T16:51:45.037      4
24329  1565110306039  2019-08-06T16:51:46.039      4


**Prep the data to fit the classifier with.**

In [2]:
# Only uses the rows from the given label set to train.
# train_data = train_time_series_df[train_time_series_df['timestamp'].isin(train_labels_df['timestamp'])]
# X = train_data[['x', 'y', 'z']] #.iloc[::10, :]
# y = train_labels_df['label']

# Fills in '2' for rows not provided in training label set.
# train_time_series_df['label'] = train_labels_df['label']
# train_time_series_df['label'] = train_time_series_df['label'].fillna(2)

# Fills increments of empty cells with the value from the provided cell data.
train_time_series_df['label'] = train_labels_df['label']
labels = []
time_data_iterator = train_time_series_df.iterrows()
data_index, data_row = next(time_data_iterator)
for labels_index, labels_row in train_labels_df.iterrows():
    while data_row['timestamp'] <= labels_row['timestamp']:
        labels.append(labels_row['label'])
        try:
            data_index, data_row = next(time_data_iterator)
        except StopIteration:
            break
train_time_series_df['label'] = labels

# Average the (x, y, z) for each increment
# avgs = []
# time_data_iterator = train_time_series_df.iterrows()
# data_index, data_row = next(time_data_iterator)
# for labels_index, labels_row in train_labels_df.iterrows():
#     section = []  
#     while data_row['timestamp'] <= labels_row['timestamp']:
#         section.append(data_row[['x','y','z']])
#         try:
#             data_index, data_row = next(time_data_iterator)
#         except StopIteration:
#             break
#     section_df = pd.DataFrame(section, columns=['x','y','z'])
#     avgs.append(np.mean(section_df[['x','y','z']]))
    
# print(avgs[8:13])

# Get changes in (x, y, z) vector for each increment
# mags = []
# time_data_iterator = train_time_series_df.iterrows()
# data_index, data_row = next(time_data_iterator)
# for labels_index, labels_row in train_labels_df.iterrows():
#     section = []  
#     while data_row['timestamp'] <= labels_row['timestamp']:
#         section.append(data_row[['x','y','z']])
#         try:
#             data_index, data_row = next(time_data_iterator)
#         except StopIteration:
#             break
#     section_df = pd.DataFrame(section, columns=['x','y','z'])
#     mags.append((np.linalg.norm(section_df[['x']]), np.linalg.norm(section_df[['y']]), np.linalg.norm(section_df[['z']])))
# print(mags[0:5])

# mags = []
# time_data_iterator = train_time_series_df.iterrows()
# data_index, data_row = next(time_data_iterator)
# for labels_index, labels_row in train_labels_df.iterrows():
#     section = []  
#     while data_row['timestamp'] <= labels_row['timestamp']:
#         section.append(data_row[['x','y','z']])
#         try:
#             data_index, data_row = next(time_data_iterator)
#         except StopIteration:
#             break
#     section_df = pd.DataFrame(section, columns=['x','y','z'])
#     mags.append((np.diff(section_df[['x']].values), np.diff(section_df[['y']].values), np.diff(section_df[['z']].values)))
# print(mags[0:5])


**Instantiate the classifier and fit it with the training data. Get the accuracy score on the training set.** Test different parameter values to tune the final model.

In [3]:
X = train_time_series_df[['x', 'y', 'z']] #[train_time_series_df['timestamp'].isin(train_labels_df['timestamp'])] #.iloc[::10, :]
y = train_time_series_df['label']
mX = train_time_series_df[['x', 'y', 'z']][train_time_series_df['timestamp'].isin(train_labels_df['timestamp'])] #.iloc[::10, :]
my = train_labels_df['label']

# # Test different min_samples_split values
# min_samples_split_results = []
# M_min_samples_split_results = []
# for i in range(2, 10):
#     forest_classifier = RandomForestClassifier(n_estimators=400, min_samples_split=i,random_state=0)
#     forest_classifier.fit(X,y)
#     M_min_samples_split_results.append(accuracy_score(my, forest_classifier.predict(mX)))
#     min_samples_split_results.append(accuracy_score(y, forest_classifier.predict(X)))

# xlabels = list(range(2,10))
# plt.plot(xlabels, min_samples_split_results, 'b')
# plt.plot(xlabels, M_min_samples_split_results, 'r')
# plt.show()
    
# # Test different max_depth values 
# max_depth_split_results = []
# M_max_depth_split_results = []
# for i in range(2, 16):
#     forest_classifier = RandomForestClassifier(n_estimators=400, min_samples_split=20, max_depth=i,random_state=0)
#     forest_classifier.fit(X,y)
#     M_max_depth_split_results.append(accuracy_score(my, forest_classifier.predict(mX)))
#     max_depth_split_results.append(accuracy_score(y, forest_classifier.predict(X)))

# xlabels = list(range(2,16))
# plt.plot(xlabels, max_depth_split_results, 'b')
# plt.plot(xlabels, M_max_depth_split_results, 'r')
# plt.show()

# Final model
# forest_classifier = RandomForestClassifier(n_estimators=400, min_samples_split=4, max_depth=16, random_state=0)
# forest_classifier.fit(X,y)

forest_classifier = RandomForestClassifier(n_estimators=400,min_samples_split=6, max_depth=14, random_state=0)
forest_classifier.fit(mX,my)

print(accuracy_score(my, forest_classifier.predict(mX)))
print(accuracy_score(y, forest_classifier.predict(X)))



0.9413333333333334
0.6335470085470085


**Get predictions with the trained model.**

In [5]:
test_data_df = pd.read_csv(TEST_TIME_SERIES_FILE, index_col=0)
test_labels = pd.read_csv(TEST_LABELS_FILE, index_col=0)

test_data = test_data_df[test_data_df['timestamp'].isin(test_labels_df['timestamp'])]
test_data = test_data[['x', 'y', 'z']]

predictions = forest_classifier.predict(test_data)
# Print mean and counts for reference
print(np.mean(predictions))
print(Counter(predictions))
print(len(predictions))

# Plot values 
labels, values = zip(*Counter(predictions).items())
plt.scatter(labels, values)
plt.xticks(labels, labels)

NameError: name 'test_labels_df' is not defined

**Save the predictions to the test labels CSV file.**

In [6]:
test_labels['label'] = predictions
test_labels.to_csv('./final-project/data/test_labels.csv')
for x in predictions:
    print(f'{x}, ', end='')

NameError: name 'predictions' is not defined

In [7]:
print(datetime.now() - start_time)

0:00:39.500778
