Initial tests of classifiers, before the use of `xarray`.

In [13]:
import collections, random, os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import sklearn.model_selection, sklearn.naive_bayes, sklearn.tree, sklearn.neural_network, sklearn.neighbors, sklearn.ensemble

In [14]:
Recording = collections.namedtuple('Recording',['fn','df'])
def load_data(dirpath):
    dfs = []
    for root, _, filenames in os.walk(dirpath):
        for filename in filenames:
            filepath = os.path.join(root,filename)
            nxt = pd.read_csv(filepath,index_col=0,compression='gzip')
            dfs.append(Recording(filename,nxt))
    return dfs

In [15]:
# load data files
data = load_data('../data_final')

In [16]:
# get only 2013 set and 2014 set
data_2013 = [rec for rec in data if rec.fn.startswith('2013')]
data_2014 = [rec for rec in data if rec.fn.startswith('2014')]

In [17]:
# numbers of recordings of animals with only saline, no QNP
saline = [50, 51, 52, 53, 54, 55,
326, 327, 328, 329, 330, 331,
355, 356, 357, 358, 359, 360,
386, 387, 388, 389, 390, 391,
415, 416, 417, 418, 419, 420,
444, 445, 446, 447, 448, 449,
473, 474, 475, 476, 477,478]

qnp = [302, 303, 304, 305, 306, 307, 308, 309,
332, 333, 334, 335, 336, 337, 338, 339,
361, 362, 363, 364, 365, 366, 367, 368,
392, 393, 394, 395, 396, 397, 398, 399,
421, 422, 423, 424, 425, 426, 427, 428,
450, 451, 452, 453, 454, 455, 456, 457]

In [18]:
# filtered into only QNP+saline and saline+saline
data_2013_filt = [rec for rec in data_2013 if int(rec.fn.split('_')[1].split('.')[0]) in saline+qnp]

In [19]:
def issaline13(i):
    return i in saline
def issaline14(i):
    return i % 2 != 0

In [20]:
# create labels
y = np.array([issaline14(int(rec.fn.split('_')[1].split('.')[0])) for rec in data_2014])
print(y)

[ True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False  True False
  True False  True False  True False  True False  True False False  True
 False  True False  True False  True False  True False  True False  True
 False  True False  True False  True False  True False  True False  True
 False  True False  True False  True False  True False  True False  True
 False  True False  True False  True False  True False  True False False
  True False False  True False  True False  True False]


In [21]:
#create feature vectors, better version.


In [38]:
# create feature vectors
def get_features(df):
    df = df[[col for col in df.columns if col != 'Trial time (s)']]
    return pd.concat([df.mean(),df.std()]) # just each column's mean and std
X = [get_features(rec.df) for rec in data_2014]
#X.reshape(1, -1)
#X = np.reshape(X, (1, -1)).T
print(len(X))


153


In [41]:
X[0]

X center (cm)       12.473352
Y center (cm)       42.709479
Area (cm²)          67.203923
Areachange (cm²)     2.568563
Elongation           0.666619
Direction (deg)    -14.209945
X center (cm)        3.795180
Y center (cm)        9.419231
Area (cm²)           6.990185
Areachange (cm²)     2.257927
Elongation           0.103604
Direction (deg)     85.927874
dtype: float64

In [37]:
# create, train and score classifiers
simple_classifiers = [sklearn.naive_bayes.GaussianNB,
                      sklearn.tree.DecisionTreeClassifier,
                      sklearn.neural_network.MLPClassifier,
                      sklearn.neighbors.KNeighborsClassifier]
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)

for classifier in simple_classifiers:
    c = classifier()
    r = sklearn.model_selection.cross_val_score(c,X,y,cv=10) # 10-fold cross validation
    print(classifier.__name__, r.mean(), r.std())



ValueError: setting an array element with a sequence.

In [25]:
#Try Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_blobs
X1, y1 = make_blobs(n_samples=153, n_features=10, centers=2, random_state=0)
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
scores = sklearn.model_selection.cross_val_score(clf, X1, y1)
print(len(X1))
#Take mean and std
scores.mean(), scores.std()

153


(1.0, 0.0)

In [None]:
# get only 2014 set
X14 = [get_features(rec.df) for rec in data_2013_filt]

In [None]:
# predict the 2014 labels
c = sklearn.naive_bayes.GaussianNB()
c.fit(X,y)
y14 = c.predict(X14)

In [None]:
# animals that received only saline
x = sorted(rec.fn for rec,l in zip(data_2014,y14) if not l)

In [None]:
sum(int(i.split('_')[1].split('.')[0]) not in saline for i in x)/len(x)

In [None]:
x

In [None]:
y14real = np.array([issaline13(int(rec.fn.split('_')[1].split('.')[0])) for rec in data_2013_filt])

In [None]:
sum(y14real == y14)/len(y14)

In [None]:
y