In [161]:
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [162]:
data_path = os.path.expanduser('Data/Data1.csv')
df = pd.read_csv(data_path, comment='#')
df.rename(columns={
    'Timestamp': 'timestamp',
    'Royal Canin Lowfat Can (13.6oz)': 'can',
    'Sweet Potato (11.5oz)': 'sweet',
    'Golden Potato (5.2oz)': 'golden',
    'Outcome': 'outcome',
    'Were We Sleeping / Jake Was Alone Just Before': 'alone',
    'Was it an Accident?': 'accident',
    'Notes': 'notes',
    'Poop Event': 'poop',
    'Feed Event': 'feed'
}, inplace=True)
df.head()

Unnamed: 0,timestamp,can,sweet,golden,outcome,alone,accident,notes
0,12/27/2018 8:57:58,2.0,1.0,1.0,,,,
1,12/27/2018 9:10:46,,,,Regular,,,Outside
2,12/27/2018 12:41:22,2.0,1.0,1.0,,,,
3,12/27/2018 14:10:08,,,,Regular,,,Outside
4,12/27/2018 14:14:35,,,,,,,Jake got 2 gabapentin at lunch and 2 at 2pm. O...


In [165]:
# This was done once on the original dataset. No longer necessary
def to_timestamp(t):
    dt = datetime.strptime(t, '%m/%d/%Y %H:%M:%S')
    return dt.strftime('%s')
df.timestamp = [to_timestamp(t) for t in df.timestamp.values]

set(df.outcome)
renames = {'Big One': 3, 'Regular': 2, 'No Poop': 0}
for k, v in renames.items():
    df.loc[df.outcome == k, 'outcome'] = v


12/27/2018 8:57:58 2018-12-27 08:57:58


ValueError: Invalid format string

In [None]:
plt.figure(figsize=(17, 7))
plt.plot(df.timestamp, df.can, 'o', alpha=1, label='can')
plt.plot(df.timestamp, df.sweet, 'o', alpha=1, label='sweet')
plt.plot(df.timestamp, df.golden, 'o', alpha=1, label='golden')
plt.plot(df.timestamp, df.outcome, 'o', alpha=1.0, label='outcome')
plt.legend()
plt.show()


In [None]:
ddf = df.copy()
ddf.loc[df.index, 'poop'] = False
ddf.loc[df.index, 'feed'] = True
ddf.loc[df.outcome >= 0, 'poop'] = True
ddf.loc[df.outcome >= 0, 'feed'] = False
ddf.loc[df.alone == 'Yes', 'alone'] = True
ddf.loc[df.accident == 'Yes', 'accident'] = True
df = ddf

In [None]:
def accumulate_pastdata(df, hours_ago=24, span=3):
    x = 0; y = 0
    df = df.sort_values(by='timestamp')
    for index, row in df.iterrows():
        if not row.poop:
            continue
        dt = datetime.fromtimestamp(row.timestamp)
        df.loc[index, 'hour_of_day'] = dt.hour
        for h in range(span, hours_ago + span, span):
            start = row.timestamp - (h * 3600)
            end = row.timestamp - (h - span) * 3600
            ddf = df[(df.timestamp > start) & (df.timestamp < end)]
            s = ddf.sum()
            start_str = h - span
            end_str = h
            df.loc[index, 'golden_{}_to_{}'.format(start_str, end_str)] = s.golden
            df.loc[index, 'can_{}_to_{}'.format(start_str, end_str)] = s.can
            df.loc[index, 'sweet_{}_to_{}'.format(start_str, end_str)] = s.sweet
            df.loc[index, 'outcome_{}_to_{}'.format(start_str, end_str)] = s.outcome  
    return df
    

ddf = accumulate_pastdata(df.copy())
ddf.head()

In [None]:
ddf.to_csv('Data/Data1.processed.csv')

In [None]:
plt.figure(figsize=(17, 7))
print(ddf.columns)
pdf = ddf[ddf.poop == True]
plt.plot(pdf.sweet_0_to_3, pdf.outcome, 'o')

In [None]:
#Build data for classification
feature_columns = [c for c in ddf.columns if re.search('_to_',c)] + ['hour_of_day']
ddf = ddf[ddf.index >= 8]  # cut out beginning of data set
X = ddf.loc[ddf.poop, feature_columns].values
Y = (ddf.outcome[ddf.poop].values > 0).astype(np.int)

In [None]:
#Classifier decision trees
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X, Y)
scores = cross_val_score(clf, X, Y, cv=10)

print("Machien Score: %.2f Human Score: %.2f" %(scores.mean() , (np.array(Y).sum()/np.array(Y).shape[0])))

In [None]:
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
display_columns = np.array(feature_columns)
fig, ax = plt.subplots(1, 1, figsize=(17, 7))
ax.set_title("Feature importances")
ax.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(range(X.shape[1]))
ax.set_xlim([-1, X.shape[1]])
ax.set_xticklabels(display_columns[indices])
for tick in ax.get_xticklabels():
    tick.set_rotation(90)
plt.show()

# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %s (%f)" % (f + 1, display_columns[indices[f]], importances[indices[f]]))

In [None]:
#Classifier LASSO
from sklearn import linear_model
lso = linear_model.LogisticRegression(solver='lbfgs')
#lso = linear_model.Lasso(alpha=0.01)
lso.fit(X, Y)
scores = cross_val_score(lso, X, Y, cv=10)

print("Machien Score: %.2f Human Score: %.2f" %(scores.mean() , (np.array(Y).sum()/np.array(Y).shape[0])))

#Importance
tmp = lso.coef_.flatten();
indices = np.argsort(np.abs(tmp))[::-1].flatten()
for f in range(len(indices)):
    print("%d. feature %s (%f, %f)" % (f + 1, display_columns[indices[f]], (tmp[indices[f]])**2, 
          tmp[indices[f]]))


In [None]:
v=np.vstack([np.array(clf.predict(X)), np.array(lso.predict(X)), np.array(Y)])
print(v.T)