**Imports**

In [1]:
from imports import * 
from functions import *
from archs import *
from mylearner import *
%matplotlib inline
%load_ext tensorboard

**Loading data**

In [None]:
# Location of the cleaned sensor export and the columns that are read from it.
data_folder = '/scratch/smartairsense/data/'
csv_file = os.path.join(data_folder, 'df_minimal_clean.csv')
sensor_columns = ['humidity_abs', 'temperature', 'tvoc', 'oxygen', 'co2', 'co', 'no2', 'o3']

df = pd.read_csv(csv_file, usecols=sensor_columns, dtype=np.float32)

# Hold out the last 500 rows as a separate test frame (the original author's
# note: to get better visualization on less focused data), then remove those
# same rows from the frame used for training/validation.
holdout_start = df.shape[0] - 500
df_test = df.iloc[holdout_start:]
df = df.drop(index=df.index[holdout_start:])

**Missing values, impute NaN**

In [None]:
# Show where values are missing before any repair is attempted.
plot_missing(df)

# Fill the NaN entries. The return value is not captured, so `impute_NaN` is
# presumably expected to modify `df` in place -- TODO confirm in functions.py.
# Original author's notes: substitute NaN values with the mean; polynomial
# interpolation with degree > 1 uses the index and needs a float dtype.
# NOTE(review): those two notes describe different strategies -- confirm which
# one `impute_NaN` actually applies.
# NOTE(review): `df_test` (held out in the loading cell) is not passed through
# `impute_NaN`; verify it cannot contain NaN before it is windowed and scaled.
impute_NaN(df)

# Renumber the rows 0..n-1 and discard the previous index.
# (A former `df.columns = df.columns` self-assignment was a no-op and has been removed.)
df.reset_index(drop=True, inplace=True)

**Sliding data**

In [None]:
# Read the segmentation settings from the YAML configuration file.
with open('config.yaml') as config_file:
    hyperparams = yaml.load(config_file, Loader=SafeLoader)

segment_cfg = hyperparams['sample_segment']
under_window, seq_len = segment_cfg['under_window'], segment_cfg['seq_len']
stride, sliding_mode = segment_cfg['stride'], segment_cfg['sliding_mode']

# Cut the training/validation frame into segments with the configured
# length, stride and mode.
X = sliding(seq_len, stride, df, mode=sliding_mode)

**splitting and standardization**

In [None]:

# --- Splitting -------------------------------------------------------------
# Only a train/validation split is drawn from X; the test set comes from the
# separately held-out `df_test` below. (To draw the test set from the same
# split instead, pass `test_size=0.1` as well and use `X[splits[2]]`.)
splits = TrainValidTestSplitter(valid_size=0.1)(X[:, 0, 0])

# Materialise each subset once. Indexing X with a split inside the feature
# loop would repeat the same copy for every feature.
X_train_raw = X[splits[0]]
X_valid_raw = X[splits[1]]

# External test set: stride = 1 so that every time point is checked.
x_test_ = sliding(seq_len, 1, df_test, mode=sliding_mode)

# float32 output buffers with the same shapes as the raw subsets.
x_train = np.zeros(X_train_raw.shape, dtype=np.float32)
x_valid = np.zeros(X_valid_raw.shape, dtype=np.float32)
x_test = np.zeros(x_test_.shape, dtype=np.float32)

# --- Standardization ---------------------------------------------------------
# One scaler per index along axis 1 (n_features, per the original author's
# note). Each scaler is fitted on the training subset only and then applied to
# the training, validation and test subsets.
scalers = {}
for i in range(x_train.shape[1]):
    scaler = StandardScaler()
    # Fit on the unique values only, as the samples overlap.
    scaler.fit(np.unique(X_train_raw[:, i, :]).reshape(-1, 1))
    for raw, scaled in ((X_train_raw, x_train), (X_valid_raw, x_valid), (x_test_, x_test)):
        # The scaler expects a single column, so flatten, transform and
        # restore the (samples, last-axis) layout.
        column = raw[:, i, :].reshape(-1, 1)
        scaled[:, i, :] = scaler.transform(column).reshape(scaled.shape[0], scaled.shape[-1])
    scalers[i] = scaler

print(x_train.shape, x_valid.shape, x_test.shape)

**plotting distribution of data**

In [None]:
# Bar chart of how many samples ended up in each subset.
x_lbs = ['train set', 'valid set', 'test set']
y_nmrs = [x_train.shape[0], x_valid.shape[0], x_test.shape[0]]

fig, ax = plt.subplots()
ind = np.arange(len(y_nmrs))  # the x locations for the groups
bars = ax.bar(ind, y_nmrs, color="blue")
ax.set_xticks(ind)
ax.set_xticklabels(x_lbs, minor=False)
# Title typo ('datset') fixed; the Axes API is used throughout instead of
# mixing it with pyplot state calls.
ax.set_title('Distribution of dataset')
ax.set_ylabel('number of samples')
ax.bar_label(bars)  # print the count on top of each bar
plt.show()

**train cnn+lstm autoencoder**

Load hyperparameters

In [2]:
# Re-read the configuration so this cell can be run on its own, then pull the
# training settings out of its 'model' section.
with open('config.yaml') as config_file:
    hyperparams = yaml.load(config_file, Loader=SafeLoader)

model_cfg = hyperparams['model']
epochs, bs, num_workers = (model_cfg[key] for key in ('epochs', 'bs', 'num_workers'))

Training

In [None]:
# Training / validation dataloaders. No batch_tfms are passed here; the arrays
# were already standardized in the splitting cell.
Tsets = TSDatasets(x_train, inplace=True)
Vsets = TSDatasets(x_valid, inplace=True)
dls = TSDataLoaders.from_dsets(Tsets, Vsets, bs=bs, num_workers=num_workers)

# Test dataloader, built from the separately held-out test windows.
Test_set = TSDatasets(x_test, inplace=True)
Test_dls = TSDataLoader(Test_set, bs=bs, num_workers=num_workers)

autoencoder = AutoEncoder(dls.vars)

# Use the epoch count loaded from config.yaml instead of a hard-coded 100, so
# that the configuration file actually controls the length of training.
autoencoder, history = train_autoencoder(
    autoencoder,
    dls.train,
    dls.valid,
    n_epochs=epochs,
)

# Save the trained weights; create the target directory first so that
# torch.save does not fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
torch.save(autoencoder.state_dict(), f'models/{autoencoder._get_name()}.pt')


**plot autoencoder train/valid losses**

In [None]:
# Draw the recorded training and validation loss curves on one set of axes.
loss_fig, loss_ax = plt.subplots()
for history_key, legend_name in (('train', 'train_loss'), ('val', 'valid_loss')):
    loss_ax.plot(history[history_key], label=legend_name)
loss_ax.set_xlabel('epochs')
loss_ax.legend()
plt.show()

**predict for autoencoder**

In [None]:
# Run the trained autoencoder over the test loader and collect the
# per-sample reconstruction losses.
predictions, pred_losses = predict_autoencoder(autoencoder, Test_dls)

plt.figure()
# `sns.distplot` is deprecated; this is the replacement that seaborn documents
# as equivalent (density-normalised histogram with a KDE overlay).
sns.histplot(pred_losses, bins=50, kde=True, stat='density', kde_kws=dict(cut=3))
plt.title('Distribution of reconstruction error of predictions')

# A sample counts as a correct "normal" prediction when its reconstruction
# error does not exceed the threshold.
threshold = 0.2
correct = sum(loss <= threshold for loss in pred_losses)
print(f'Correct normal predictions: {correct}/{x_test.shape[0]}')

**visualizing features in a lower dimensional space [note output of encoder]**

In [None]:
# Pick the device first so the checkpoint can be mapped onto it while loading.
# Without `map_location`, a checkpoint saved from a GPU fails to load on a
# CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebuild the autoencoder and restore the weights saved by the training cell.
pretrained_dict = torch.load(f'models/{AutoEncoder.__name__}.pt', map_location=device)
new_autoencoder = AutoEncoder(x_train.shape[1])
new_autoencoder.load_state_dict(pretrained_dict)

# Keep only the pretrained encoder; its output is used as the feature vector.
enc = new_autoencoder.encoder
new_model = enc  # for encoder_classifier

# Collect the encoder features (and the labels) for every test batch.
new_model.to(device)
new_model.eval()
feats = []
lbls = []
with torch.no_grad():
    # NOTE(review): `Test_set` is built from `x_test` only, without labels --
    # confirm that `Test_dls` really yields (inputs, labels) pairs here.
    for inputs, labels in Test_dls:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = new_model(inputs)
        feats.append(outputs[:, -1])  # last index along dim 1 of the encoder output
        lbls.append(labels)

test_feats = torch.cat(feats).detach().cpu().numpy()
test_lbls = torch.cat(lbls).detach().cpu().numpy()

# Turn the numeric labels into display strings: first label entry -> person
# presence, second label entry -> window state.
person_dict = {1: 'people', 0: 'no person'}
window_dict = {1: 'open', 0: 'closed'}
person_labels = []
window_labels = []
for item in test_lbls:
    person_labels.append(person_dict[item[0]])
    window_labels.append(window_dict[item[1]])

##########
## using tensorboard projector
## NOTE(review): `learn` is not defined anywhere in this notebook; the tags
## below need another name source before this can be re-enabled.
# writer = SummaryWriter('runs/')
# writer.add_embedding(test_feats,metadata=person_labels,tag = f'person_embeddings_{learn.model._get_name()}')
# writer.add_embedding(test_feats,metadata=window_labels,tag = f'window_embeddings_{learn.model._get_name()}')
# writer.close()
##########
### using sklearn and plotly
components = visualize_embeddings(test_feats, person_labels, window_labels, n_components=2, method='pca')
