# Extract features, retrain Sherlock and generate predictions.

### Necessary imports

In [6]:
import pandas as pd
import numpy as np
import sys
import pickle
sys.path.append("..")

In [7]:
from src.features.build_features import build_features
from src.deploy.train_sherlock import train_sherlock
from src.deploy.predict_sherlock import predict_sherlock

### Load a small sample of raw data and the corresponding labels

In [3]:
# Both CSV files are read without a header row; the first column of each
# file is used as the DataFrame index.
data = pd.read_csv(
    '../data/raw/test_values.csv',
    header=None,
    index_col=0,
    sep=',',
)
labs = pd.read_csv(
    '../data/raw/test_labels.csv',
    header=None,
    index_col=0,
    sep=',',
)

In [4]:
# Preview the first five raw value samples.
data.head(n=5)

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
20368,"['Central Missouri', 'unattached', 'unattached..."
664102,"[95, 100, 95, 89, 84, 91, 88, 94, 75, 78, 90, ..."
366813,"['Katie Crews', 'Christian Hiraldo', 'Alex Est..."
530567,"['Christian', 'Non-Christian', 'Unreported', '..."
176253,"['AAF-McQuay Canada Inc.', 'AAF-McQuay Canada ..."


In [5]:
# Preview the first five labels.
labs.head(n=5)

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
20368,affiliation
664102,weight
366813,jockey
530567,religion
176253,company


### Extract features of raw data samples from dataframes and save preprocessed data

In [None]:
# Labels: collapse the label DataFrame into a flat 1-D array.
y_train = labs.values.flatten()

# Features: run the project's feature extraction over the raw samples.
# As the cell output shows, this step downloads the GloVe vectors and the
# trained paragraph-vector model before extracting.
X_train = build_features(data)

print('Extracted features.')

Preparing feature extraction by downloading 2 files: 
 ../src/features/glove.6B.50d.txt and 
 ../src/features/par_vec_trained_400.pkl.docvecs.vectors_docs.npy.
GloVe word embedding vectors were downloaded.
Trained paragraph vector model was downloaded.
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph vectors with vector dimension:  400
Inferring paragraph

In [None]:
# Persist the extracted features to disk as a pickle.
with open('../data/processed/X_train.data', 'wb') as features_file:
    pickle.dump(X_train, features_file)

# Persist the matching labels to disk as a pickle.
with open('../data/processed/y_train.data', 'wb') as labels_file:
    pickle.dump(y_train, labels_file)

In [None]:
# Read the pickled features back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('../data/processed/X_train.data', 'rb') as features_file:
    X_train = pickle.load(features_file)

# Read the pickled labels back from disk.
with open('../data/processed/y_train.data', 'rb') as labels_file:
    y_train = pickle.load(labels_file)

### Train Sherlock on new data

In [None]:
# Retrain the model on the extracted features and labels.
# The same (X_train, y_train) pair is passed twice; presumably the second
# pair is the validation set, so validation here runs on the training data
# itself -- TODO confirm against train_sherlock's signature.
# The final argument is the identifier ('retrain_minimal_sample') that the
# prediction cell passes to predict_sherlock to select this model.
train_sherlock(X_train, y_train, X_train, y_train, 'retrain_minimal_sample')
print('Trained new model.')

### Generate predictions with the retrained model

In [None]:
# Predict labels using the retrained model (with nn_id retrain_minimal_sample)
predicted_labels = predict_sherlock(X_train, 'retrain_minimal_sample')
print('Predicted labels: ', predicted_labels, 'true labels: ', y_train)

### Before running the next cell, first copy the `retrain_minimal_sample` classes file to `classes_sherlock.npy` (the two files should be identical)

In [None]:
# Predict labels using the original model (with nn_id sherlock)
predicted_labels = predict_sherlock(X_train, 'sherlock')
print('Predicted labels: ', predicted_labels, 'true labels: ', y_train)