# Classification Notebook Instructions

In [1]:
# Change working directory to be the project root.
# The `src.*` imports and the relative `data/`, `models/` and `references/`
# paths used in this notebook are resolved against the current directory.
# The move is guarded on the presence of `src` so that re-running this cell
# does not climb one directory higher on every execution.
import os
if not os.path.isdir("src"):
    os.chdir("..")
os.getcwd()

'/Users/aaronquinton/Documents/UBC-MDS/Capstone/BCstats/DSCI_591_capstone-BCStats'

## How to: Train Model on Comment Data

In [3]:
# Import functions
import pandas as pd
import numpy as np
from src.models.wes_classification import model_theme_train

In [4]:
# Load the training comments and reshape them for model training:
# keep the free-text column plus the label columns CPD..OTH, and expose
# the text under the shorter name `comment`.
df = pd.read_csv("data/interim/train_2018-qualitative-data.csv")
df = (
    df[['2018 Comment']]
    .join(df.loc[:, 'CPD':'OTH'])
    .rename(columns={'2018 Comment': 'comment'})
)

# Label matrix: every column from CPD through OTH (inclusive), as an array.
Y = np.array(df.loc[:, "CPD":"OTH"])

In [5]:
# File paths for the pre-trained embeddings.
# Every file sits under the same references directory, so the shared prefix
# is written once and each path only adds its own relative part.
EMBEDDINGS_DIR = "./references/pretrained_embeddings.nosync/"

fname_fasttext_crawl = EMBEDDINGS_DIR + "fasttext/crawl-300d-2M.vec"
fname_w2v_googlenews = EMBEDDINGS_DIR + "GoogleNews-vectors-negative300.bin"
fname_glove_wiki = EMBEDDINGS_DIR + "glove/glove.6B.300d.w2v.txt"
fname_glove_crawl = EMBEDDINGS_DIR + "glove/glove.840B.300d.w2v.txt"

In [None]:
# Train one model per pre-trained embedding.
def _train_with_embedding(name, vectors_path):
    """Call `model_theme_train` on `df.comment` / `Y` for a single embedding.

    `name` is passed as the embedding name and is also inserted into the
    model and tokenizer save paths under `models/`; `vectors_path` is the
    embedding file to use.
    """
    return model_theme_train(
        df.comment, Y,
        embedding_filepath=vectors_path,
        embedding_name=name,
        save_model_filepath='models/keras_model_{}.h5'.format(name),
        save_tokenizer_filepath='models/tokenizer_{}.pickle'.format(name))


# Kept as four separate assignments so each result is bound as soon as its
# own training call returns.
model1 = _train_with_embedding('glove_crawl', fname_glove_crawl)
model2 = _train_with_embedding('glove_wiki', fname_glove_wiki)
model3 = _train_with_embedding('w2v_google_news', fname_w2v_googlenews)
model4 = _train_with_embedding('fasttext_crawl', fname_fasttext_crawl)

## How to: Predict Themes using Trained Model

In [None]:
import pandas as pd
import numpy as np
from src.models.wes_classification import model_theme_predict

In [None]:
# Read in the held-out qualitative comments and format them for prediction.
# Every step works on `df_test` itself: the frame read from the test CSV is
# narrowed to the comment text plus the label columns CPD..OTH, and the text
# column is renamed to `comment`. (Building this from the training frame
# `df` would discard the test data that was just loaded.)
df_test = pd.read_csv('data/interim/test_2018-qualitative-data.csv')
df_test = df_test[['2018 Comment']].join(df_test.loc[:, 'CPD':'OTH'])
df_test = df_test.rename(columns={'2018 Comment': 'comment'})

In [None]:
def _predict_with_embedding(name):
    """Call `model_theme_predict` on `df_test.comment` for one embedding.

    `name` is passed as the embedding name and selects the model and
    tokenizer files under `models/` to load.
    """
    return model_theme_predict(
        df_test.comment,
        embedding_name=name,
        load_model_filepath='models/keras_model_{}.h5'.format(name),
        load_tokenizer_filepath='models/tokenizer_{}.pickle'.format(name))


# One prediction result per embedding-specific model.
Y_pred_glove_crawl = _predict_with_embedding('glove_crawl')
Y_pred_glove_wiki = _predict_with_embedding('glove_wiki')
Y_pred_w2v_google_news = _predict_with_embedding('w2v_google_news')
Y_pred_fasttext_crawl = _predict_with_embedding('fasttext_crawl')

## How to: Evaluate Performance

In [None]:
from src.models.eval import theme_results, investigate_results

In [None]:
# Reference labels for evaluation: every column of the test frame from
# CPD through OTH (inclusive), converted to an array.
test_label_columns = df_test.loc[:, "CPD":"OTH"]
Y_test = np.array(test_label_columns)

In [None]:
# Ensemble: average the predictions of the four embedding-specific models.
ensemble_members = [
    Y_pred_glove_crawl,
    Y_pred_glove_wiki,
    Y_pred_w2v_google_news,
    Y_pred_fasttext_crawl,
]
# Start the sum from the first member so the additions happen in the listed
# order, then divide by the fixed ensemble size of 4.
Y_pred = sum(ensemble_members[1:], ensemble_members[0]) / 4

# The averaged scores are rounded before being compared with Y_test.
# NOTE: np.round rounds exact halves to the nearest even value (0.5 -> 0.0).
theme_results(Y_test, np.round(Y_pred))