In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv # for excel too 
import json 
import random
import numpy as np
import itertools 
import matplotlib as mpl
import sys
import os
import copy 
import pickle 
import scikitplot as skplt # from https://github.com/reiinakano/scikit-plot
from scipy import sparse
# import scipy

# for natural language processing 
import nltk
from nltk.stem.porter import *
# nltk.download("stopwords")
# nltk.download("averaged_perceptron_tagger")
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.preprocessing import StandardScaler 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import mixture, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier # similar to Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix, multilabel_confusion_matrix, \
    classification_report, accuracy_score, plot_roc_curve, RocCurveDisplay

from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
from sklearn.covariance import empirical_covariance

# import word embeddings models 
from transformers import BertTokenizer, BertModel
import torch

# LSTM and CNN *** sharfard paper 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Use to reload changed modules
# %load_ext autoreload
# %autoreload 2

# How to debug
'''After runtime error, open new cell and type %debug and run the cell. 
Opens a command line where you can test your code and inspect all variables right up to the line 
that threw the error.
Type “n” and hit Enter to run the next line of code 
(The → arrow shows you the current position). Use “c” to continue until the next breakpoint. 
“q” quits the debugger and code execution.'''

In [None]:
!python HateSpeechClassifier.py

In [61]:
from Embedding import Embedding, utils_preprocess_text
from Model import Model, MODELS_PATH 

from sklearn.datasets import make_classification
from torch import nn, optim
from torch.autograd import Variable 
import torch.nn.functional as F
from skorch import NeuralNetClassifier

# # take from quickstart: https://skorch.readthedocs.io/en/stable/user/quickstart.html
X_n, y_n = make_classification(1000, 20, n_informative = 10, random_state=0)
# print(X_n)
# print(X_n.shape[0], X_n.shape[1])
# print(y_n) 
# print(y_n.shape[0])
X_n = X_n.astype(np.float32)
y_n = y_n.astype(np.int64)
vocab_size = X_n.shape[1] # len(X_n) 

class LSTM(nn.Module):  
    def __init__(self, dim = 128, num_units = 256, hidden = 128, dropout = 0.2, batch = 1, nonlin = F.relu): # nn.ReLU()): # 300 
        super(LSTM, self).__init__()
        self.num_tokens = num_units # could be X.shape[1] 
        self.lstm_size = dim # X.shape[1] # 128 # input
        self.hidden_size = self.lstm_size # 256 # hidden 
        self.num_layers = 3 # layers
        
        self.lstm = nn.LSTM(
            input_size = self.lstm_size,
            hidden_size = self.hidden_size,
            num_layers = self.num_layers,
            dropout = dropout,
        )
        self.dropout = nn.Dropout(dropout)
        
        self.encoder = nn.Embedding(self.num_tokens, self.lstm_size) # num_token, num_input 
        self.fc = nn.Linear(self.hidden_size, self.num_tokens) # num_hidden, num_token (vocab)
        
        self.init_weights()
        self.batch_size = batch
        
        weight = next(self.parameters())
        self.hidden = (weight.new_zeros(self.num_layers, self.batch_size, self.hidden_size),
                       weight.new_zeros(self.num_layers, self.batch_size, self.hidden_size)) 
        
        # self.dense0 = nn.Linear(self.lstm_size, num_units) # 20, num_units)
        # self.nonlin = nonlin
        # self.dropout = nn.Dropout(dropout)
        # self.dense1 = nn.Linear(num_units, self.lstm_size)
        # self.output = nn.Linear(self.lstm_size, 2)
        # self.softmax = nn.Softmax(dim = -1)

    def forward(self, X): # hidden 
        # X = (self.nonlin(self.dense0(X)))
        # X = (self.dropout(X))
        # X = (F.relu(self.dense1(X)))
        # X = (F.softmax(self.output(X), dim = -1))
        # print(X)
        # return X
        emb = self.dropout(X) 
        hidden = self.hidden
        output, hidden = self.lstm(emb, hidden)
        output = self.dropout(output)
        decoded = self.fc(output)
        self.hidden = hidden 
        return decoded, hidden
    
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)
    
    # def init_hidden(self, bsz):
    #     weight = next(self.parameters())
    #     hidden = (weight.new_zeros(self.num_layers, bsz, self.hidden_size),
    #               weight.new_zeros(self.num_layers, bsz, self.hidden_size))
    #     return (hidden)

# # Old tutorial 
# # https://github.com/skorch-dev/skorch/blob/master/examples/rnn_classifer/RNN_sentiment_classification.ipynbclass LSTM(nn.Module):
model = NeuralNetClassifier(
        LSTM(),
        module__dim = vocab_size, 
        module__hidden = vocab_size, 
        # module__num_units = 128,
        # module__dropout = 0.2,
        max_epochs = 10,
        lr = 0.01,
        # Shuffle training data on each epoch
        iterator_train__shuffle = True,
        # device=('cuda' if USE_CUDA else 'cpu'),
    )
# batch_size = 1
# model.init_hidden(batch_size) 
    
if cross_validate: 
    model.set_params(module__hidden = vocab_size, verbose = 0)
    params = {
        'lr': [0.05, 0.09, 0.1],
        'max_epochs': [20], # [10, 20]
        'module__num_units': [128] # , 256, 300]
    }
    gs = GridSearchCV(model, params, cv = 5, verbose = 2) #refit = False, cv = 5, scoring = 'accuracy', verbose = 2)
    gs.fit(X_n, y_n)
    print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ......lr=0.05, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.05, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.05, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.05, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.05, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.09, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.09, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.09, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.09, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END ......lr=0.09, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END .......lr=0.1, max_epochs=20, module__num_units=128; total time=   0.0s
[CV] END .......lr=0.1, max_epochs=20, module__nu

15 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\trice\Twitter_Counter_Narrative\tcn_venv\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\trice\Twitter_Counter_Narrative\tcn_venv\lib\site-packages\skorch\classifier.py", line 141, in fit
    return super(NeuralNetClassifier, self).fit(X, y, **fit_params)
  File "C:\Users\trice\Twitter_Counter_Narrative\tcn_venv\lib\site-packages\skorch\net.py", line 1215, in fit
    self.partial_fit(X, y, **fit_params)
  File "C:\Users\trice\Twitter_Counter_Narrative\tcn_ven

RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors

In [None]:

REPORTS_PATH = 'Reports/'
SEED = 42 # Mo - maybe we should  try and use the same seed number throughout the code? - was 100
TARGET_TYPES = ['Disabled', 'Jews', 'LGBT+', 'Migrants', 'Muslims', 'POC', 'Women', 'Other/Mixed', 'None']

processed_data = pd.read_csv('Data/processed_combined_data.csv', index_col = False) # ignore index column         
    # print(processed_data)
texts = processed_data['HATE_SPEECH']
class_labels = processed_data['CLASS']

# print(texts, class_labels)
debug = True 
train_test = True
embedding_types = ['tfidf', 'word2vec', 'doc2vec', 'glove', 'bert']
model_types = ['LR', 'RF', 'NB', 'DT', 'XGB', 'SVC', 'LSTM']
model_type = "LSTM" 

with_stopwords = False
weighting_type = 'equal'
dimensions = 300
scoring = ['accuracy', 'balanced_accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'] 

# Add scoring for hyperparameter tuning 
# Final cross validation dictionary/dataframe comparison 
model_dfs = [] #  {} 

# remove stopwords if necessary
if not with_stopwords:
    stop_words = stopwords.words('english')
    texts = [t for t in texts if not t in stop_words]

if train_test: 
    X_train_text, X_test_text, y_train, y_test = train_test_split(texts, class_labels, 
                                                                  random_state = SEED, test_size = 0.2)
else: 
    X_train_text = texts # vectorize texts for 100% data for train
    y_train = class_labels # class_labels for 100% data for train

for embedding_type in embedding_types:
    if debug:
        print('----------------------------------------')
        print(embedding_type)

    # get embeddings for input text
    embedding = Embedding(embedding_type, 
                          with_stopwords = with_stopwords, 
                          weighting = weighting_type,
                          dimensions = dimensions)

    embedding_filename = embedding.get_filename()

    # vectorize train and test set
    X = embedding.vectorize(X_train_text, load_train = True) 
    y = y_train  
    # print(X.shape[0])
    # print(y.shape[0]) 
    
    X = X.astype(np.float32)
    y = y.astype(np.int64)
    vocab_size = X.shape[1]
    
    # print(X)
    # print(X.shape)
    # print(y) 
    # print(y.shape) 
    # X = np.reshape(X, (X.shape[1], X.shape[0]))
    # print(X)
    # print(X.shape) 

    # might need to save tfidf vectorizer and matrix for later use
    # embedding.save(train_test_split=True)
    if train_test: 
        X_test = embedding.vectorize(X_test_text, unseen = True, load_test = True)
        # embedding.save(train_test_split=True, save_test=True)

    model_params = {
                    'random_state' : SEED 
                   }
    # model_num = 0
    debug = debug  
    output = True 
    save = True # False 
    num = True
    cross_validate = True # whether to fine tune the hyper parameters of the model with a cross validator 
    outfile = ""     
    model_name = model_type + "_" + embedding_filename 
    if debug:
        print('----------------------------------------')
        print(model_type)
        print('----------------------------------------')
        print("Filename ")
        print("", model_name) 
        
    model = NeuralNetClassifier(
        LSTM(),
        module__dim = vocab_size, 
        # module__num_units = 128,
        # module__dropout = 0.2,
        max_epochs = 10,
        lr = 0.01,
        # Shuffle training data on each epoch
        iterator_train__shuffle = True,
    )
    
    if cross_validate: 
        model.set_params(verbose = 0)
        params_grid = {
            'lr': [0.01, 0.02, 0.05],
            'max_epochs': [20], # [10, 20]
            'module__num_units': [128] # , 256, 300]
        }
        gs = GridSearchCV(model, params_grid, cv = 5, verbose = 2) #refit = False, cv = 5, scoring = 'accuracy', verbose = 2)
        # print(y.dtype)
        # y = y.astype(np.float64)
        # print(y.dtype) 
        gs.fit(X, y)
        print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))
        
    # model.fit(X, y)
    # y_proba = model.predict_proba(X)
    # y_proba = gs.predict_proba(X) 
    # print(y_proba)
    
    # model = NeuralNet(model_type, X = X, y = y, debug = debug, 
    #               model_params = model_params, filename = model_name) #max_iter=2000) 
    
    # LSTM(X, y, debug = debug) #max_iter=2000)
#     mm = MinMaxScaler()
#     ss = StandardScaler()
#     X_ss = ss.fit_transform(X)
#     y_mm = mm.fit_transform(y) 
    
#     X_train = X_ss[:int(0.8 * len(X)), :]
#     X_test = X_ss[int(0.8 * len(X)):, :]
#     y_train = y_mm[:int(0.8 * len(y)), :] 
#     y_test = y_mm[int(0.8 * len(X)):, :]
#     batches = [batch_size] * len(X) 
#     train(model)    
    # model.fit(X, y) 
                
    # y_preds = model.predict(X_test)
    # print(pd.concat[y_test, y_preds]) 
#     cls_report = model.classification_report(y_test, y_preds, output = output, save = save, 
#                                              path = REPORTS_PATH) #, path = reports_path)
#     acc_report = model.accuracy_score(y_test, y_preds, path = REPORTS_PATH)
#     conf_m = model.confusion_matrix(y_test, y_preds, output = output, save = save, 
#                                     path = REPORTS_PATH) #, path = reports_path)

#     model.heatmap(save = save, num = num, path = REPORTS_PATH) # path = reports_path)
#     model.roc_curve(X_test, y_test, save = save, path = REPORTS_PATH) # path = reports_path)
#     # model.report_results(output = output, save = save, path = REPORTS_PATH) # path = reports_path) 
#     model.save_model(MODELS_PATH)

    # print("Cross Validation Evaluation: ") 
    # print("Cross validator: ", model.CV) 
    # print("Old estimator: ", model.untuned_model) 
    # print("Best estimator: ", model.model)
    # print("Best parameters: ", model.model_params) 
    # print("Cross_validation report comparison: ")
    # print(model.cv_report) 

In [None]:
from Embedding import Embedding, utils_preprocess_text
from Model import Model, MODELS_PATH 
from torch import nn, optim
from torch.autograd import Variable 

# taken from tutorial: https://closeheat.com/blog/pytorch-lstm-text-generation-tutorial
from sklearn.datasets import make_classification
from torch import nn
from skorch import NeuralNetClassifier

class LSTM(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken = X.shape[1], num_units = 128, 
                 nhid = 128, nlayers = 3, dropout=0.5, tie_weights=False):
        super(LSTM, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, num_units)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(num_units, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(num_units, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != num_units:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

## Define Paths

In [None]:
conan_path = 'CONAN-master/Multitarget-CONAN/'
conan_file = 'Multitarget-CONAN.json'
conan_p = conan_path + conan_file

davidson_path = 'hate-speech-and-offensive-language-master/data/'
davidson_f = 'labeled_data.csv' 
davidson_p = davidson_path + davidson_f

combined_path = ''
combined_f = 'combined_dataset.csv'
combined_p = combined_path + combined_f

reports_path = 'classification_reports/'
data_path = 'datasets/'
model_path = 'models/'
embed_path = 'embeds/'

# command to activate virtual environment on Windows 
# cd into src folder 
# run Scripts\activate.bat 

# list of classifiers to test 
classifiers = ["Logistic Regression", "Random Forest", "Decision Trees", "XGBoost", "SVM", "Naive Bayes"]

# create lists of names for loading models 
class_list = [str.lower(clsf).replace(" ", "_") + ".pkl" for clsf in classifiers]# ['random_forest.pkl', 'svm.pkl']
class_model_list =  [str.lower(clsf).replace(" ", "_") + "_model.pkl" for clsf in classifiers] # ['random_forest_model.pkl, svm_model.pkl']
# print(class_list)
# print(class_model_list)

## Multi-class Hate Speech Classifier

In [None]:
class Classifier: # args are unnamed parameters (makes a list), and kwargs are named parameters (makes a dictionary)
    def __init__(self, labels, *args, **kwargs): # X, y, **kwargs):   
        self.labels = labels # enter a list of the class labels you want to output for the graphs
        self.X = kwargs.get('X', None) # input independent variable data
        self.y = kwargs.get('y', None) # target corresponding class variables with X
        name_regex = '([a-zA-Z]*)(\s*)([a-zA-Z]*)'
        # determine generic classifier using this as a named parameter  
        clsf = kwargs.pop('classifier', '') 
        normalize = lambda name: (name.group(1).strip().capitalize() + " " + name.group(3).strip().capitalize())
        # formats all variations of name for classifer as 'Upper Case' 
        self.model_name = re.sub(name_regex, normalize, clsf).strip() # or re.sub(name_regex, normalize, kwargs.pop('model', ''))

        # get model parameters (hyper parameters) as a named parameter 
        self.model_params = kwargs.get('model_params', None) # == 'model_params'
        # selects normal model given model_name 
        if self.model_params is None:
            self.model = LogisticRegression(random_state = 0, solver = 'lbfgs', warm_start = True) if self.model_name == "Logistic Regression" else \
                RandomForestClassifier(random_state = 0) if self.model_name == "Random Forest" else \
                GaussianNB() if self.model_name == "Naive Bayes" else \
                DecisionTreeClassifier(random_state = 0) if self.model_name == "Decision Trees" else \
                XGBClassifier(random_state = 0) if self.model_name == "Xgboost" else \
                SVC(random_state = 0, probability = True) # if self.model_name == "SVM" else None
        else:
            self.model = LogisticRegression(random_state = 0, **(self.model_params)) if self.model_name == "Logistic Regression" else \
                RandomForestClassifier(random_state = 0, **(self.model_params)) if self.model_name == "Random Forest" else \
                GaussianNB(**(self.model_params)) if self.model_name == "Naive Bayes" else \
                DecisionTreeClassifier(random_state = 0, **(self.model_params)) if self.model_name == "Decision Trees" else \
                XGBClassifier(random_state = 0, **(self.model_params)) if self.model_name == "Xgboost" else \
                SVC(random_state = 0, probability = True, **(self.model_params)) # if self.model_name == "SVM" else None
        
        # if passing model as named parameter 
        if self.model is None:
            self.model = kwargs.get('model', None) # if self.model_name == "SVM" else None
            
        # get Cross Validator object as a named parameter 
        self.CV = kwargs.get('cv', None) # == 'cv'
        print("cross validator", self.CV)
        # get cv parameter (parameters) 
        self.cv_params = kwargs.get('cv_params', None) # == 'cv_params'
        # print("cross validator parameters", self.cv_params)
        # print(self.cv_params) 
        
        # Retain model evaluation metrics
        self.preds = None
        self.cls_report = None 
        self.acc_report = None 
        self.train_score = None
        self.test_score = None 
        self.conf_m = None  # confusion matrix for test data 
        # self.features_n = len(tfidf_vectorizer.vocabulary_)
        self.filename = (self.model_name.lower().replace(" ", "_"))  
        self.old_model = None 
        if self.X is None: 
            self.X = args[0]
        if self.y is None: 
            self.y = args[1] 
        if self.CV is not None and self.cv_params is not None: 
            self.CV, self.old_model, self.model = self.cross_validation(*args, **kwargs) 
            # model = self.model, model_params = self.model_params, 
                              # cv = self.CV, cv_params = self.cv_params)
      
    # Defines cross_validation function to test modified models
    def cross_validation(self, *args, **kwargs): 
        model = self.model if self.model is not None else kwargs.pop('model', None)
        model_params = self.model_params if self.model_params is not None else kwargs.pop('model_params', None)
        cv = self.CV if self.CV is not None else kwargs.pop('cv', None)
        cv_params = self.cv_params if self.cv_params is not None else kwargs.pop('cv_params', None)
        debug = kwargs.pop('debug', False) 
        
        cv_args = []
        cv_kwargs = cv_params
            
        if debug: 
            print("cross_validation")
            print("model: ", self.model_name)
            print("args", args)
            print("kwargs:", kwargs)
            print("model params:", model_params) 
            print("cv params:", cv_params)
            print("param_grid:", param_grid)
            print("cv_args", cv_args)
            print("cv_kwargs", cv_kwargs)
        
        self.CV = cv(model, **cv_kwargs) # *args, **default_args) # create cross validator model 
        if debug: 
            print(model)
            print(self.CV) 
        self.CV.fit(self.X, self.y)
        self.model = self.CV.best_estimator_  # replace old model with new best model 
        return self.CV, model, self.model 
    
    def set_params(self, *args, **kwargs):
        self.labels = kwargs.pop('labels', '') # should be labels 
        
        name_regex = '([a-zA-Z]*)(\s*)([a-zA-Z]*)'
        clsf = kwargs.pop('classifier', '')
        normalize = lambda name: (name.group(1).strip().capitalize() + " " + name.group(3).strip().capitalize())
        self.model_name = re.sub(name_regex, normalize, clsf).strip() # or re.sub(name_regex, normalize, kwargs.pop('model', ''))
        self.model = LogisticRegression(random_state = 0) if self.model_name == "Logistic Regression" else \
            RandomForestClassifier(random_state = 0) if self.model_name == "Random Forest" else \
            GaussianNB() if self.model_name == "Naive Bayes" else \
            DecisionTreeClassifier(random_state = 0) if self.model_name == "Decision Trees" else \
            XGBClassifier(random_state = 0) if self.model_name == "Xgboost" else \
            SVC(random_state = 0, probability = True) 
        if self.model is None:
            self.model = kwargs.pop('model', None) # if self.model_name == "SVM" else None
        self.model_params = kwargs.pop('model_params', None) # == 'model_params'
        self.CV = kwargs.pop('cv', None) # == 'cv'
        self.cv_params = kwargs.pop('cv_params', None) # == 'cv_params'
        
        self.preds = kwargs.pop('preds', None)
        self.score = kwargs.pop('score', None)
        self.cls_report = kwargs.pop('cls_report', None) 
        self.acc_report = kwargs.pop('acc_report', None)
        self.conf_m = None 
        self.filename = (self.model_name.lower().replace(" ", "_")) 
        
    # Use this function with a model object and set parameters for that model from its scikit-learn documentation     
    def set_model_params(self, *args, **kwargs): 
        self.model = LogisticRegression(args, kwargs, random_state = 0) if (self.model_name == "Logistic Regression") else \
            RandomForestClassifier(args, kwargs, random_state = 0) if (self.model_name == "Random Forest") else \
            GaussianNB(args, kwargs) if self.model_name == "Naive Bayes" else \
            DecisionTreeClassifier(args, kwargs, random_state = 0) if self.model_name == "Decision Trees" else \
            XGBClassifier(args, kwargs, random_state = 0) if self.model_name == "Xgboost" else \
            SVC(args, kwargs, random_state = 0) # if self.model_name == "SVM" else None
        if self.model is None:
            self.model = (kwargs.pop('model', None))(args, kwargs) # if self.model_name == "SVM" else None
  
    def get_model(self): # return current model instance 
        return self.model
    
    def load(self, infile = "", path = ""):  # load Classifier object and model from pkl
        if infile is "": 
            with open(path + self.filename + ".pkl", "rb") as file: # read byte 
                self = pickle.load(file)
        else: 
            with open(infile, "rb") as file: # read byte 
                self = pickle.load(file)
        return self # Classifier(labels, model) 
    
    def load_model(self, infile = "", path = ""): 
        if infile is "": 
            with open(path + self.filename + "_model.pkl", "rb") as file: # read byte 
                self.model = pickle.load(file)
        else: 
            with open(infile, "rb") as file: # read byte 
                self.model = pickle.load(file)
        return self.model # Classifier(labels, model) 
    
    def save(self, path = "", outfile = ""):
        if outfile is "": 
            with open(path + self.filename + ".pkl", "wb") as file: # write byte 
                pickle.dump(self, file)
        else: 
            with open(outfile, "wb") as file: # write byte 
                pickle.dump(self, file)
        return self 
    
    def save_model(self, path = "", outfile = ""):
        if outfile is "": 
            with open(path + self.filename + "_model.pkl", "wb") as file: # write byte 
                pickle.dump(self.model, file)
        else: 
            with open(outfile, "wb") as file: # write byte 
                pickle.dump(self.model, file)
        return self.model 
        
    def fit(self, X_train, y_train):  # returns model object as well 
        self.model.fit(X_train, y_train)
        return self.model 
        
    def predict(self, X_test): 
        self.preds = self.model.predict(X_test)
        return self.preds
    
    def predict_proba(self, X_test):
        self.preds_proba = self.model.predict_proba(X_test)
        return self.preds_proba
    
    def score(self, X_test, y_test): 
        self.score = self.model.score(X_test, y_test) 
        return self.score
    
    def report_results(self, y_test = None, preds = None, output = True, save = False, path = "", outfile = ""): 
        cls_report = self.cls_report if (y_test is None) and (preds is None) else self.classification_report(y_test, preds, output = output) 
        acc_report = self.acc_report if (y_test is None) and (preds is None) else self.accuracy_score(y_test, preds, output = output)
        conf_m = np.array_str(self.conf_m, precision = 3) if (y_test is None) and (preds is None) \
        else np.array_str(self.confusion_matrix(y_test, preds, output = output), precision = 3) 
    
        report = cls_report + "\n" + acc_report + "\n" + conf_m

        if save: 
            if outfile is "": 
                # report.to_csv(path + self.filename + "_results.csv", sep = ",", index = False, encoding = 'utf8')
                with open(path + self.filename + "_results.txt", "w+", encoding = "utf8") as file: # write byte 
                    file.write(report)
            else: 
                # report.to_csv(outfile, sep = ",", index = False, encoding = 'utf8')
                with open(outfile, "w+", encoding = "utf8") as file: # write byte 
                    file.write(report)
    
    def classification_report(self, y_test, preds, output = True, save = False, path = "", outfile = ""): 
        self.cls_report = classification_report(y_test, preds)
        if output: 
            print(self.cls_report)
        if save: 
            report = classification_report(y_test, preds, output_dict = save)
            report = pd.DataFrame(report).transpose()
            if outfile is "": 
                report.to_csv(path + self.filename + "_clsf_report.csv", sep = ",", index = False, encoding = 'utf8')
                # with open(path + self.filename + ".csv", encoding = "utf8") as file: # write byte 
                    # file.write(report)
            else: 
                report.to_csv(outfile, sep = ",", index = False, encoding = 'utf8')
                # with open(outfile, encoding = "utf8") as file: # write byte 
                    # file.write(report) 
        return self.cls_report 
            
    def accuracy_score(self, y_test, preds, output = True, save = False, path = "", outfile = ""): 
        self.train_score = self.model.score(X_train, y_train)
        self.test_score = accuracy_score(y_test, preds)
        report = '{} Train accuracy {:.3f}%'.format(self.model_name, self.train_score * 100) + '\n' \
            + '{} Test accuracy {:.3f}%'.format(self.model_name, self.test_score * 100) + '\n'
        self.acc_report = report 
        if output: 
            print(report) 
        if save: 
            if outfile is "": 
                # report.to_csv(path + self.filename + "acc_report.csv", sep = ",", index = False, encoding = 'utf8')
                with open(path + self.filename + "_acc_report.txt", "w+", encoding = "utf8") as file: # write byte 
                    file.write(report)
            else: 
                # report.to_csv(outfile, sep = ",", index = False, encoding = 'utf8')
                with open(outfile, "w+", encoding = "utf8") as file: # write byte 
                    file.write(report) 
        return self.acc_report   
        
    def confusion_matrix(self, y_test, preds, output = True, save = False, path = "", outfile = ""): 
        self.conf_m = confusion_matrix(y_test, preds)
        if output:  
            print('Confusion matrix: ')
            print(self.conf_m)
        
        if save: 
            if outfile is "": 
                # report.to_csv(path + self.filename + "conf_matrix.csv", sep = ",", index = False, encoding = 'utf8')
                with open(path + self.filename + "_conf_matrix.txt", "w+", encoding = "utf8") as file: # write byte 
                    file.write(str(self.conf_m))
            else: 
                # report.to_csv(outfile, sep = ",", index = False, encoding = 'utf8')
                with open(outfile, "w+", encoding = "utf8") as file: # write byte 
                    file.write(str(self.conf_m)) 
        return self.conf_m
    
    def heatmap(self, conf_m = None, labels = None, save = False, num = False, path = "", outfile = "", model_num = ""):  # draw confusion matrix 
        conf_m = self.conf_m if conf_m is None else conf_m 
        labels = self.labels if labels is None else labels 
        
        size = len(conf_m)
        matrix = np.zeros((size, size))
        for i in range(0, size):
            matrix[i, :] = (conf_m[i, :])/(float(conf_m[i,:].sum())) # calculate percentage matrix of true positives
            
        conf_df = pd.DataFrame(matrix, index = labels, columns = labels)
        plt.figure(figsize=(size * 1.5, size * 1.25))
        
        sns.heatmap(conf_df, annot = True, annot_kws = {"size": size * 1.33},
                        cmap = "YlGnBu", # 'gist_gray_r', 
                    cbar = False, square = True, fmt = '.2f')
        
        plt.ylabel('True categories', fontsize = size * 1.5)
        plt.xlabel('Predicted categories', fontsize = size * 1.5)
        plt.tick_params(labelsize = size * 1.33)
        
        if save:
            f_name = self.filename + "_heatmap" # "_".join(self.model_name.lower())
            
            if outfile is "":
                plt.savefig(path + f_name + ".pdf") # '.pdf')
            else: 
                plt.savefig(outfile)
        
        # second heatmap with category numbers rather than percentile 
        if num: 
            conf_df = pd.DataFrame(conf_m, index = labels, columns = labels)
            plt.figure(figsize = (size * 1.5, size * 1.25))
            sns.heatmap(conf_df, annot = True, annot_kws = {"size": size * 1.33},
                        cmap = "YlGnBu", # 'gist_gray_r', 
                    cbar = False, square = True, fmt = '5d')
            
            plt.ylabel('True categories', fontsize = size * 1.5)
            plt.xlabel('Predicted categories', fontsize = size * 1.5)
            plt.tick_params(labelsize = size * 1.33)
            if save:
                f_name = self.filename + "_heatmap_num" # "_".join(self.model_name.lower())
            
                if outfile is "":
                    plt.savefig(path + f_name + str(model_num) +  ".pdf") # + "_num.pdf") # '.pdf')
                else:
                    print(outfile[:-5] + str(model_num) + ".pdf")
                    plt.savefig(outfile[:-5] + str(model_num) + ".pdf") # '.pdf')

                
    def roc_curve(self, X_test, y_test, save = False, path = "", outfile = ""):
        size = len(self.labels) # compare to number of classes to format and size output 
        y_probs = self.model.predict_proba(X_test)
        skplt.metrics.plot_roc(y_test, y_probs, figsize = (size * 1.5, size * 1.25), title_fontsize = size * 1.75, 
                               text_fontsize = size * 1.5)
           
        # plt.show()
        # plt.ylabel('True positive rate', fontsize = size * 1.5)
        # plt.xlabel('False positive rate', fontsize = size * 1.5)
        # plt.tick_params(labelsize = size * 1.33)
        
        if save:
            f_name = self.filename + "_roc_curves" # "_".join(self.model_name.lower())
            if outfile is "":
                plt.savefig(path + f_name + ".pdf") # '.pdf')
            else: 
                plt.savefig(outfile)
        
# Report classifications to file  
# Be able to allow Lesh and Kimberly to train  and get a predictive value (and also true, false positive rates 
# and recall)

In [None]:
with open(data_path + "combined_data.csv", "r", encoding = "utf8") as file: 
    combined_hate_df = pd.read_csv(file)
    file.close() 
    
with open(data_path + "combined_class.csv", "r", encoding = "utf8") as file: 
    combined_tar_df = pd.read_csv(file)
    file.close() 
    
# with open(data_path + "combined_labels.csv", "r", encoding = "utf8") as file: 
#     label_names = pd.read_csv(file) 
#     file.close()
    
# display(combined_hate_df)
# display(combined_tar_df)
combined_hate_list = combined_hate_df.iloc[:, 0] 
# display(combined_hate_list) 
# combined_hate_df 
combined_tar_list = combined_tar_df.iloc[:, 0]
# display(combined_tar_list)
# label_names = list(label_names)
label_names = ['Disabled', 'Jews', 'LGBT+', 'Migrants', 'Muslims', 'POC', 'Women', 'Other/Mixed', 'None']
# labels = label_names

In [None]:
# min_tar_orig = combined_tar_list
# labels = np.unique(min_tar_orig)

# display(labels)
display(label_names)

## Embeddings

### Create tfidf vectorizer embeddings

In [None]:
#TF-IDF Features-F1
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# max and min are cutoffs for document frequency 

# Word Embeddings and Feature Selection 
# tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
#                                    max_df = 0.75, min_df = 5, # 0.7 - 1.0 for max_df takes care of stop words
#                                    max_features = 10000)
# # TF-IDF feature matrix
# docs = tfidf_vectorizer.fit_transform(combined_hate_list)
# features = tfidf_vectorizer.get_feature_names() # _out(input_features = None)
# print(len(features))

# Not necessary with max_df 
# stop_words = tfidf_vectorizer.get_stop_words()
# print(stop_words)

# embed_params = tfidf_vectorizer.get_params()

# with open(embed_path + "tfidf_vectorizer.pkl", "wb") as file: # write byte 
#     pickle.dump(save_tfidf_vectorizer, file)
#     file.close() 
  
# print(embed_params)
# display(docs)
# display(tfidf_vectorizer.vocabulary_)
# display(tfidf_vectorizer.idf_)

# encoded vector
# display(docs.shape)
# display(docs.toarray())

### Load tfidf vectorizer embedding

In [None]:
with open(embed_path + "tfidf_vectorizer.pkl", "rb") as file: # write byte 
    tfidf_vectorizer = pickle.load(file)
    file.close() 
    
# TF-IDF feature matrix
docs = tfidf_vectorizer.fit_transform(combined_hate_list)
features = tfidf_vectorizer.get_feature_names() # _out(input_features = None)
print(len(features))

# Not necessary with max_df 
# stop_words = tfidf_vectorizer.get_stop_words()
# print(stop_words)
embed_params = tfidf_vectorizer.get_params()

In [None]:
skplt

# Fit, predict, and hyperparameter tune models with Cross Validation

#### Runs Logistic Regression, Random Forest, Decision Trees, XGBoost, SVM, and Naive Bayes algorithms in that order

In [None]:
# classifiers = ["Logistic Regression", "Random Forest", "Decision Trees", "XGBoost", "SVM", "Naive Bayes"]
# reports_path = 'classification_reports/'
# data_path = 'datasets/'
# model_path = 'models/'
# embed_path = 'embeds/' 


# def confusion_matrix_scorer(clf, X_train, y_train):
#     y_preds = clf.predict(X_train)
#     conf_m = confusion_matrix(y_train, y_preds)
#     return {'true negative': conf_m[0, 0], 'false positive': conf_m[0, 1],
#             'false negative': conf_m[1, 0], 'true positive': conf_m[1, 1]}

model_num = 0 

output = True 
save = True # False 
num = True

cross_validate = False # whether to fine tune the hyper parameters of the model with a cross validator object 
debug = False
outfile = ""


cv = None 
X = docs
y = combined_tar_list

for i in range((len(classifiers))): 
    if cross_validate: 
        cv = GridSearchCV # almost always uses StratifiedKFold with 5 splits 
        cv_params = None # [[{'random_state' : 0}]]
        
    if (classifiers[i] == "Logistic Regression"): 
        model_params = None
        if cross_validate: 
            model_params = {'max_iter': 1000 # control number of iterations for regression convergence 
                           }
            solvers = ['newton-cg', 'lbfgs', 'liblinear']
            penalty = ['l2']
            C = [100, 10, 1.0, 0.1, 0.01]
            # define grid search
            param_grid = {
                          'penalty': penalty,
                          'C': c_values,
                          'solver': solvers, # penalty, C, solver is order 
                         }
            
            cv_params = {'param_grid': param_grid, 
                         # 'scorer': confusion_matrix_scorer, 
                         'verbose': 3
                        } # [param_grid, {'verbose': 3}]
    elif (classifiers[i] == "Random Forest"):  # do with randomized search 
        model_params = None
        if cross_validate: 
            cv = RandomizedSearchCV # almost always uses StratifiedKFold with 5 splits 
            # Number of trees in random forest
            n_estimators = [int(x) for x in np.linspace(start = 800, stop = 2000, num = 3)] # [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
            # Number of features to consider at every split
            max_features = ['auto', 'sqrt']
            # Maximum number of levels in tree
            max_depth = [int(x) for x in np.linspace(10, 110, num = 3)] # [int(x) for x in np.linspace(10, 110, num = 11)]
            max_depth.append(None)
            # Minimum number of samples required to split a node
            min_samples_split = [2, 5, 10]
            # Minimum number of samples required at each leaf node
            min_samples_leaf = [1, 2, 4]
            bootstrap = [True, False]
            # param_grid = {'base_estimator__max_depth': [2, 4, 6, 8]}
            model_params = None
            # Create the param grid
            param_grid = { 
                           'n_estimators': n_estimators,
                           'max_features': max_features,
                           'max_depth': max_depth,
                           'min_samples_split': min_samples_split,
                           'min_samples_leaf': min_samples_leaf,
                           'bootstrap': bootstrap # Method of selecting samples for training each tree
                         }
            cv_params = {'param_distributions': param_grid, 
                         # 'scorer': confusion_matrix_scorer, 
                         'verbose': 2
                        } # [param_grid, {'verbose': 3}]
    elif (classifiers[i] == "Decision Trees"): 
        model_params = None
        if cross_validate: 
            # choose function to measure quality of node split 
            criterion = ["gini", "entropy"]
            model_params = None
            param_grid = {'criterion' : criterion,
                          'max_features': max_features,
                          'max_depth': max_depth,
                          'min_samples_split': min_samples_split,
                          'min_samples_leaf': min_samples_leaf,
                         }
            cv_params = {'param_grid': param_grid, 
                         # 'scorer': confusion_matrix_scorer, 
                         'verbose': 2
                        } # [param_grid, {'verbose': 3}]
    elif (classifiers[i] == "XGBoost"): 
        model_params = None
        if cross_validate: 
            cv = RandomizedSearchCV
            param_grid = {'objective': ['reg:squarederror', 'reg:squaredlogerror'],
                          'max_depth': [3, 6, 10],
                          'learning_rate': [0.01, 0.05, 0.1],
                          'n_estimators': [100, 500, 1000],
                          'colsample_bytree': [0.3, 0.7]
                          }        
            cv_params = {'param_distributions': param_grid, 
                         # 'scorer': confusion_matrix_scorer, 
                         'verbose': 2
                        } # [param_grid, {'verbose': 3}]
    elif (classifiers[i] == "SVM"):  # Support Vector Machine
        model_params = None
        if cross_validate: 
            cv = GridSearchCV # RandomizedSearchCV
            param_grid = { 
                           'C': [0.1, 1, 10, 100], 
                           'gamma': [1 , 0.1, 0.01, 0.001]
                         } 
            
            cv_params = {'param_grid': param_grid, 
                         # 'scorer': confusion_matrix_scorer, 
                         # 'refit': True, 
                         'probability': True, 
                         'verbose': 2
                        } # [param_grid, {'verbose': 3}]
    elif (classifiers[i] == "Naive Bayes"):
        X = X.toarray()
        model_params = None
        if cross_validate: 
            param_grid = {'var_smoothing': np.logspace(0, -9, num = 100)
        # np.<a onclick="parent.postMessage({'referent':'.numpy.logspace'}, '*')">logspace(0,-9, num=100)}
                         }     
            cv_params = {'param_grid': param_grid, 
                         # 'scorer': confusion_matrix_scorer, 
                         'verbose': 3
                        } # [param_grid, {'verbose': 3}]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2) 
    
    if cross_validate:
        model = Classifier(labels, X_train, y_train, debug = debug, classifier = classifiers[i], model_params = model_params,
                           cv = cv, cv_params = cv_params) 
    else: 
        model = Classifier(labels, X_train, y_train, debug = debug, classifier = classifiers[i], model_params = model_params) 

    # Example results 
    cv = model.CV
    if outfile is "" and cross_validate: 
        filename = model.filename
        # report.to_csv(path + self.filename + "conf_matrix.csv", sep = ",", index = False, encoding = 'utf8')
        cv_path = filename + "_cross_validation_scoring.txt" # "_cross_validation_scoring_" + str(model_num) + ".pdf"
        with open(reports_path + cv_path, "w+", encoding = "utf8") as file: # write byte 
            file.write(classifiers[i] + " Best: %f using %s\n" % (cv.best_score_, cv.best_params_))
            # print(classifiers[i], "Best: %f using %s" % (cv.best_score_, cv.best_params_))
            means = cv.cv_results_['mean_test_score']
            stds = cv.cv_results_['std_test_score']
            params = cv.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                file.write("%f (%f) with: %r\n" % (mean, stdev, param))
                # print("%f (%f) with: %r" % (mean, stdev, param))
    elif outfile is not "": 
        # report.to_csv(outfile, sep = ",", index = False, encoding = 'utf8')
        with open(outfile, "w+", encoding = "utf8") as file: # write byte 
            file.write(classifiers[i], "Best: %f using %s\n" % (cv.best_score_, cv.best_params_))
            # print(classifiers[i], "Best: %f using %s" % (cv.best_score_, cv.best_params_))
            means = cv.cv_results_['mean_test_score']
            stds = cv.cv_results_['std_test_score']
            params = cv.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                file.write("%f (%f) with: %r\n" % (mean, stdev, param))
                # print("%f (%f) with: %r" % (mean, stdev, param))

    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)
    # y_probs = model.predict_proba(X_test)
    
    old_path = (classifiers[i].lower().replace(" ", "_")) + "_clsf_report_" + str(model_num) + ".txt"
    old_heatmap_path = (classifiers[i].lower().replace(" ", "_")) + "_heat_map_" + str(model_num) + ".pdf"
    old_roc_path = (classifiers[i].lower().replace(" ", "_")) + "_roc_curves_" + str(model_num) + ".pdf"
    # print(old_path)
    # print(old_heatmap_path)
    # print(old_roc_path)
    cls_report = model.classification_report(y_test, y_preds, output = output, save = save, 
                                             outfile = reports_path + old_path) #, path = reports_path)
    
    old_path = (classifiers[i].lower().replace(" ", "_")) + "_acc_report_" + str(model_num) + ".txt"
    acc_report = model.accuracy_score(y_test, y_preds, outfile = reports_path + old_path)
    
    old_path = (classifiers[i].lower().replace(" ", "_")) + "_conf_matrix_" + str(model_num) + ".txt"
    conf_m = model.confusion_matrix(y_test, y_preds, output = output, save = save, 
                                    outfile = reports_path + old_path) #, path = reports_path)

    model.heatmap(save = save, num = num, outfile = reports_path + old_heatmap_path) # path = reports_path)
    model.roc_curve(X_test, y_test, save = save, outfile = reports_path + old_roc_path) # path = reports_path)
    model.report_results(output = output, save = save, outfile = reports_path + old_path) # path = reports_path) 
    
    model.save_model(path = model_path)
    model.save(path = model_path)

In [None]:
X = docs
y = combined_tar_list
choice = 0 

save = False

'''Load last updated classifier object''' 
new_clsf = Classifier(labels).load(infile = model_path + class_list[choice])
print(new_clsf.get_model()) 
#OR 

'''Create new classifier object and pass it copy of last updated model''' 
# sec_clsf = Classifier(labels) # labels, classifier = classifiers[1])                                                              
# load_model = sec_clsf.load_model(infile = model_path + class_model_list[choice])
# print(load_model)

'''Two ways to load up and test models'''
y_preds = new_clsf.predict(X_test)
print(y_preds)

# sec_y_preds = sec_clsf.predict(X_test)
# print(sec_y_preds)

''' test model only ''' 
# md_y_preds = load_model.predict(X_test)
# print(md_y_preds)

# # new_clsf.set_params(model = load_model)

# # sec_y_preds.set_params(labels = labels, classifier = classifiers[1])

# cls_report = new_clsf.classification_report(y_test, y_preds)
acc_report = new_clsf.accuracy_score(y_test, y_preds)
print(new_clsf.model_name)
# conf_m = new_clsf.confusion_matrix(y_test, y_preds)

# new_clsf.report_results(y_test, y_preds)
# new_clsf.heatmap()

# cls_report = sec_clsf.classification_report(y_test, y_preds)
acc_report = sec_clsf.accuracy_score(y_test, y_preds) 
print(sec_clsf.model_name) # this is empty for sec_model

# conf_m = sec_clsf.confusion_matrix(y_test, y_preds)

# sec_clsf.report_results(y_test, y_preds)
# sec_clsf.heatmap()

# # look at set_model_params() with
# load_model.get_params().keys()

In [None]:
# # probability for the label, and the label itself (parallel list) 
save = False 
output = True 

X = docs
y = combined_tar_list 
# label_names are types of 9 targets 

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)

# print(X.shape)
# print(X_test.shape)
# print(y.shape)
# print(y_test.shape)

test_model_0 = Classifier(label_names, classifier = classifiers[0]) 

test_model_0.fit(X_train, y_train)
y_preds = test_model_0.predict(X_test)
y_probs = test_model_0.predict_proba(X_test)

# display(X_test)
# display(y_test)
display(pd.concat([pd.DataFrame(y_preds, columns = ["prediction"]), pd.DataFrame(y_probs, columns = label_names)], axis = 1))

cls_report = test_model_0.classification_report(y_test, y_preds, output = output, save = save, path = reports_path)
acc_report = test_model_0.accuracy_score(y_test, y_preds)
conf_m = test_model_0.confusion_matrix(y_test, y_preds, output = output, save = save, path = reports_path)

test_model_0.heatmap(save = save, path = reports_path)
test_model_0.report_results(output = output, save = save, path = reports_path) 

# test_model_0.save_model(path = model_path)
# test_model_0.save(path = model_path)

In [None]:
# consider n for self-created test set  n = 200 or n = 250 

In [None]:
# LSTM and CNN *** sharfard paper 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

X = docs
y = combined_tar_list

# min_max = MinMaxScaler()
# standard_scale = StandardScaler()
# tfidf_X = tfidf_vectorizer.transform(X).toarray()
# tfidf_y = tfidf_vectorizer.transform(y).toarray()
# print(len(tfidf_X))
# print(len(tfidf_y))

tfidf_X = X
tfidf_y = y
X_ss = tfidf_X # standard_scale.fit_transform(tfidf_X)
y_mm = tfidf_y # min_max.fit_transform(tfidf_y) 
# display(X_ss)
# display(y_mm)

train_num = int(0.8 * len(y)) 
# print(train_num)
# Must keep data in sequential order for memory learning 
X_train = X_ss[:train_num, :]
X_test = X_ss[train_num:, :]
# print("X_train ")
# print(X_train)
# print("X_test ")
# print(X_test)

# print(X_ss)
# print(y_mm)
y_train = y_mm[:train_num] # , :]
y_test = y_mm[train_num:] # , :]
print("y_train ", y_train)
print("y_test ", y_test)

# max_review_length = 500
# X_train = sequence.pad_sequences(X_train, maxlen = max_review_length)
# X_test = sequence.pad_sequences(X_test, maxlen = max_review_length)

# print("Training Shape", X_train.shape, y_train.shape)
# print("Testing Shape", X_test.shape, y_test.shape)

from torch.autograd import Variable 
X_train_tensors = Variable(torch.Tensor(X_train))# .shape[0]))
X_test_tensors = Variable(torch.Tensor(X_test))# .shape[0]))
display(X_train_tensors)
display(X_test_tensors)

y_train_tensors = Variable(torch.Tensor(y_train))
display(y_train_tensors)
y_test_tensors = Variable(torch.Tensor(list(y_test))) 
display(y_test_tensors)

X_train_tensors_final = torch.reshape(X_train_tensors, 
                                      (X_train_tensors.shape[0], 1, X_train_tensors.shape[1]))
X_test_tensors_final = torch.reshape(X_test_tensors,  
                                     (X_test_tensors.shape[0], 1, X_test_tensors.shape[1])) 
display(X_train_tensors_final)
display(X_test_tensors_final)

display(y_train_tensors)
display(y_test_tensors)

# lstm = torch.nn.LSTM(X) # 10, 20, 2)
# output, (hn, cn) = lstm(inputs, (h0, c0))

# create the model
embed_vec_length = 32
lstm = Sequential()
lstm.add(Embedding(train_num, embed_vec_length))# , input_length = train_num))

lstm.add(LSTM(100))

# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropogate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (autograd.Variable(torch.randn(1, 1, 3)), autograd.Variable(
    torch.randn((1, 1, 3))))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

# lstm.add(Dense(1, activation = 'sigmoid'))

lstm.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

lstm.summary()
lstm.fit(X_train, y_train, batch_size = 32) # epochs = 3, batch_size = 64)

# print("Output:")
# print(output)
# print()
# print("hn:")
# print(hn)
# print()
# print("cn:")
# print(cn)
# lstm.compile(loss='cosine_proximity', optimizer='sgd', metrics = ['accuracy'])

# Final evaluation of the model
scores = lstm.evaluate(X_test, y_test, epochs = 1, verbose = 0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

In [None]:
nn_model = Sequential()
nn_model.add(Embedding(input_dim=1000, output_dim=64))

# The output of GRU will be a 3D tensor of shape (batch_size, timesteps, 256)
nn_model.add(GRU(256, return_sequences=True))

# The output of SimpleRNN will be a 2D tensor of shape (batch_size, 128)
nn_model.add(SimpleRNN(128))

nn_model.add(Dense(10))

nn_model.summary()

nn_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
nn_model.fit(X_train, y_train, epochs = 3, batch_size = 64)

In [None]:
# considers params and gamma heavily with gaussian

# model selection and hyperparamter tuning step 
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1 , 0.1, 0.01, 0.001]} 
svm_grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)

svm_grid.fit(X_train, y_train)

y_preds = svm_grid.predict(X_test)
report = classification_report(y_test, y_preds)
acc4 = accuracy_score(y_test, y_preds)
mod_train4 = svm_grid.score(X_train, y_train) 

print(report)

print('SVM Train accuracy {:.3f}%'.format(mod_train4 * 100)) 
print('SVM Test accuracy {:.3f}%'.format(acc4 * 100)) 

print('Confusion matrix: ')
conf_m = confusion_matrix(y_test, y_preds)
print(conf_m)
make_heatmap(conf_m, labels, save = False, out_file = output_path + classifiers[0] + '_combined.pdf')
