# Model3b
## Ktrain Customized Regression (Text+Tabular) with BERT (API) Embeddings

### TOC

* [Dataset Prep](#d)
* [Preprocessing](#p)
* [Modeling](#m)
    * [Model Selection](#ms)
    * [Data Reformat](#dr)

## Dataset Prep <a class="anchor" id="d"></a>

In [1]:
from tensorflow import keras
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

# Load the modelling table: one row per sample, holding the news text, the
# financial-ratio columns and the regression target 'PercentChg.y'.
df_1 = pd.read_csv("df_model_2.csv")
df_1.head()

# Optional (disabled): scale the target by 100.
# df_1['PercentChg.y'] = df_1['PercentChg.y'] * 100

# Every split below uses the same test_size and random_state. Later cells feed
# the text and tabular arrays to one model side by side, which relies on these
# splits selecting the same rows — NOTE(review): confirm.

# --- texts ---
df_text = df_1[['News content', 'PercentChg.y']]

x_text_prep = df_text.iloc[:, :-1].values  # 2-D: one text column per row
y_text_prep = df_text.iloc[:, -1].values   # 1-D target

x_text_train, x_text_test, y_text_train, y_text_test = train_test_split(
    x_text_prep, y_text_prep, test_size=0.2, random_state=123)

# --- tabulars ---
df_tab = df_1[['currentRatio', 'quickRatio', 'debtEquityRatio', 'interestCoverage',
               'returnOnEquity', 'priceEarningsRatio', 'receivablesTurnover',
               'payablesTurnover', 'eps', 'PercentChg.y']]

x_tab_prep = df_tab.iloc[:, :-1].values
y_tab_prep = df_tab.iloc[:, -1:].values  # kept 2-D (note the -1: slice)

x_tab_train, x_tab_test, y_tab_train, y_tab_test = train_test_split(
    x_tab_prep, y_tab_prep, test_size=0.2, random_state=123)

# DataFrame version of the tabular split (features + target together).
tab_train, tab_test = train_test_split(df_tab, test_size=0.2, random_state=123)

# --- plain Python lists for the text pipeline ---
# x_text_* are 2-D arrays with a single column, so take the one element of
# each row before calling str(). Calling str() on the whole row would wrap
# every document in "['...']" and escape its quotes and newlines.
x_text_train = [str(row[0]) for row in x_text_train]
x_text_test = [str(row[0]) for row in x_text_test]
y_text_train = y_text_train.tolist()
y_text_test = y_text_test.tolist()

## Preprocessing <a class="anchor" id="p"></a>

In [None]:
import ktrain
from ktrain import text

# Run ktrain's text preprocessing in 'bert' mode on the train/test strings.
# The call returns the training set, the validation set and the fitted
# preprocessor; later cells index each set as [0] (inputs) and [1] (targets).
# NOTE(review): max_features may have no effect in 'bert' mode — confirm.
trn_text, val_text, preproc_text = text.texts_from_array(
    x_train=x_text_train,
    x_test=x_text_test,
    y_train=y_text_train,
    y_test=y_text_test,
    maxlen=300,
    max_features=35000,
    preprocess_mode='bert',
)

In [None]:
import ktrain
from ktrain import tabular

# Preprocess the tabular split with ktrain, treating 'PercentChg.y' as a
# regression target and using tab_test as the validation frame.
trn_tab, val_tab, preproc_tab = tabular.tabular_from_df(
    tab_train,
    val_df=tab_test,
    label_columns=['PercentChg.y'],
    is_regression=True,
    random_state=123,
)

## Modeling <a class="anchor" id="m"></a>

### Model Selection <a class="anchor" id="ms"></a>

In [None]:
# BERT-based text regression model built from the preprocessed text training
# set and its preprocessor; MSE and MAE are tracked as metrics.
model_text = text.text_regression_model(
    'bert',
    metrics=['mse', 'mae'],
    preproc=preproc_text,
    train_data=trn_text,
)

In [None]:
# Build ktrain's 'mlp' tabular regression model from the preprocessed tabular
# training set, tracking MSE and MAE as metrics.
model_tab = tabular.tabular_regression_model('mlp', trn_tab, metrics=['mse', 'mae'])

In [21]:
# Fuse the two sub-models: concatenate their outputs and stack a small
# fully-connected head on top that ends in a single linear unit.
merged_out = keras.layers.concatenate([model_tab.output, model_text.output])

# Two Dropout -> Dense(relu) stages, then a heavier Dropout before the output.
for rate, units in ((0.25, 1000), (0.25, 500)):
    merged_out = keras.layers.Dropout(rate)(merged_out)
    merged_out = keras.layers.Dense(units, activation='relu')(merged_out)
merged_out = keras.layers.Dropout(0.5)(merged_out)
merged_out = keras.layers.Dense(1)(merged_out)

# Inputs are ordered tabular first, then text; the datasets built later pass
# their arrays in that same order.
combined_model = keras.Model(
    inputs=[model_tab.input, model_text.input],
    outputs=merged_out,
)
combined_model.compile(optimizer='adam', loss='mae', metrics=['mae', 'mse'])

### Data Reformatting for Modeling <a class="anchor" id="dr"></a>

In [22]:
# Element [0] of each preprocessed text set holds the model inputs (element
# [1], the targets, is used when the datasets are built); keep the inputs.
trn_text_1, val_text_1 = trn_text[0], val_text[0]

In [23]:
# Split the text inputs into their two component arrays so each can be passed
# to the combined model as a separate input.
# NOTE(review): presumably BERT token ids and segment ids — confirm.
trn_text_a, trn_text_b = trn_text_1[0], trn_text_1[1]
val_text_a, val_text_b = val_text_1[0], val_text_1[1]

In [24]:
class MyCustomDataset(ktrain.SequenceDataset):
    """Serve batches from one or more row-aligned 2-D input arrays plus targets.

    Args:
        x: a 2-D numpy array, or a non-empty list of 2-D numpy arrays that all
            have the same number of rows (one array per model input).
        y: numpy array of targets with one entry per row of ``x``.
        batch_size: number of rows per batch.
        shuffle: if True, the row order is reshuffled at the end of every
            epoch (the first epoch runs in the original order).

    Raises:
        ValueError: if ``x`` or ``y`` has the wrong type or rank, if ``x`` is
            an empty list, or if the arrays disagree on the number of rows.
    """

    def __init__(self, x, y, batch_size=32, shuffle=True):
        # --- validate x: a single 2-D array or a non-empty list of them ---
        if isinstance(x, np.ndarray):
            arrays = [x]
        elif isinstance(x, list):
            arrays = x
        else:
            arrays = None
        # An empty list is rejected here; it would otherwise fail later with
        # an IndexError on x[0].
        if not arrays or any(
                not isinstance(d, np.ndarray) or d.ndim != 2 for d in arrays):
            raise ValueError('x must be a 2d numpy array or a list of 2d numpy arrays')
        if not isinstance(y, np.ndarray):
            raise ValueError('y must be a numpy array')

        # --- validate alignment: every input and y must cover the same rows ---
        n_rows = arrays[0].shape[0]
        if any(d.shape[0] != n_rows for d in arrays):
            raise ValueError('all arrays in x must have the same number of rows')
        if y.ndim < 1 or y.shape[0] != n_rows:
            raise ValueError('y must have the same number of rows as x')

        # set variables
        super().__init__(batch_size=batch_size)
        self.x, self.y = arrays, y
        self.indices = np.arange(n_rows)
        self.n_inputs = len(arrays)
        self.shuffle = shuffle

    # required for instances of tf.keras.utils.Sequence
    def __len__(self):
        """Return the number of batches per epoch (the last may be partial)."""
        return math.ceil(self.x[0].shape[0] / self.batch_size)

    # required for instances of tf.keras.utils.Sequence
    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(tuple_of_input_batches, target_batch)``."""
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = tuple(arr[inds] for arr in self.x)
        batch_y = self.y[inds]
        return batch_x, batch_y

    # required for instances of ktrain.Dataset
    def nsamples(self):
        """Return the total number of rows in the dataset."""
        return self.x[0].shape[0]

    # required for instances of ktrain.Dataset
    def get_y(self):
        """Return the full target array in its original (unshuffled) order."""
        return self.y

    def on_epoch_end(self):
        """Reshuffle the row order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [None]:
# Wrap the three input arrays (tabular features, then the two text arrays)
# and the targets into datasets. Only the training set is shuffled.
# NOTE(review): x_tab_train / x_tab_test are the raw column values from the
# split; in the cells shown, the ktrain-preprocessed trn_tab is only used to
# build model_tab. Confirm the raw values are what model_tab's input expects.
train_data = MyCustomDataset(
    x=[x_tab_train, trn_text_a, trn_text_b],
    y=trn_text[1],
    shuffle=True,
)
val_data = MyCustomDataset(
    x=[x_tab_test, val_text_a, val_text_b],
    y=val_text[1],
    shuffle=False,
)
learner = ktrain.get_learner(
    combined_model,
    train_data=train_data,
    val_data=val_data,
    batch_size=14,
)

In [11]:
# Train with ktrain's one-cycle policy; the log below reports these arguments
# as a max learning rate of 0.001 over 5 epochs.
# NOTE(review): 1e-3 is well above the learning rates usually used when
# fine-tuning BERT (around 2e-5 to 5e-5) — confirm this is intended.
learner.fit_onecycle(1e-3, 5)



begin training using onecycle policy with max lr of 0.001...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fb296eda4c0>