# Model3a
## Ktrain Customized Regression (Text+Tabular) with Standard Embeddings

### TOC

* [Dataset Prep](#d)
* [Preprocessing](#p)
* [Modeling](#m)
    * [Model Selection](#ms)
    * [Modeling](#mm)

## Dataset Prep <a class="anchor" id="d"></a>

In [1]:
import pandas as pd
from tensorflow import keras
import numpy as np
import math

# Load the modelling table (path is relative to the working directory).
df_1 = pd.read_csv("df_model_2.csv")
df_1.head()

# Text branch: the news text is the only feature, 'PercentChg.y' the target.
df_text = df_1[['News content', 'PercentChg.y']]

x_text_prep = df_text[['News content']].values  # 2-D array with one column
y_text_prep = df_text['PercentChg.y'].values    # 1-D array of targets

from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed.
(x_text_train, x_text_test,
 y_text_train, y_text_test) = train_test_split(
    x_text_prep, y_text_prep, test_size=0.2, random_state=123)

# Tabular branch: nine financial-ratio columns as features, with the same
# 'PercentChg.y' target that the text branch uses.
tab_feature_cols = [
    'currentRatio', 'quickRatio', 'debtEquityRatio', 'interestCoverage',
    'returnOnEquity', 'priceEarningsRatio', 'receivablesTurnover',
    'payablesTurnover', 'eps',
]
tab_target_col = 'PercentChg.y'
df_tab = df_1[tab_feature_cols + [tab_target_col]]

x_tab_prep = df_tab[tab_feature_cols].values
y_tab_prep = df_tab[[tab_target_col]].values  # kept 2-D (one column)

from sklearn.model_selection import train_test_split

# Array split: same test_size and random_state as the text split.
(x_tab_train, x_tab_test,
 y_tab_train, y_tab_test) = train_test_split(
    x_tab_prep, y_tab_prep, test_size=0.2, random_state=123)

# DataFrame split (features + target together) for ktrain's tabular
# preprocessing, which takes data frames rather than arrays.
tab_train, tab_test = train_test_split(df_tab, test_size=0.2, random_state=123)

# Convert the text splits to plain Python lists for ktrain.
#
# x_text_train / x_text_test are (n, 1) arrays because the features were
# selected as a one-column 2-D slice.  Calling str() on each row of
# .tolist() would therefore stringify a one-element *list* and hand the
# tokenizer "['some text']" (brackets, quotes, escaped newlines) instead of
# the text itself.  Flatten first so every entry is the raw document.
# np.ravel also leaves an already 1-D array unchanged.
x_text_train = [str(doc) for doc in np.ravel(x_text_train)]
x_text_test = [str(doc) for doc in np.ravel(x_text_test)]

# Targets are already 1-D; just turn them into lists.
y_text_train = y_text_train.tolist()
y_text_test = y_text_test.tolist()

## Preprocessing <a class="anchor" id="p"></a>

In [None]:
import ktrain
from ktrain import text

# Preprocessing options forwarded unchanged to ktrain.
# NOTE(review): ngram_range=20 looks unusually large for an n-gram setting;
# confirm against the ktrain documentation that this is intended.
text_prep_options = dict(
    ngram_range=20,
    preprocess_mode='standard',
    maxlen=300,
    max_features=35000,
    random_state=1,
)

# Build the train/validation text data and the matching preprocessor from the
# already-split text lists.
trn_text, val_text, preproc_text = text.texts_from_array(
    x_train=x_text_train,
    y_train=y_text_train,
    x_test=x_text_test,
    y_test=y_text_test,
    **text_prep_options,
)

In [None]:
import ktrain
from ktrain import tabular

# Build ktrain tabular datasets from the data-frame split: 'PercentChg.y' is
# the regression label and tab_test is supplied as the validation frame.
trn_tab, val_tab, preproc_tab = tabular.tabular_from_df(
    tab_train,
    label_columns=['PercentChg.y'],
    val_df=tab_test,
    is_regression=True,
    random_state=123,
)

## Modeling <a class="anchor" id="m"></a>

### Model Selection <a class="anchor" id="ms"></a>

In [None]:
# Text branch: ktrain's 'linreg' text regression model, tracking MSE and MAE.
model_text = text.text_regression_model(
    'linreg',
    train_data=trn_text,
    preproc=preproc_text,
    metrics=['mse', 'mae'],
)

In [None]:
# Tabular branch: ktrain's 'mlp' tabular regression model, tracking MSE and MAE.
model_tab = tabular.tabular_regression_model(
    'mlp',
    trn_tab,
    metrics=['mse', 'mae'],
)

In [6]:
# Fuse the two branches by concatenating their outputs (tabular first, then
# text), then stack a small fully connected head on top.
fused = keras.layers.concatenate([model_tab.output, model_text.output])

# Two hidden stages, each: dropout -> ReLU dense layer.
for drop_rate, width in ((0.25, 1000), (0.25, 500)):
    fused = keras.layers.Dropout(drop_rate)(fused)
    fused = keras.layers.Dense(width, activation='relu')(fused)

# Final dropout and a single linear unit for the regression output.
fused = keras.layers.Dropout(0.5)(fused)
merged_out = keras.layers.Dense(1)(fused)

# Input order (tabular, text) must match the order in which features are fed.
combined_model = keras.Model([model_tab.input, model_text.input], merged_out)
combined_model.compile(loss='mae', optimizer='adam', metrics=['mae', 'mse'])

### Modeling <a class="anchor" id="mm"></a>

In [None]:
import tensorflow as tf
from ktrain.data import TFDataset
BATCH_SIZE = 64

# Each combined split is [tabular features, text features, labels]; the
# labels are taken from the text datasets.
# NOTE(review): the tabular features here are the raw arrays from the earlier
# array split, not the output of the ktrain tabular preprocessing — confirm
# this matches what model_tab expects as input.
trn_combined = [x_tab_train, trn_text[0], trn_text[1]]
val_combined = [x_tab_test, val_text[0], val_text[1]]

def features_to_tfdataset(examples):
    """Wrap ``[tab_features, text_features, labels]`` in a ``tf.data.Dataset``.

    Args:
        examples: an indexable of three aligned sequences -- ``examples[0]``
            tabular feature rows, ``examples[1]`` text feature rows and
            ``examples[2]`` the regression labels.

    Returns:
        An unbatched ``tf.data.Dataset`` whose elements are
        ``((tab_row, text_row), label)``.  The input order (tabular, text)
        matches the input order of the combined Keras model.
    """
    tab_features, text_features, labels = examples[0], examples[1], examples[2]

    def gen():
        # Iterate over the tabular rows and pick the aligned text row / label.
        for idx, tab_row in enumerate(tab_features):
            yield (tab_row, text_features[idx]), labels[idx]

    # The tabular features (financial ratios) and the target are real-valued.
    # Declaring them as integer tensors makes from_generator cast every value
    # to an integer, which destroys the regression target, so both are
    # declared as float32.  The text features stay int32.
    # NOTE(review): int32 assumes the text features are integer token ids as
    # produced by the 'standard' text preprocessing -- confirm.
    tfdataset = tf.data.Dataset.from_generator(
        gen,
        output_types=((tf.float32, tf.int32), tf.float32),
        output_shapes=((tf.TensorShape([None]), tf.TensorShape([None])),
                       tf.TensorShape([])),
    )
    return tfdataset
# Number of samples in each split (rows of the tabular feature array).
n_train = trn_combined[0].shape[0]
n_val = val_combined[0].shape[0]

# Training stream: shuffle over the whole split, batch, and repeat endlessly.
train_tfdataset = (
    features_to_tfdataset(trn_combined)
    .shuffle(n_train)
    .batch(BATCH_SIZE)
    .repeat(-1)
)
# Validation stream: batched only.
val_tfdataset = features_to_tfdataset(val_combined).batch(BATCH_SIZE)

# Wrap the tf.data pipelines for ktrain, passing the sample counts and labels.
train_data = ktrain.TFDataset(train_tfdataset, n=n_train, y=trn_combined[2])
val_data = ktrain.TFDataset(val_tfdataset, n=n_val, y=val_combined[2])

learner = ktrain.get_learner(
    combined_model,
    train_data=train_data,
    val_data=val_data,
)

In [8]:
# Train the combined model with ktrain's one-cycle policy; the logged output
# below shows a max learning rate of 0.001 over 5 epochs.
learner.fit_onecycle(1e-3, 5)



begin training using onecycle policy with max lr of 0.001...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe4f97dd610>

In [None]:
# Plot the 'loss' series recorded by the learner.
learner.plot('loss')

In [None]:
# Plot the 'lr' series recorded by the learner (presumably the learning-rate
# schedule used by the one-cycle run -- confirm against the ktrain docs).
learner.plot('lr')

In [None]:
# Plot the 'momentum' series recorded by the learner.
learner.plot('momentum')

In [None]:
# Show the n=3 highest-loss examples (presumably taken from the validation
# data -- confirm against the ktrain docs).
learner.view_top_losses(n=3)

In [68]:
# Run the trained model over the validation data; `pred` is indexed per
# sample in the spot check below.
pred = learner.predict(val_data)

In [None]:
# Spot-check five randomly drawn validation samples: print the sample index,
# the model prediction and the true target.
val_data.batch_size = 1
n_val_texts = len(x_text_test)
for _ in range(5):
    idx = np.random.choice(n_val_texts)
    report_lines = (
        [idx],
        "\tpredicted: %s" % (np.squeeze(pred[idx])),
        "\tactual: %s" % (y_text_test[idx]),
        '----------------------------------------',
    )
    for report_line in report_lines:
        print(report_line)