In [18]:
!pip install tensorflow_text -q
!pip install transformers -q

In [19]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tensorflow_text as text
import pandas as pd
from transformers import BertTokenizer, TFBertModel

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [20]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
import pandas as pd

Mounted at /content/drive


In [5]:
train_df = pd.read_csv("/content/drive/MyDrive/p2.csv")

### Check

In [6]:
df_train1 = train_df.copy()

In [7]:
df_train1['T_BP'] = df_train1['T_BP'].astype("string")

In [8]:
df_train1['new_col'] = df_train1['PRODUCT_TYPE_ID'].astype(str) + '[SEP]' + df_train1['T_BP']

In [31]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset

# Separate the column with 19k entries
col = df_train1['PRODUCT_TYPE_ID']

# Sample 50% of the column while preserving the distribution
sampled_col = resample(col, n_samples=0.25*len(col), random_state=42, stratify=col)

# Get the indices of the sampled entries
indices = sampled_col.index

# Sample the original dataframe based on the indices
sampled_df = df_train1.loc[indices]

In [32]:
Y = sampled_df["PRODUCT_LENGTH"]
X = sampled_df['new_col']

In [33]:
X = X.fillna("")

In [34]:
X.shape

(140605,)

In [35]:
Y.shape

(140605,)

In [36]:
#X = X[0:23940]
#Y = Y[0:23940]

In [37]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.33, random_state=42)

In [38]:
df_train1['PRODUCT_TYPE_ID']

0          6314
1          5852
2             1
3             0
4          2834
          ...  
562416     6320
562417     8036
562418      819
562419     6115
562420    12316
Name: PRODUCT_TYPE_ID, Length: 562421, dtype: int64

In [6]:
hub_url = "https://tfhub.dev/google/sentence-t5/st5-base/1"
bert_encoder = hub.KerasLayer(hub_url)

bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
#bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")



In [None]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351724, -0.5132727 , -0.88845736, ..., -0.7474883 ,
        -0.75314754,  0.91964495],
       [-0.87208354, -0.50543964, -0.94446665, ..., -0.85847497,
        -0.71745336,  0.88082975]], dtype=float32)>

In [None]:
X_train

335466    3598[SEP]chance premium rubber outdoor indoor ...
36229     96[SEP]paul kelver 1902 autobiographical novel...
35473                     1[SEP]des esels schatten op posth
487316    1218[SEP]big time toy yoyo ball automatic retu...
254220                              6132[SEP]utilitarianism
                                ...                        
155806    1458[SEP]panchhi products best cheese vegetabl...
130755    3072[SEP]ukal women kaftan maxi kimono style n...
133054    12440[SEP]violet vibes rakhi gift peel wooden ...
334143    8490[SEP]bhoolugoolu little prince princess mi...
134685    10484[SEP]yash desert aquarium fountain high q...
Name: new_col, Length: 188410, dtype: string

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.5, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(32, activation='elu', name="hl1")(l)
l = tf.keras.layers.BatchNormalization()(l)
# l = tf.keras.layers.Dense(8, activation='tanh', name="hl2")(l)
# l = tf.keras.layers.BatchNormalization()(l)
l = tf.keras.layers.Dense(1, activation='linear', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_4 (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                

In [None]:
# tf.keras.metrics.

In [None]:
METRICS = [
      tf.keras.metrics.MAPE,
      tf.keras.metrics.MSE
]

model.compile(optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.005,
    beta_1=0.905,
    beta_2=0.995,
    epsilon=1e-07,
    weight_decay=None,
),
              loss=tf.keras.losses.mape,
              metrics=METRICS)

In [None]:
Y.shape

(281210,)

In [None]:
X.shape

(281210,)

In [None]:
X_train = np.array(X_train)
Y_train = np.array(y_train)

In [None]:
X_test = np.array(X_test)
Y_test = np.array(y_test)

In [None]:
# type(X)

In [None]:
# Y = np.asarray(Y).astype('float32')
import keras

# class CustomSaver(keras.callbacks.Callback):
#     def on_epoch_end(self, epoch, logs={}):
#         if epoch == 1:  # or save after some epoch, each k-th epoch etc.
#             self.model.save("model_{}.hd5".format(epoch))


In [None]:
checkpoint = keras.callbacks.ModelCheckpoint('model{epoch:08d}.h5', period=1) 



In [None]:
model.fit(X_train, Y_train, epochs=10, validation_split = 0.2, callbacks=[checkpoint])

Epoch 1/10
1005/4711 [=====>........................] - ETA: 21:15 - loss: 97.9589 - mean_absolute_percentage_error: 97.9589 - mean_squared_error: 14351646720.0000

### Testing

In [7]:
import torch, gc, random
from transformers.file_utils import is_tf_available, is_torch_available
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
#%load_ext memory_profiler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

In [None]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset

# Separate the column with 19k entries
col = df_train1['PRODUCT_TYPE_ID']

# Sample 50% of the column while preserving the distribution
sampled_col = resample(col, n_samples=0.25*len(col), random_state=42, stratify=col)

# Get the indices of the sampled entries
indices = sampled_col.index

# Sample the original dataframe based on the indices
sampled_df = df_train1.loc[indices]
y = sampled_df["PRODUCT_LENGTH"]
X = sampled_df['new_col']
X = X.fillna("")
X.shape
y.shape

In [44]:
#X = Data
#y = Target


# Split Data
X_train, X_test, y_train, y_test = train_test_split(X.tolist(), y, test_size=0.33)

# Call the Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Encode the text
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=128)
valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=128)



class MakeTorchData(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        item["labels"] = float(item["labels"])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = MakeTorchData(train_encodings, y_train.ravel())
valid_dataset = MakeTorchData(valid_encodings, y_test.ravel())

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [45]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', 
                                                           num_labels = 1).to("cuda")

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    rmse = mean_squared_error(labels, logits, squared=False)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    smape = 1/len(labels) * np.sum(2 * np.abs(logits-labels) / (np.abs(labels) + np.abs(logits))*100)
    
    single_squared_errors = ((logits - labels).flatten()**2).tolist()
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)
    
    #return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "accuracy": accuracy}

    return {"mse": mse, "rmse": rmse, "mae": mae, "r2": r2, "smape": smape}

In [47]:
training_args = TrainingArguments(
    output_dir ='./results',          
    num_train_epochs = 10,     
    per_device_train_batch_size = 64,   
    per_device_eval_batch_size = 20,   
    weight_decay = 0.01,               
    learning_rate = 2e-5,
    logging_dir = './logs',            
    save_total_limit = 10,
    load_best_model_at_end = True,     
    metric_for_best_model = 'rmse',    
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
) 

# Call the Trainer
trainer = Trainer(
    model = model,                         
    args = training_args,                  
    train_dataset = train_dataset,         
    eval_dataset = valid_dataset,          
    compute_metrics = compute_metrics_for_regression,     
)

# Train the model
trainer.train()

# Call the summary
trainer.evaluate()



Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,R2,Smape
1,7048875999.232,5457762304.0,5457762304.0,73876.671875,2149.249512,-0.000847,175.446272
2,10924662456.32,5457678848.0,5457678848.0,73876.101562,2130.618652,-0.000832,165.081153
3,7282364010987.519,5457608192.0,5457607168.0,73875.617188,2114.459717,-0.000819,156.994386
4,7143583907.84,5457546752.0,5457546240.0,73875.210938,2100.611328,-0.000807,150.532112
5,18693337645.056,5457494528.0,5457494016.0,73874.851562,2088.991455,-0.000798,145.391595
6,7221765780537.343,5457452544.0,5457452544.0,73874.570312,2079.668701,-0.00079,141.496843


KeyboardInterrupt: ignored

In [10]:
model1 = torch.load('/content/results/checkpoint-8832/pytorch_model.bin')
test_data = pd.read_csv('/content/drive/MyDrive/test_processed.csv')
test_data['T_BP'] = test_data['T_BP'].astype("string")
test_data['new_col'] = test_data['PRODUCT_TYPE_ID'].astype(str) + '[SEP]' + test_data['T_BP']

In [12]:
#y_test = test_data["PRODUCT_LENGTH"]
X_test = test_data['new_col']
X_test = X_test.fillna("")
X_test = X_test.tolist()
test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=128)

KeyboardInterrupt: ignored

In [None]:
inputs = torch.tensor(preprocessed_data)
# put the model in evaluation mode
model1.eval()
# make predictions on the new data
with torch.no_grad():
    predictions = model1(inputs)
# convert the PyTorch tensor back to a numpy array
predictions = predictions.numpy()

### Check

In [None]:
df_train1 = train_df.copy()

In [None]:
df_train1['T_BP'] = df_train1['T_BP'].astype("string")

In [None]:
df_train1['new_col'] = df_train1['PRODUCT_TYPE_ID'].astype(str) + '[SEP]' + df_train1['T_BP']

In [None]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset

# Separate the column with 19k entries
col = df_train1['PRODUCT_TYPE_ID']

# Sample 50% of the column while preserving the distribution
sampled_col = resample(col, n_samples=0.5*len(col), random_state=42, stratify=col)

# Get the indices of the sampled entries
indices = sampled_col.index

# Sample the original dataframe based on the indices
sampled_df = df_train1.loc[indices]

In [None]:
Y = sampled_df["PRODUCT_LENGTH"]
X = sampled_df['new_col']

In [None]:
X = X.fillna("")

In [None]:
X.shape

(281210,)

In [None]:
Y.shape

(281210,)

In [None]:
#X = X[0:23940]
#Y = Y[0:23940]

In [None]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.33, random_state=42)

In [None]:
df_train1['PRODUCT_TYPE_ID']

0          6314
1          5852
2             1
3             0
4          2834
          ...  
562416     6320
562417     8036
562418      819
562419     6115
562420    12316
Name: PRODUCT_TYPE_ID, Length: 562421, dtype: int64

In [None]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

get_sentence_embeding([
    "500$ discount. hurry up", 
    "Bhavin, are you up for a volleybal game tomorrow?"]
)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.84351724, -0.5132727 , -0.88845736, ..., -0.7474883 ,
        -0.75314754,  0.91964495],
       [-0.87208354, -0.50543964, -0.94446665, ..., -0.85847497,
        -0.71745336,  0.88082975]], dtype=float32)>

In [None]:
X_train

335466    3598[SEP]chance premium rubber outdoor indoor ...
36229     96[SEP]paul kelver 1902 autobiographical novel...
35473                     1[SEP]des esels schatten op posth
487316    1218[SEP]big time toy yoyo ball automatic retu...
254220                              6132[SEP]utilitarianism
                                ...                        
155806    1458[SEP]panchhi products best cheese vegetabl...
130755    3072[SEP]ukal women kaftan maxi kimono style n...
133054    12440[SEP]violet vibes rakhi gift peel wooden ...
334143    8490[SEP]bhoolugoolu little prince princess mi...
134685    10484[SEP]yash desert aquarium fountain high q...
Name: new_col, Length: 188410, dtype: string

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.5, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(32, activation='elu', name="hl1")(l)
l = tf.keras.layers.BatchNormalization()(l)
# l = tf.keras.layers.Dense(8, activation='tanh', name="hl2")(l)
# l = tf.keras.layers.BatchNormalization()(l)
l = tf.keras.layers.Dense(1, activation='linear', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_4 (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                

In [None]:
# tf.keras.metrics.

In [None]:
METRICS = [
      tf.keras.metrics.MAPE,
      tf.keras.metrics.MSE
]

model.compile(optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.005,
    beta_1=0.905,
    beta_2=0.995,
    epsilon=1e-07,
    weight_decay=None,
),
              loss=tf.keras.losses.mape,
              metrics=METRICS)

In [None]:
Y.shape

(281210,)

In [None]:
X.shape

(281210,)

In [None]:
X_train = np.array(X_train)
Y_train = np.array(y_train)

In [None]:
X_test = np.array(X_test)
Y_test = np.array(y_test)

In [None]:
# type(X)

In [None]:
# Y = np.asarray(Y).astype('float32')
import keras

# class CustomSaver(keras.callbacks.Callback):
#     def on_epoch_end(self, epoch, logs={}):
#         if epoch == 1:  # or save after some epoch, each k-th epoch etc.
#             self.model.save("model_{}.hd5".format(epoch))


In [None]:
checkpoint = keras.callbacks.ModelCheckpoint('model{epoch:08d}.h5', period=1) 



In [None]:
model.fit(X_train, Y_train, epochs=10, validation_split = 0.2, callbacks=[checkpoint])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10

In [25]:
import tensorflow as tf
from tensorflow import keras
from transformers import AutoTokenizer, TFBertModel 

In [29]:
my_reloaded_model = tf.keras.models.load_model(
       '/content/drive/MyDrive/model00000007.h5',
       custom_objects={'KerasLayer':hub.KerasLayer}
)

In [30]:
df_test = pd.read_csv('/content/drive/MyDrive/test_processed.csv')

In [32]:
df_test['T_BP'] = df_test['T_BP'].astype("string")
df_test['new_col'] = df_test['PRODUCT_TYPE_ID'].astype(str) + '[SEP]' + df_test['T_BP']
X_test = df_test['new_col']
X_test = X_test.fillna("")
X_test = np.array(X_test)

In [None]:
y_predict = my_reloaded_model.predict(
    X_test,
    batch_size=256,
    verbose="auto",
    steps=None,
    callbacks=None,
    max_queue_size=10,
    workers=2,
    use_multiprocessing=True,
)

