In [None]:
import numpy as np
import regex as re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import RobertaTokenizer, TFRobertaModel
import transformers

from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Pick a tf.distribute strategy for the available hardware: a TPU strategy when a TPU
# cluster can be resolved, otherwise TensorFlow's default strategy.
# Binds the notebook-level names `tpu` (only on the TPU path) and `strategy`.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set (always set in Kaggle)
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    # The next two calls must run in this order before the strategy is built:
    # connect to the resolved cluster, then initialize the TPU system.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # NOTE(review): newer TensorFlow releases also expose this class as
    # tf.distribute.TPUStrategy - confirm against the installed version.
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    # Only ValueError is treated as "no TPU available"; any other exception propagates.
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count is reused below to scale the global batch size.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# --- Notebook configuration -------------------------------------------------
# HuggingFace checkpoint name used for the tokenizer and the pretrained model.
MODEL_NAME = 'camembert-base'
# Fixed sequence length (in tokens, including the two special tokens) used by roberta_encode.
MAX_LEN = 256
# Directory for files produced by this notebook; created below if missing.
ARTIFACTS_PATH = '../artifacts/'

# Global batch size: 8 examples per replica of the active distribution strategy.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
EPOCHS = 6

# exist_ok=True makes the cell idempotent without a separate existence check,
# so there is no window between "check" and "create" for another process to race.
os.makedirs(ARTIFACTS_PATH, exist_ok=True)

In [None]:
def roberta_encode(texts, tokenizer, max_len=None, cls_id=0, sep_id=2, pad_id=1):
    """Turn raw texts into fixed-length integer arrays for a RoBERTa-style model.

    Each text is tokenized, truncated to leave room for one start and one end
    token, wrapped as ``[cls_id] + ids + [sep_id]`` and right-padded with
    ``pad_id`` up to ``max_len``.

    Args:
        texts: Sized iterable of strings (anything supporting ``len`` and
            iteration, e.g. a list or a 1-D array).
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Output sequence length. ``None`` (default) uses the
            notebook-level ``MAX_LEN`` constant.
        cls_id: Id written at position 0 of every row.
        sep_id: Id written right after the last kept token of every row.
        pad_id: Id used to fill the remainder of each row.

    Returns:
        dict with three ``int32`` arrays of shape ``(len(texts), max_len)``:
        ``'input_word_ids'``, ``'input_mask'`` (1 on real tokens, 0 on
        padding) and ``'input_type_ids'`` (all zeros).

    Raises:
        ValueError: if ``max_len`` is smaller than 2, since the start and end
            tokens alone would not fit.

    NOTE(review): the defaults 0 / 2 / 1 are the ids this function has always
    hard-coded. They are kept so existing callers and already-trained models
    see identical inputs, but they should be checked against
    ``tokenizer.cls_token_id`` / ``sep_token_id`` / ``pad_token_id`` for the
    tokenizer actually in use.
    """
    if max_len is None:
        max_len = MAX_LEN
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold the start and end tokens")

    ct = len(texts)
    input_ids = np.full((ct, max_len), pad_id, dtype='int32')
    attention_mask = np.zeros((ct, max_len), dtype='int32')
    token_type_ids = np.zeros((ct, max_len), dtype='int32')  # Not used in text classification

    for k, text in enumerate(texts):
        tok_text = tokenizer.tokenize(text)

        # Truncate before converting so the wrapped row never exceeds max_len;
        # no further clamping of the length is needed afterwards.
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:max_len - 2])

        row = [cls_id] + list(enc_text) + [sep_id]
        input_ids[k, :len(row)] = row

        # Attend only to the positions that carry real (non-padding) tokens.
        attention_mask[k, :len(row)] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [None]:
# Load the pretrained tokenizer and masked-language-model weights for MODEL_NAME
# from HuggingFace. Binds the notebook-level names `tokenizer` and `model`.
from transformers import CamembertForMaskedLM
from transformers import CamembertTokenizer

# `tokenizer` is what the prediction cell passes to roberta_encode.
tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): the prediction cell rebinds `model` to the result of
# load_model('./Post_Data/model.h5'), and no visible cell uses this masked-LM
# instance in between - confirm whether this load is still needed.
model = CamembertForMaskedLM.from_pretrained(MODEL_NAME)
# Presumably switches the model to inference mode - TODO confirm.
model.eval()


In [None]:
import pandas as pd
import numpy as np
from keras.models import load_model

# --- Batch prediction: read test texts, encode, predict, write labels to CSV ---
text = pd.read_csv('./Post_Data/Testing_data.csv')
# Empty CSV cells are read as NaN (a float), not a string. Coerce the column to
# str (NaN -> '') so every row reaches the tokenizer as text and the output
# keeps exactly one prediction per input row, in the same order.
text = text['text'].fillna('').astype(str).to_numpy()
text = roberta_encode(text, tokenizer)

# Rebinds the notebook-level name `model` to the trained classifier.
model= load_model('./Post_Data/model.h5')
y_pred = model.predict(text)
# Index of the highest score in each prediction row.
y_pred_max = [np.argmax(i) for i in y_pred]

# NOTE(review): `category_to_name` is not defined in any visible cell; on a fresh
# kernel this line raises NameError. It presumably maps class index -> label name
# and must be defined (or loaded) before this cell runs - TODO confirm its source.
y_pred_fin = [category_to_name[i] for i in y_pred_max]

# One predicted label per row; the single column gets pandas' default header.
df = pd.DataFrame(y_pred_fin)
df.to_csv('./Post_Data/Output.csv', index=False)



In [None]:
# Reload the saved classifier through tf.keras and print its layer summary.
# Binds the notebook-level name `new_model`; this is the same file the
# prediction cell loads into `model` via keras.models.load_model.
import tensorflow as tf

new_model = tf.keras.models.load_model('./Post_Data/model.h5')

# Show the model architecture
new_model.summary()

In [None]:
# Reload the trained classifier from the HDF5 checkpoint into `model`.
# The previous call went through `saved_model`, a name no cell imports or
# defines (NameError on a fresh kernel); use the same tf.keras loader as the
# reload/summary cell instead.
model = tf.keras.models.load_model('./Post_Data/model.h5')