In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

!pip install tqdm

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm, trange

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect (and echo) the full path of every file found under the Kaggle input directory.
files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        files.append(path)
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Detect hardware, return appropriate distribution strategy (you can see that it is pretty easy to set up).
# TensorFlow is imported here because this cell runs before the cell that does the
# notebook-wide `import tensorflow as tf`; without it a fresh "Restart & Run All"
# stops with a NameError, which the `except ValueError` below does not catch.
import tensorflow as tf

try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set (always set in Kaggle)
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    print('Running on TPU ', tpu.master())
except ValueError:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Pick the three input files by their position in `files`, then load each one.
# NOTE(review): positional indexing depends on the order in which the files were
# discovered — confirm that indices 0, 2 and 3 point at the intended files.
tweets_path, data_type_path, emotion_path = files[0], files[2], files[3]

tweet_corpus = pd.read_json(tweets_path, lines=True)  # one JSON record per line
data_type_csv = pd.read_csv(data_type_path)
emotion_csv = pd.read_csv(emotion_path)

In [None]:
# Display the corpus exactly as it was loaded from the JSON-lines file.
tweet_corpus

In [None]:
# Pull the tweet id and text out of the nested `_source` records
# (each record is read as record['tweet']['tweet_id'] / record['tweet']['text']).
# The column is iterated once instead of doing a `tweet_corpus['_source'][i]`
# lookup per row, and the new columns are built as lists with one entry per row.
# Building them from a dict keyed by tweet_id would collapse repeated ids and
# leave the columns shorter than the frame.
tweet_records = [source['tweet'] for source in tweet_corpus['_source']]
tweet_ids = [tweet['tweet_id'] for tweet in tweet_records]
tweet_texts = [tweet['text'] for tweet in tweet_records]

# tweet_id -> text lookup, kept under its existing name for any later use.
id_based_corpus = dict(zip(tweet_ids, tweet_texts))

tweet_corpus['id'] = tweet_ids
tweet_corpus['text'] = tweet_texts
# Drop the raw columns; only `id` and `text` are used from here on.
tweet_corpus = tweet_corpus.drop(['_index', '_score', '_crawldate', '_type', '_source'], axis=1)

In [None]:
# Ids belonging to each partition, according to the identification table.
is_train = data_type_csv['identification'] == 'train'
is_test = data_type_csv['identification'] == 'test'
tweet_train_id = data_type_csv['tweet_id'][is_train]
tweet_test_id = data_type_csv['tweet_id'][is_test]

# Keep the tweets of each partition and renumber their rows from 0.
tweet_corpus_train = tweet_corpus[tweet_corpus['id'].isin(tweet_train_id)].reset_index(drop=True)
tweet_corpus_test = tweet_corpus[tweet_corpus['id'].isin(tweet_test_id)].reset_index(drop=True)

In [None]:
# Attach the emotion label to each training tweet by joining on the tweet id,
# then drop the duplicate key column that the merge brings along.
train_with_emotion = pd.merge(
    tweet_corpus_train,
    emotion_csv,
    left_on="id",
    right_on="tweet_id",
    sort=False,
)
tweet_corpus_train = train_with_emotion.drop(columns=['tweet_id'])

# Remove every literal "<LH>" substring from the tweet text.
tweet_corpus_train['text'] = tweet_corpus_train['text'].map(lambda text: text.replace("<LH>", ""))

In [None]:
# Presumably the emotion classes that occur in the `emotion` column — TODO confirm.
# NOTE(review): `tags` is not referenced anywhere else in the visible notebook.
tags = ['joy','anticipation','trust','disgust','anger','surprise','fear','sadness']
# Work on a copy of the label column rather than on the column itself.
y_train = tweet_corpus_train['emotion'].copy()
# Display the label column.
tweet_corpus_train['emotion']

In [None]:
# Display the test partition (tweets whose id is marked 'test').
tweet_corpus_test

In [None]:
# Save the test partition to test_data.json in the current working directory.
tweet_corpus_test.to_json('test_data.json')

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Shared encoder instance; it is fitted on the training emotions in a later cell.
label_encoder = LabelEncoder()

def label_encode(le, labels):
    """One-hot encode `labels` with an already-fitted label encoder.

    Args:
        le: Fitted encoder exposing `transform` and `classes_`
            (the notebook passes a sklearn `LabelEncoder`).
        labels: Iterable of class labels known to `le`.

    Returns:
        The array produced by `to_categorical`: one row per label and one
        column per class known to `le`.
    """
    enc = le.transform(labels)
    # Pin the width to the encoder's full class count. Left to itself,
    # to_categorical infers the width from the largest index present, so a batch
    # that lacks the highest-numbered class would come back with too few columns.
    return to_categorical(enc, num_classes=len(le.classes_))

def label_decode(le, one_hot_label):
    """Map one-hot (or per-class score) rows back to their original labels.

    Args:
        le: Encoder exposing `inverse_transform`.
        one_hot_label: 2-D array-like; the position of the largest value
            along axis 1 is taken as the class index of each row.

    Returns:
        Whatever `le.inverse_transform` returns for those class indices.
    """
    return le.inverse_transform(np.argmax(one_hot_label, axis=1))



In [None]:
# Display the training partition with its merged emotion labels.
tweet_corpus_train

In [None]:
# Fit the shared encoder on the training emotions and one-hot encode them.
emotion_labels = tweet_corpus_train['emotion']
label_encoder.fit(emotion_labels)
emotion_one_hot = label_encode(label_encoder, emotion_labels)

# Store one one-hot vector per row in a `label` column (object dtype, so each
# cell holds a whole vector).
label = emotion_one_hot.astype(object)
tweet_corpus_train['label'] = pd.Series([row for row in label])

In [None]:
MAX_LEN = 64  # fixed length (in token ids) of every encoded row


def roberta_encode(texts, tokenizer):
    """Turn raw texts into fixed-length input arrays for the model.

    Each text is tokenized, cut to at most MAX_LEN - 2 tokens, wrapped with the
    ids 0 and 2, and written into a row pre-filled with the id 1. The attention
    mask is 1 over the written positions and 0 elsewhere.
    (Presumably 0 / 2 / 1 are RoBERTa's start, end and padding ids — TODO confirm.)

    Args:
        texts: Sized iterable of strings.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.

    Returns:
        dict of int32 arrays of shape (len(texts), MAX_LEN) under the keys
        'input_word_ids', 'input_mask' and 'input_type_ids' (all zeros).
    """
    shape = (len(texts), MAX_LEN)
    word_ids = np.ones(shape, dtype='int32')
    mask = np.zeros(shape, dtype='int32')
    type_ids = np.zeros(shape, dtype='int32')  # Not used in text classification

    for row, text in enumerate(texts):
        # Tokenize and truncate, leaving room for the two wrapping ids.
        tokens = tokenizer.tokenize(text)[:MAX_LEN - 2]
        ids = tokenizer.convert_tokens_to_ids(tokens)

        # Number of positions actually written in this row.
        used = min(len(ids) + 2, MAX_LEN)

        word_ids[row, :used] = np.asarray([0] + ids + [2], dtype='int32')
        mask[row, :used] = 1

    return {
        'input_word_ids': word_ids,
        'input_mask': mask,
        'input_type_ids': type_ids,
    }

In [None]:
# Emotion labels as a NumPy array, which the next cell recodes in place.
# The array is read straight from the DataFrame column so the cell can be re-run
# (calling .to_numpy() on an already-converted array fails), and copy=True
# guarantees a writable array that does not share memory with the frame.
y_train = tweet_corpus_train['emotion'].to_numpy(copy=True)

In [None]:
# Transform categories into numbers
category_to_id = {}
category_to_name = {}

# Ids are handed out in order of first appearance; y_train is overwritten in
# place, element by element, with the id of its category.
for position, category in tqdm(enumerate(y_train)):
    category_id = category_to_id.setdefault(category, len(category_to_id))
    category_to_name.setdefault(category_id, category)
    y_train[position] = category_id

# Display dictionary
category_to_name

In [None]:
# Display the labels after they were recoded to integer category ids.
y_train

In [None]:
# Store the integer category id of every training tweet next to its text.
tweet_corpus_train['label_by_id'] = y_train

In [None]:
# Display the training frame, now including the `label` and `label_by_id` columns.
tweet_corpus_train

In [None]:
# Save the labelled training frame to train_data.json in the current working directory.
tweet_corpus_train.to_json('train_data.json')

In [None]:
# Read the file written by the previous cell back in, to check that it loads.
trial = pd.read_json('train_data.json')

In [None]:
# Display the frame reloaded from train_data.json.
trial

In [None]:
from transformers import RobertaTokenizer, TFRobertaModel

# Pretrained checkpoint name; build_model also reads it when loading the model weights.
MODEL_NAME = 'roberta-base'

# Tokenizer for that checkpoint; passed to roberta_encode in a later cell.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled tweets for validation.
# Only the `text` column is split and encoded: roberta_encode iterates over its
# argument, and iterating a whole DataFrame yields its column names instead of
# the tweets, so the model would never see any tweet text.
# Labels are read from the `label_by_id` column (the integer ids stored earlier)
# rather than from y_train, so re-running this cell does not try to split an
# array that was already split.
train_texts, val_texts, y_train, y_test = train_test_split(
    tweet_corpus_train['text'],
    tweet_corpus_train['label_by_id'],
    test_size=0.3,
    random_state=777,  # random_state to reproduce results
)

# Token-id / mask arrays for the model inputs.
X_train = roberta_encode(train_texts, tokenizer)
X_test = roberta_encode(val_texts, tokenizer)

# Integer class ids, as expected by the sparse categorical loss.
y_train = np.asarray(y_train, dtype='int32')
y_test = np.asarray(y_test, dtype='int32')


In [None]:
import tensorflow as tf
import timeit
# Alternative policy, currently disabled:
# tf.keras.mixed_precision.set_global_policy('mixed_float16')
# Use float32 as the global Keras dtype policy.
tf.keras.mixed_precision.set_global_policy('float32')

In [None]:
def build_model(n_categories):
    """Build and compile the RoBERTa-based classifier inside the strategy scope.

    Reads the notebook globals `strategy`, `MAX_LEN`, `MODEL_NAME` and
    `TFRobertaModel`.

    Args:
        n_categories: Number of output classes (units of the final softmax layer).

    Returns:
        A compiled `tf.keras.Model` taking the three inputs 'input_word_ids',
        'input_mask' and 'input_type_ids' (each of length MAX_LEN) and producing
        one probability per category. It is compiled with sparse categorical
        cross-entropy, so targets are integer class ids.
    """
    with strategy.scope():
        input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
        input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
        input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

        # Import RoBERTa model from HuggingFace
        roberta_model = TFRobertaModel.from_pretrained(MODEL_NAME)
        x = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)

        # Huggingface transformers have multiple outputs, embeddings are the first one,
        # so let's slice out the first position
        x = x[0]

        # Classification head: dropout, flatten, one hidden layer, then the softmax output.
        x = tf.keras.layers.Dropout(0.1)(x)
        x = tf.keras.layers.Flatten()(x)
        x = tf.keras.layers.Dense(256, activation='relu')(x)
        x = tf.keras.layers.Dense(n_categories, activation='softmax')(x)

        model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
        model.compile(
            # `learning_rate` is the supported keyword; `lr` is a deprecated alias
            # that newer Keras optimizers no longer accept.
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy'])

        return model

In [None]:
# Number of distinct categories found while recoding the labels to ids.
n_categories = len(category_to_name)
n_categories

In [None]:
# Build the classifier and print its layer summary.
# build_model opens strategy.scope() itself as well, so this outer scope is nested around it.
with strategy.scope():
    model = build_model(n_categories)
    model.summary()

In [None]:
# Train on the encoded training split for 3 epochs in batches of 20.
# The held-out split (X_test, y_test) is used as validation data.
with strategy.scope():
    print('Training...')
    history = model.fit(X_train,
                        y_train,
                        epochs=3,
                        batch_size=20,
                        verbose=1,
                        validation_data=(X_test, y_test))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup


# BERT tokenizer used to split tweets into tokens for the bag-of-words features.
# NOTE(review): this rebinds the global `tokenizer`, which held the RoBERTa
# tokenizer until now — re-running the RoBERTa encoding cell after this one
# would pick up the BERT tokenizer instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)

# Bag-of-words over the training tweets, capped at the 300 most frequent tokens.
#  - The vocabulary cap is `max_features`; CountVectorizer has no `features` argument.
#  - `tokenizer=` must be a callable that maps a string to a list of tokens, so
#    the tokenizer's `tokenize` method is passed rather than the tokenizer object.
BOW = CountVectorizer(max_features=300, tokenizer=tokenizer.tokenize)
BOW.fit(tweet_corpus_train['text'])
train_data = BOW.transform(tweet_corpus_train['text'])

In [None]:
import tensorflow as tf
import timeit

# Report the GPU device name as seen by TensorFlow.
device_name = tf.test.gpu_device_name()
print(f'Found GPU at: {device_name}')


In [None]:
from keras.callbacks import CSVLogger

# Logger that writes the per-epoch training metrics to training_log.csv.
csv_logger = CSVLogger('training_log.csv')

# training setting
epochs = 25
batch_size = 1
# training!
# NOTE(review): `model` is whatever model is currently in scope. The model built
# earlier in this notebook takes three token-id inputs and is compiled with a
# sparse (integer-label) loss, whereas this call feeds bag-of-words features and
# one-hot labels — confirm which model this cell is meant to train.
tf.config.list_physical_devices('GPU')
history = model.fit(train_data, emotion_one_hot,
                    epochs=epochs,
                    batch_size=batch_size,
                    # Attach the logger; it was created above but never passed to fit,
                    # so no log file was being written.
                    callbacks=[csv_logger])