In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow as tf

from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [3]:
df_train['text_combo'] = df_train['location'].astype(str) + " : " + df_train['keyword'].astype(str) + " : " + df_train['text'].astype(str)
df_test['text_combo'] = df_test['location'].astype(str) + " : " + df_test['keyword'].astype(str) + " : " + df_test['text'].astype(str)


In [4]:
df_train_subset = df_train[["text_combo", "target"]].copy()
df_train_subset.rename(columns = {'text_combo':'text'}, inplace = True)
X_train, X_test = train_test_split(df_train_subset, test_size=0.05, random_state=0, stratify=df_train["target"])

In [22]:
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'
BATCH_SIZE = 16
N_EPOCHS = 3

In [19]:
#HuggingFace models come with their own tokenizes, suitable for what input it expects
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

In [20]:
#tokenize the text
train_encodings = tokenizer(list(X_train["text"]),
                            truncation=True, 
                            padding=True)

test_encodings = tokenizer(list(X_test["text"]),
                           truncation=True, 
                           padding=True)


In [10]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                    list(X_train["target"].values)))

test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                    list(X_test["target"].values)))

In [23]:
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)#chose the optimizer
#optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)#define the loss function 
optimizer = tf.keras.optimizers.Adam(learning_rate=18e-6)#define the loss function 
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)#build the model

model.compile(optimizer=optimizer,
              loss=losss,
              metrics=['accuracy'])

checkpoint_filepath = 'mycheckpoint'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
    save_best_only=True)

model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE),
          epochs=N_EPOCHS,
          batch_size=BATCH_SIZE,
          callbacks=[model_checkpoint_callback],
          validation_data=test_dataset.shuffle(len(X_train)).batch(1))

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 1: val_loss improved from inf to 0.36113, saving model to mycheckpoint
Epoch 2/3
Epoch 2: val_loss did not improve from 0.36113
Epoch 3/3
Epoch 3: val_loss did not improve from 0.36113


<keras.callbacks.History at 0x1c1a4823310>

In [12]:
#now to load the saved best model weights
model.load_weights(checkpoint_filepath)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1bb8d2b0be0>

In [13]:
def predict_proba(text_list, model, tokenizer):
    encodings = tokenizer(text_list, 
                          #max_length=MAX_LEN, 
                          truncation=True, 
                          padding=True)

    #somehow these API's never read very intuitively :/ 
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))
    #the batch(1) seems to be required for the call..
    preds = model.predict(dataset.batch(1)).logits  
    
    #transform to array with probabilities
    res = tf.nn.softmax(preds, axis=1).numpy()      
    
    return res

In [14]:
#test_texts = list(df_test["text"])
test_texts = list(df_test["text_combo"])

In [15]:
preds = predict_proba(test_texts, model, tokenizer)



In [16]:
df_submission = pd.DataFrame()
df_submission["id"] = df_test["id"]
df_submission["target"] = preds[:, 1] = 0.5
df_submission["target"] = df_submission["target"].astype(int)
df_submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [17]:
df_submission.to_csv("distilbert_submission.csv", index=False)