In [233]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path 

import os

import torch
import torch.optim as optim

import random 

# fastai
from fastai import *
from fastai.text import *
from fastai.callbacks import *

# transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

In [234]:
import fastai
import transformers
print('fastai version :', fastai.__version__)
print('transformers version :', transformers.__version__)

fastai version : 1.0.60
transformers version : 2.7.0


In [235]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/zindi-weekend-vaccine/Train.csv
/kaggle/input/zindi-weekend-vaccine/SampleSubmission.csv
/kaggle/input/zindi-weekend-vaccine/Test.csv


In [236]:
DATA_ROOT = Path("..") / "/kaggle/input/zindi-weekend-vaccine"
train = pd.read_csv(DATA_ROOT / 'Train.csv')
test = pd.read_csv(DATA_ROOT / 'Test.csv')
print(train.shape,test.shape)
train.head()

(10001, 4) (5177, 2)


Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [237]:
train=train[train['label'].notnull()]

In [238]:
train.shape

(10000, 4)

In [239]:
train['label']=train['label'].apply(lambda x:int(x))

In [240]:
train['label'] = train['label'].map({-1: 0, 0: 1,1:2})

In [241]:

train.safe_text = train.safe_text.str.replace(r'http(\S)+', r'')
train.safe_text = train.safe_text.str.replace(r'http ...', r'')
train.safe_text = train.safe_text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
train.safe_text = train.safe_text.str.replace(r'@[\S]+',r'')

# Remove non-ascii words or characters
#train.safe_text = [''.join([i if ord(i) < 128 else '' for i in safe_text]) for safe_text in train.safe_text]
train.safe_text = train.safe_text.str.replace(r'_[\S]?',r'')

# Remove extra space
train.safe_text = train.safe_text.str.replace(r'[ ]{2, }',r' ')

# Remove &, < and >
train.safe_text = train.safe_text.str.replace(r'&amp;?',r'and')
train.safe_text = train.safe_text.str.replace(r'&lt;',r'<')
train.safe_text = train.safe_text.str.replace(r'&gt;',r'>')

# Insert space between words and punctuation marks
train.safe_text = train.safe_text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
train.safe_text = train.safe_text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

# Lowercased and strip
#train.safe_text = train.safe_text.str.lower()
train.safe_text = train.safe_text.str.strip()


In [242]:

test.safe_text = test.safe_text.str.replace(r'http(\S)+', r'')
test.safe_text = test.safe_text.str.replace(r'http ...', r'')
test.safe_text = test.safe_text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
test.safe_text = test.safe_text.str.replace(r'@[\S]+',r'')

# Remove non-ascii words or characters
#test.safe_text = [''.join([i if ord(i) < 128 else '' for i in safe_text]) for safe_text in test.safe_text]
test.safe_text = test.safe_text.str.replace(r'_[\S]?',r'')

# Remove extra space
test.safe_text = test.safe_text.str.replace(r'[ ]{2, }',r' ')

# Remove &, < and >
test.safe_text = test.safe_text.str.replace(r'&amp;?',r'and')
test.safe_text = test.safe_text.str.replace(r'&lt;',r'<')
test.safe_text = test.safe_text.str.replace(r'&gt;',r'>')

# Insert space between words and punctuation marks
test.safe_text = test.safe_text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
test.safe_text = test.safe_text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

# Lowercased and strip
#test.safe_text = test.safe_text.str.lower()
test.safe_text = test.safe_text.str.strip()


In [243]:
train['safe_text'][0]

'Me and The Big Homie meanboy3000 # MEANBOY # MB # MBS # MMR # STEGMANLIFE @ Stegman St . < url >'

In [244]:
train['safe_text'][1]

"I ' m 100 % thinking of devoting my career to proving autism isn ' t caused by vaccines due to the IDIOTIC posts I ' ve seen about World Autism Day"

In [245]:
train.head(2)

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me and The Big Homie meanboy3000 # MEANBOY # M...,0.0,1.0
1,E3303EME,I ' m 100 % thinking of devoting my career to ...,1.0,1.0


In [246]:
train['text_length'] = [len(safe_text.split(' ')) for safe_text in train.safe_text]

In [247]:
train = train[train['text_length']>0]
train = train.drop_duplicates(subset=['safe_text'])

In [248]:
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)
}

In [249]:
# Parameters
seed = 15
use_fp16 = False
bs = 32

model_type = 'roberta'
pretrained_model_name = 'roberta-base'


In [250]:
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [251]:
model_class.pretrained_model_archive_map.keys()

dict_keys(['roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base', 'roberta-base-openai-detector', 'roberta-large-openai-detector'])

In [252]:
def seed_all(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value) 
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        
        

In [253]:
seed_all(seed)

In [254]:
class TransformersBaseTokenizer(BaseTokenizer):
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = pretrained_tokenizer.max_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs): 
        return self

    def tokenizer(self, t:str) -> List[str]:
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        if self.model_type in ['roberta']:
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:self.max_seq_len - 2]
            tokens = [CLS] + tokens + [SEP]
        else:
            tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
            if self.model_type in ['xlnet']:
                tokens = tokens + [SEP] +  [CLS]
            else:
                tokens = [CLS] + tokens + [SEP]
        return tokens

In [255]:
transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

In [256]:
class TransformersVocab(Vocab):
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super(TransformersVocab, self).__init__(itos = [])
        self.tokenizer = tokenizer
    
    def numericalize(self, t:Collection[str]) -> List[int]:
        return self.tokenizer.convert_tokens_to_ids(t)
        #return self.tokenizer.encode(t)

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        nums = np.array(nums).tolist()
        return sep.join(self.tokenizer.convert_ids_to_tokens(nums)) if sep is not None else self.tokenizer.convert_ids_to_tokens(nums)
    
    def __getstate__(self):
        return {'itos':self.itos, 'tokenizer':self.tokenizer}

    def __setstate__(self, state:dict):
        self.itos = state['itos']
        self.tokenizer = state['tokenizer']
        self.stoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.itos)})

In [257]:
transformer_vocab =  TransformersVocab(tokenizer = transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)

tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)

transformer_processor = [tokenize_processor, numericalize_processor]

In [258]:
pad_first = bool(model_type in ['xlnet'])
pad_idx = transformer_tokenizer.pad_token_id

In [259]:
tokens = transformer_tokenizer.tokenize('This is Lawrence trial')
print(tokens)
ids = transformer_tokenizer.convert_tokens_to_ids(tokens)
print(ids)
transformer_tokenizer.convert_ids_to_tokens(ids)


['This', 'Ġis', 'ĠLawrence']
[713, 16, 6226]


['This', 'Ġis', 'ĠLawrence']

In [260]:
train.columns

Index(['tweet_id', 'safe_text', 'label', 'agreement', 'text_length'], dtype='object')

In [261]:
databunch = (TextList.from_df(train, cols='safe_text', processor=transformer_processor)
             .split_by_rand_pct(0.1,seed=seed)
             .label_from_df(cols= 'label')
             .add_test(test)
             .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

In [262]:
print('[CLS] token :', transformer_tokenizer.cls_token)
print('[SEP] token :', transformer_tokenizer.sep_token)
print('[PAD] token :', transformer_tokenizer.pad_token)
databunch.show_batch()

[CLS] token : <s>
[SEP] token : </s>
[PAD] token : <pad>


text,target
<s> Ġç Ĺ ħ é Ļ ¢ å® Ł ç ¿ Ĵ è¡ Į ãģı ãģ® ãģ« MM R ãģ¨ æ° ´ ç ĸ ± ç ĺ ¡ ãģ®æ Ĭ Ĺ ä½ ĵ ãĤĴ èª ¿ ãģ ¹ ãģŁ ãĤī Ġ$ Ġ550 ãģ® è « ĭ æ ± Ĥ ãģĮ æĿ ¥ ãģŁ Ġ ãĢĤ Ċ (( (( ï ¼ Ľ Ġï ¾ Ł Ð Ķ ï ¾ Ł Ġ),0.0
<s> ĠFri Ġ12 Ġnoon Ġmeet ĠðŁ İ ī ðŁ Ĵ ¥ ĠGolden ĠGate ĠBridge ĠðŁ İ Ī Ġto Ġsave Ġbabies ĠðŁĳ ¶ ðŁ Ĵ ĵ Ġand Ġstop ĠðŁ Ĵ Ģ ðŁ Ļ Ģ ðŁ Ĵ ī ĠVacc inations ĠðŁ Ĵ ī ðŁ Ķ ª ðŁ Ķ « ĠSB Ġ277 Ġwear Ġyellow Ġ ĠðŁĺ Ĭ ðŁ İ Ī â ļ ¡ ï¸ı ðŁĺ İ ðŁĺ ĺ ðŁĳ ¨ âĢ į ðŁĳ,-1.0
<s> ĠðŁ Ĵ ¥ ðŁ İ § ðŁ İ ¼ ðŁ İ µ ðŁ İ ¶ ðŁ ı ¢ ðŁ Ĵ Ģ ðŁĳ Ĥ ðŁ Ĵ ¥ < Ġuser Ġ> Ġ# Ġmm r Ġ# Ġmix master rod Ġ# Ġd cd j Ġ# Ġscratching Al ittle bit Ġ# Ġmad h Ġ Ġ@ ĠMix Master Rod Ġ' Ġs ĠUp stairs ĠLounge Ġ< Ġurl Ġ> </s>,0.0
<s> Ġ< Ġuser Ġ> ĠEst am os Ġvac un ados Ġcontra Ġmuch as Ġcos as Ġper o Ġa Ãº n Ġas ÃŃ Ġhay Ġque Ġp oner se Ġref uer zos Ġde Ġc i ert as Ġvac un as Ġ. ĠRec ient ement e Ġme Ġp use Ġla ĠMMR Ġ. </s>,0.0
<s> Ġ< Ġuser Ġ> Ġha br Ã¡ Ġun Ġsoft Ġreset Ġy Ġtend r Ã¡s Ġque Ġj ugar Ġpart idas Ġde Ġpos icion am ient o Ġde Ġac uer do Ġa Ġtu Ġmm r Ġ. ĠNo Ġsab r ÃŃa Ġdec ir te Ġqu Ã© Ġte Ġsu ced er Ã¡ Ġ. </s>,0.0


In [263]:
print('[CLS] id :', transformer_tokenizer.cls_token_id)
print('[SEP] id :', transformer_tokenizer.sep_token_id)
print('[PAD] id :', pad_idx)
test_one_batch = databunch.one_batch()[0]
print('Batch shape : ',test_one_batch.shape)
print(test_one_batch)

[CLS] id : 0
[SEP] id : 2
[PAD] id : 1
Batch shape :  torch.Size([32, 135])
tensor([[    0, 48283,  6800,  ...,  1437, 45682,     2],
        [    0,   132,   282,  ...,     1,     1,     1],
        [    0,    44,    48,  ...,     1,     1,     1],
        ...,
        [    0, 28696,  3018,  ...,     1,     1,     1],
        [    0, 28696,  3018,  ...,     1,     1,     1],
        [    0,  1491,     7,  ...,     1,     1,     1]])


In [264]:
class CustomTransformerModel(nn.Module):
    def __init__(self, transformer_model: PreTrainedModel):
        super(CustomTransformerModel,self).__init__()
        self.transformer = transformer_model
        
    def forward(self, input_ids, attention_mask=None):
        
        attention_mask = (input_ids!=pad_idx).type(input_ids.type()) 
        
        logits = self.transformer(input_ids,
                                  attention_mask = attention_mask)[0]   
        return logits

In [265]:
config = config_class.from_pretrained(pretrained_model_name)
config.num_labels= 3
config.use_bfloat16 = use_fp16
print(config)

NameError: name 'num_labels' is not defined

In [None]:
transformer_model = model_class.from_pretrained(pretrained_model_name, config = config)
custom_transformer_model = CustomTransformerModel(transformer_model = transformer_model)

In [None]:
from fastai.callbacks import *
from transformers import AdamW
from functools import partial

CustomAdamW = partial(AdamW, correct_bias=False)

learner = Learner(databunch, 
                  custom_transformer_model, 
                  opt_func = CustomAdamW, 
                  metrics=[accuracy, error_rate])

learner.callbacks.append(ShowGraph(learner))

if use_fp16: learner = learner.to_fp16()

In [None]:
print(learner.model)

In [None]:

list_layers = [learner.model.transformer.roberta.embeddings,
              learner.model.transformer.roberta.encoder.layer[0],
              learner.model.transformer.roberta.encoder.layer[1],
              learner.model.transformer.roberta.encoder.layer[2],
              learner.model.transformer.roberta.encoder.layer[3],
              learner.model.transformer.roberta.encoder.layer[4],
              learner.model.transformer.roberta.encoder.layer[5],
              learner.model.transformer.roberta.encoder.layer[6],
              learner.model.transformer.roberta.encoder.layer[7],
              learner.model.transformer.roberta.encoder.layer[8],
              learner.model.transformer.roberta.encoder.layer[9],
              learner.model.transformer.roberta.encoder.layer[10],
              learner.model.transformer.roberta.encoder.layer[11],
               
              learner.model.transformer.roberta.pooler]
              


In [None]:
learner.split(list_layers)
num_groups = len(learner.layer_groups)
print('Learner split in',num_groups,'groups')
print(learner.layer_groups)


In [None]:
learner.save('untrain')

In [None]:
seed_all(seed)
learner.load('untrain');

In [None]:
learner.freeze_to(-1)

In [None]:
learner.summary()

In [None]:
learner.lr_find()

In [None]:
learner.recorder.plot(skip_end=10,suggestion=True)

In [None]:
learner.fit_one_cycle(1,max_lr=2e-54,moms=(0.8,0.9))

In [None]:
learner.save('first_cycle')

In [None]:
seed_all(seed)
learner.load('first_cycle');

In [None]:
learner.freeze_to(-2)

In [None]:
lr = 1e-5

In [None]:
learner.fit_one_cycle(3, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))

In [None]:
learner.save('second_cycle')

In [None]:
seed_all(seed)
learner.load('second_cycle');

In [None]:
learner.freeze_to(-3)

In [None]:
learner.fit_one_cycle(3, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))

In [None]:
learner.save('third_cycle')

In [None]:
seed_all(seed)
learner.load('third_cycle');

In [None]:
learner.unfreeze()

In [None]:
learner.fit_one_cycle(3, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))

In [None]:
learner.predict('me and the big homie meanboy3000 # meanboy # mb # mbs # mmr # stegmanlife @ stegman st . < url >')#1

In [None]:
learner.predict("i ' m 100 % thinking of devoting my career to proving autism isn ' t caused by vaccines due to the idiotic posts i ' ve seen about world autism day")#2

In [None]:
learner.export(file = 'transformer.pkl');

In [None]:
path = '/kaggle/working'
export_learner = load_learner(path, file = 'transformer.pkl')

In [None]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
  
    preds = learner.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in databunch.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    return preds[reverse_sampler, :]

test_preds = get_preds_as_nparray(DatasetType.Test)

In [None]:
sub_df=test[['tweet_id']]

In [None]:
sub_df['label']=np.nan

In [None]:
sample_submission = sub_df
sample_submission['label'] = np.argmax(test_preds,axis=1)
sample_submission['label'] = sample_submission['label'].map({0:-1, 1: 0,2:1})
sample_submission.to_csv("allsub7.csv", index=False)