In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
import pandas as pd 
import numpy as np
import spacy
import re 

#### Load data

In [3]:
df_train = pd.read_csv('gdrive/MyDrive/train.csv')
df_test = pd.read_csv('gdrive/MyDrive/test.csv')

In [4]:
## nlp from  spacy lib 
nlp = spacy.load('en_core_web_sm') 

###### Drop duplicates from train data

In [5]:
print('shape = %s'%df.shape[0])
df.drop_duplicates(['text', 'target'], inplace=True, ignore_index=True)
print('shape = %s'%df.shape[0])

shape = 7613
shape = 7521


In [6]:
new_duplicates = df[df.duplicated(['keyword', 'text'], keep=False)]

print(f'Train Duplicate Entries (keyword, text): {len(new_duplicates)}')
new_duplicates[['text', 'target']].sort_values(by='text')

Train Duplicate Entries (keyword, text): 30


Unnamed: 0,text,target
4238,#Allah describes piling up #wealth thinking it...,0
4253,#Allah describes piling up #wealth thinking it...,1
4171,#foodscare #offers2go #NestleIndia slips into ...,1
4193,#foodscare #offers2go #NestleIndia slips into ...,0
2802,.POTUS #StrategicPatience is a strategy for #G...,1
2803,.POTUS #StrategicPatience is a strategy for #G...,0
4554,CLEARED:incident with injury:I-495 inner loop...,0
4535,CLEARED:incident with injury:I-495 inner loop...,1
4182,Caution: breathing may be hazardous to your he...,1
4185,Caution: breathing may be hazardous to your he...,0


In [7]:
df.drop([4253, 4193, 2802, 4554, 4182, 3212, 4249, 4259, 6535, 4319, 4239, 606, 3936, 6018, 5573], inplace=True)

In [8]:
df = df.reset_index(drop=True)

### Preprocessing data

In [9]:
#puncts list
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\xa0', '\t',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', '\x89', 'û', 'ì', 'å', 'ò']
#data for preprocessing for replace words in lines
mispell_dict = {"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"couldnt" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"doesnt" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"havent" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"shouldnt" : "should not",
"that's" : "that is",
"thats" : "that is",
"there's" : "there is",
"theres" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"theyre":  "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not"}
lookup_dict = {
  'abt' : 'about',
  'afaik' : 'as far as i know',
  'bc' : 'because',
  'bfn' : 'bye for now',
  'bgd' : 'background',
  'bh' : 'blockhead',
  'br' : 'best regards',
  'btw' : 'by the way',
  'cc': 'carbon copy',
  'chk' : 'check',
  'dam' : 'do not annoy me',
  'dd' : 'dear daughter',
  'df': 'dear fiance',
  'ds' : 'dear son',
  'dyk' : 'did you know',
  'em': 'email',
  'ema' : 'email address',
  'ftf' : 'face to face',
  'fb' : 'facebook',
  'ff' : 'follow friday', 
  'fotd' : 'find of the day',
  'ftw': 'for the win',
  'fwiw' : 'for what it is worth',
  'gts' : 'guess the song',
  'hagn' : 'have a good night',
  'hand' : 'have a nice day',
  'hotd' : 'headline of the day',
  'ht' : 'heard through',
  'hth' : 'hope that helps',
  'ic' : 'i see',
  'icymi' : 'in case you missed it',
  'idk' : 'i do not know',
  'ig': 'instagram',
  'iirc' : 'if i remember correctly',
  'imho' : 'in my humble opinion',
  'imo' : 'in my opinion',
  'irl' : 'in real life',
  'iwsn' : 'i want sex now',
  'jk' : 'just kidding',
  'jsyk' : 'just so you know',
  'jv' : 'joint venture',
  'kk' : 'cool cool',
  'kyso' : 'knock your socks off',
  'lmao' : 'laugh my ass off',
  'lmk' : 'let me know', 
  'lo' : 'little one',
  'lol' : 'laugh out loud',
  'mm' : 'music monday',
  'mirl' : 'meet in real life',
  'mrjn' : 'marijuana',
  'nbd' : 'no big deal',
  'nct' : 'nobody cares though',
  'njoy' : 'enjoy',
  'nsfw' : 'not safe for work',
  'nts' : 'note to self',
  'oh' : 'overheard',
  'omg': 'oh my god',
  'oomf' : 'one of my friends',
  'orly' : 'oh really',
  'plmk' : 'please let me know',
  'pnp' : 'party and play', 
  'qotd' : 'quote of the day',
  're' : 'in reply to in regards to',
  'rtq' : 'read the question',
  'rt' : 'retweet',
  'sfw' : 'safe for work',
  'smdh' : 'shaking my damn head', 
  'smh' : 'shaking my head',
  'so' : 'significant other',
   'srs' : 'serious',
  'tftf' : 'thanks for the follow',
  'tftt' : 'thanks for this tweet',
  'tj' : 'tweetjack',
  'tl' : 'timeline',
  'tldr' : 'too long did not read',
  'tmb' : 'tweet me back',
  'tt' : 'trending topic',
  'ty' : 'thank you',
  'tyia' : 'thank you in advance',
  'tyt' : 'take your time',
  'tyvw' : 'thank you very much',
  'w': 'with', 
   'wtv' : 'whatever',
  'ygtr' : 'you got that right',
  'ykwim' : 'you know what i mean',
  'ykyat' : 'you know you are addicted to',
  'ymmv' : 'your mileage may vary',
  'yolo' : 'you only live once',
  'yoyo' : 'you are on your own',
  'yt': 'youtube',
  'yw' : 'you are welcome',
  'zomg' : 'oh my god to the maximum'
}

In [10]:

def replace_reduce(text):
  words = text.split() 
  abbrevs_removed = [] 
    
  for i in words:
      if i in lookup_dict:
          i = lookup_dict[i]
      abbrevs_removed.append(i)
          
  return ' '.join(abbrevs_removed)

In [None]:
def lemmatize_text(text, nlp=nlp):
    doc = nlp(text)    
    lemma_sent = [i.lemma_ for i in doc if not i.is_stop]    
    
    return ' '.join(lemma_sent) 

###### Train

In [32]:
df_train.text = df_train.text.str.replace('\n', ' ')
#text lower
df_train.text = df_train.text.str.lower()
# replace 2
df_train.text = df_train.text.apply(replace_reduce)
# replace 1
for before, after in mispell_dict.items():
  df_train.text = df_train.text.str.replace(before, after)
#text lower
df_train.text = df_train.text.str.lower()
#del url
df_train.text = [re.sub(r'http\S+', '', x) for x in df_train.text]
#del punct
for punct in puncts:
  df_train.text = df_train.text.str.replace(punct, '')
#del digits
df_train.text = [re.sub('\d+', '', line) for line in df_train.text]

Del stopwords

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [34]:
df_train['text'] = df_train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

Lematization text 

In [36]:
df_train.text = df_train.text.apply(lemmatize_text)

###### Test

In [None]:
df_test.text = df_test.text.str.replace('\n', ' ')
#text lower
df_test.text = df_test.text.str.lower()
# replace 2
df_test.text = df_test.text.apply(replace_reduce)
# replace 1
for before, after in mispell_dict.items():
  df_test.text = df_test.text.str.replace(before, after)
#text lower
df_test.text = df_test.text.str.lower()
#del url
df_test.text = [re.sub(r'http\S+', '', x) for x in df_test.text]
#del punct
for punct in puncts:
  df_test.text = df_test.text.str.replace(punct, '')
#del digits
df_test.text = [re.sub('\d+', '', line) for line in df_test.text]

Del word "new" from train and test data, because this word is frequent. 
This fact was find with WordCloud.

In [37]:
pattern_new = re.compile(r'\bnew\b')
df_train['text'] = df_train['text'].apply(lambda x: re.sub(pattern_new, '', x) if pd.isna(x) != True else x)
df_test['text'] = df_test['text'].apply(lambda x: re.sub(pattern_new, '', x) if pd.isna(x) != True else x)

#### Load Bert Classifier and Tokenizer from transformers

In [6]:
#!pip install transformers
import transformers
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, default_data_collator, Trainer, set_seed

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertForSequenceClassification.from_pretrained('bert-large-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [21]:
train = df.copy()

Tokenize every lines from train data. Also add attention mask for every lines

In [23]:
max_length = 64
train['input_ids'] = train['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['input_ids'])
train['attention_mask'] = train['text'].apply(lambda x: tokenizer(x, max_length=max_length, padding="max_length",)['attention_mask'])
train.rename(columns={'target': 'labels'}, inplace=True)
train.head()

Unnamed: 0,id,keyword,location,text,labels,input_ids,attention_mask
0,1,,,deeds reason earthquake allah forgive,1,"[101, 15616, 3114, 8372, 16455, 9641, 102, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,4,,,forest fire near la ronge sask canada,1,"[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ..."
2,5,,,resident ask shelter place notify officer evac...,1,"[101, 6319, 3198, 7713, 2173, 2025, 8757, 2961...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
3,6,,,people receive wildfire evacuation order calif...,1,"[101, 2111, 4374, 3748, 10273, 13982, 2344, 26...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
4,7,,,get send photo ruby alaska smoke wildfires pou...,1,"[101, 2131, 4604, 6302, 10090, 7397, 5610, 374...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."


In [24]:
train = train[['input_ids', 'attention_mask', 'labels']]
train.head()

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 15616, 3114, 8372, 16455, 9641, 102, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
1,"[101, 3224, 2543, 2379, 2474, 6902, 3351, 2187...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...",1
2,"[101, 6319, 3198, 7713, 2173, 2025, 8757, 2961...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",1
3,"[101, 2111, 4374, 3748, 10273, 13982, 2344, 26...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",1
4,"[101, 2131, 4604, 6302, 10090, 7397, 5610, 374...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",1


In [25]:
train_df = train[:-64].reset_index(drop=True)
valid_df = train[-64:].reset_index(drop=True)

In [27]:
from datasets import Dataset

train & valid data

In [28]:
train_ = Dataset.from_pandas(train_df)
valid_ = Dataset.from_pandas(valid_df)

Add args for Trainer

In [29]:
batch_size = 16

args = TrainingArguments(
    'final-start-for-competition',
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

data_collator = default_data_collator
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_,
    eval_dataset=valid_,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

training start

In [30]:
trainer.train()

***** Running training *****
  Num examples = 7442
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 8
  Total optimization steps = 174


Epoch,Training Loss,Validation Loss
0,No log,0.193373
1,No log,0.178898
2,No log,0.149149


***** Running Evaluation *****
  Num examples = 64
  Batch size = 16
Saving model checkpoint to nlp-getting-started/checkpoint-58
Configuration saved in nlp-getting-started/checkpoint-58/config.json
Model weights saved in nlp-getting-started/checkpoint-58/pytorch_model.bin
tokenizer config file saved in nlp-getting-started/checkpoint-58/tokenizer_config.json
Special tokens file saved in nlp-getting-started/checkpoint-58/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 64
  Batch size = 16
Saving model checkpoint to nlp-getting-started/checkpoint-116
Configuration saved in nlp-getting-started/checkpoint-116/config.json
Model weights saved in nlp-getting-started/checkpoint-116/pytorch_model.bin
tokenizer config file saved in nlp-getting-started/checkpoint-116/tokenizer_config.json
Special tokens file saved in nlp-getting-started/checkpoint-116/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 64
  Batch size = 16
Saving model checkpoint to nlp

TrainOutput(global_step=174, training_loss=0.40088833337542656, metrics={'train_runtime': 1730.8948, 'train_samples_per_second': 12.899, 'train_steps_per_second': 0.101, 'total_flos': 2598690606640128.0, 'train_loss': 0.40088833337542656, 'epoch': 3.0})