In [38]:
import pandas as pd

In [39]:
# Read the labelled training data from the working directory.
df = pd.read_csv('train.csv')

In [40]:
# Preview the loaded training frame (the notebook renders a truncated view).
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [41]:
# Store the label column as floating point (pandas maps `float` to float64).
# NOTE(review): presumably needed because the model below is built with a
# single output (num_labels=1) — confirm.
df['target'] = df.target.astype('float64')

In [42]:
# Replace every missing value in the frame with a literal placeholder so the
# string concatenation that builds the model input does not produce NaN.
df = df.fillna(value="Unavailable")

In [43]:
# Summary statistics for the object-dtype (string) columns only.
df.describe(include='object')

Unnamed: 0,keyword,location,text
count,7613,7613,7613
unique,222,3342,7503
top,Unavailable,Unavailable,11-Year-Old Boy Charged With Manslaughter of T...
freq,61,2533,10


In [44]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [45]:
# Build one string per row that carries the keyword, location and text fields,
# each introduced by a short tag.
df['input'] = (
    'KEY: ' + df['keyword']
    + '; LOC: ' + df['location']
    + '; TEXT: ' + df['text']
)

In [46]:
# Re-display the frame to check the new 'input' column.
df

Unnamed: 0,id,keyword,location,text,target,input
0,1,Unavailable,Unavailable,Our Deeds are the Reason of this #earthquake M...,1.0,KEY: Unavailable; LOC: Unavailable; TEXT: Our ...
1,4,Unavailable,Unavailable,Forest fire near La Ronge Sask. Canada,1.0,KEY: Unavailable; LOC: Unavailable; TEXT: Fore...
2,5,Unavailable,Unavailable,All residents asked to 'shelter in place' are ...,1.0,KEY: Unavailable; LOC: Unavailable; TEXT: All ...
3,6,Unavailable,Unavailable,"13,000 people receive #wildfires evacuation or...",1.0,"KEY: Unavailable; LOC: Unavailable; TEXT: 13,0..."
4,7,Unavailable,Unavailable,Just got sent this photo from Ruby #Alaska as ...,1.0,KEY: Unavailable; LOC: Unavailable; TEXT: Just...
...,...,...,...,...,...,...
7608,10869,Unavailable,Unavailable,Two giant cranes holding a bridge collapse int...,1.0,KEY: Unavailable; LOC: Unavailable; TEXT: Two ...
7609,10870,Unavailable,Unavailable,@aria_ahrary @TheTawniest The out of control w...,1.0,KEY: Unavailable; LOC: Unavailable; TEXT: @ari...
7610,10871,Unavailable,Unavailable,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1.0,KEY: Unavailable; LOC: Unavailable; TEXT: M1.9...
7611,10872,Unavailable,Unavailable,Police investigating after an e-bike collided ...,1.0,KEY: Unavailable; LOC: Unavailable; TEXT: Poli...


In [47]:
! pip install datasets



In [48]:
from datasets import Dataset, DatasetDict

# Wrap the pandas frame in a Hugging Face Dataset.
ds = Dataset.from_pandas(df)

In [49]:
# Show the Dataset's column names and row count.
ds

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'input'],
    num_rows: 7613
})

In [50]:
# Name of the pretrained checkpoint; used to load both the tokenizer and the model.
model_nm = 'microsoft/deberta-v3-small'

In [51]:
!pip install sentencepiece



In [52]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the tokenizer for the checkpoint named in `model_nm`.
tokz = AutoTokenizer.from_pretrained(model_nm)



In [53]:
# Sanity check: see how the tokenizer splits a sentence containing rare words
# into sub-word pieces.
tokz.tokenize("A platypus is an ornithorhynchus anatinus.")

['▁A',
 '▁platypus',
 '▁is',
 '▁an',
 '▁or',
 'ni',
 'tho',
 'rhynch',
 'us',
 '▁an',
 'at',
 'inus',
 '.']

In [54]:
def tok_func(x):
    """Tokenize the ``'input'`` entry of ``x`` with the notebook-level tokenizer ``tokz``.

    Returns whatever ``tokz`` returns for that entry.
    """
    texts = x['input']
    return tokz(texts)

In [55]:
# Tokenize every row; batched=True hands tok_func batches of rows rather than
# one row at a time.
tok_ds = ds.map(tok_func, batched=True)

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

In [56]:
# Confirm the tokenizer outputs were added as new columns.
tok_ds

Dataset({
    features: ['id', 'keyword', 'location', 'text', 'target', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7613
})

In [57]:
# Rename the label column from 'target' to 'labels' before the dataset is
# split and handed to the Trainer.
tok_ds = tok_ds.rename_column('target', 'labels')

In [58]:
# Hold out a quarter of the rows for validation; the fixed seed keeps the
# split reproducible across runs.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5709
    })
    test: Dataset({
        features: ['id', 'keyword', 'location', 'text', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1904
    })
})

In [67]:
import numpy as np

In [68]:
def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    The value is read from the off-diagonal entry ``[0][1]`` of the matrix
    produced by ``np.corrcoef``.
    """
    matrix = np.corrcoef(x, y)
    return matrix[0][1]


def corr_d(eval_pred):
    """Metric callback: unpack ``eval_pred`` into the two arguments of ``corr``.

    Returns a one-entry dict keyed ``'pearson'``.
    """
    pearson = corr(*eval_pred)
    return dict(pearson=pearson)

In [60]:
from transformers import TrainingArguments,Trainer

In [61]:
# Per-device training batch size; evaluation uses twice this value.
bs = 128
# Number of training epochs.
epochs = 4

In [62]:
# Learning rate passed to TrainingArguments (combined there with warmup and a
# cosine schedule).
lr = 8e-5

In [63]:
! pip install transformers[torch]



In [64]:
# Training configuration: checkpoints go to 'outputs', evaluation runs once per
# epoch, and external experiment reporting is switched off.
args = TrainingArguments(
    'outputs',
    # optimisation schedule
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    num_train_epochs=epochs,
    # batching: evaluation uses double the training batch size
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    # half-precision training
    fp16=True,
    evaluation_strategy="epoch",
    report_to='none',
)

In [65]:
# Load the pretrained checkpoint with a single-output classification head,
# then wire model, arguments, data splits, tokenizer and metric into a Trainer.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=corr_d,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
# Run training; the trailing semicolon suppresses display of the return value.
trainer.train();

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.127289,0.695736
2,No log,0.158544,0.692614
3,No log,0.151987,0.682962
4,No log,0.150614,0.676443


Epoch,Training Loss,Validation Loss


In [98]:
# Load the held-out file and apply the same missing-value placeholder that was
# used for the training frame, then preview it.
eval_df = pd.read_csv('test.csv').fillna("Unavailable")
eval_df

Unnamed: 0,id,keyword,location,text
0,0,Unavailable,Unavailable,Just happened a terrible car crash
1,2,Unavailable,Unavailable,"Heard about #earthquake is different cities, s..."
2,3,Unavailable,Unavailable,"there is a forest fire at spot pond, geese are..."
3,9,Unavailable,Unavailable,Apocalypse lighting. #Spokane #wildfires
4,11,Unavailable,Unavailable,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,Unavailable,Unavailable,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,Unavailable,Unavailable,Storm in RI worse than last hurricane. My city...
3260,10868,Unavailable,Unavailable,Green Line derailment in Chicago http://t.co/U...
3261,10874,Unavailable,Unavailable,MEG issues Hazardous Weather Outlook (HWO) htt...


In [99]:
# Assemble the tagged input string for each held-out row, in the same
# KEY / LOC / TEXT layout as the training frame, then wrap the frame in a Dataset.
eval_df['input'] = (
    'KEY: ' + eval_df['keyword']
    + '; LOC: ' + eval_df['location']
    + '; TEXT: ' + eval_df['text']
)
eval_ds = Dataset.from_pandas(eval_df)

In [100]:
# Inspect the assembled input strings for the held-out set.
pd.Series(eval_df.input)

0       KEY: Unavailable; LOC: Unavailable; TEXT: Just...
1       KEY: Unavailable; LOC: Unavailable; TEXT: Hear...
2       KEY: Unavailable; LOC: Unavailable; TEXT: ther...
3       KEY: Unavailable; LOC: Unavailable; TEXT: Apoc...
4       KEY: Unavailable; LOC: Unavailable; TEXT: Typh...
                              ...                        
3258    KEY: Unavailable; LOC: Unavailable; TEXT: EART...
3259    KEY: Unavailable; LOC: Unavailable; TEXT: Stor...
3260    KEY: Unavailable; LOC: Unavailable; TEXT: Gree...
3261    KEY: Unavailable; LOC: Unavailable; TEXT: MEG ...
3262    KEY: Unavailable; LOC: Unavailable; TEXT: #Cit...
Name: input, Length: 3263, dtype: object

In [101]:
# Apply the same tokenization function to the held-out rows.
tok_eval_ds = eval_ds.map(tok_func, batched=True)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [103]:
# Run inference on the held-out rows and keep the raw model outputs as a
# float array, then display them.
pred_output = trainer.predict(tok_eval_ds)
preds = pred_output.predictions.astype(float)
preds

array([[1.078125  ],
       [1.07128906],
       [1.06445312],
       ...,
       [1.06445312],
       [1.06152344],
       [1.07519531]])

In [112]:
# Convert the raw scores to hard 0/1 labels.
# The model has a single unbounded output (num_labels=1), and scores above 1.0
# already appear in the predictions, so plain rounding could emit labels such
# as 2 or -1. Clipping after rounding keeps every label inside {0, 1}, and
# leaves all scores that already rounded to 0 or 1 unchanged.
preds = np.clip(np.round(preds, 0), 0, 1).astype('int')

In [113]:
# Check the rounded integer labels.
preds

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [122]:
# Collapse the (n, 1) column of labels into a flat 1-D array for the submission.
preds = preds.ravel()

In [123]:
import datasets

# Pair each held-out id with its predicted label and write the result to disk.
submission_columns = {
    'id': eval_ds['id'],
    'target': preds,
}
submission = datasets.Dataset.from_dict(submission_columns)

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

22746