In [None]:
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import collections
import os
import re
import math
import copy
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Read the train/test CSVs from the Kaggle input directory.
train_csv_path = "/kaggle/input/nlp-getting-started/train.csv"
test_csv_path = "/kaggle/input/nlp-getting-started/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)
print(train.columns)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Manual label corrections for training tweets.
# Each entry pairs the exact tweet text (matched with ==) with the target
# value that every row carrying that text is set to.
relabeled_tweets = [
    ('like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit', 0),
    ('Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife', 0),
    ('To fight bioterrorism sir.', 0),
    ('.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4', 1),
    ('CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring', 1),
    ('#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption', 0),
    ('In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!', 0),
    ('Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 1),
    ('RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG', 1),
    ("Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...", 0),
    ("wowo--=== 12000 Nigerian refugees repatriated from Cameroon", 0),
    ("He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam", 0),
    ("Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!", 0),
    ("The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'", 0),
    ("Caution: breathing may be hazardous to your health.", 1),
    ("I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????", 0),
    ("#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect", 0),
    ("that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time", 0),
]

for tweet_text, fixed_label in relabeled_tweets:
    train.loc[train['text'] == tweet_text, 'target'] = fixed_label

In [None]:
# Replacement tables used by clean(). Every entry is (regex pattern, replacement)
# and the entries are applied strictly in the order listed, because several
# later patterns depend on what earlier ones leave behind.

# Mis-encoded character sequences found in the tweets.
_SPECIAL_CHAR_FIXES = (
    (r"\x89Û_", ""),
    (r"\x89ÛÒ", ""),
    (r"\x89ÛÓ", ""),
    (r"\x89ÛÏWhen", "When"),
    (r"\x89ÛÏ", ""),
    (r"China\x89Ûªs", "China's"),
    (r"let\x89Ûªs", "let's"),
    (r"\x89Û÷", ""),
    (r"\x89Ûª", ""),
    (r"\x89Û\x9d", ""),
    (r"å_", ""),
    (r"\x89Û¢", ""),
    # Cannot match any more (the previous entry already removed the prefix);
    # kept so the sequence stays identical to the historical one.
    (r"\x89Û¢åÊ", ""),
    (r"fromåÊwounds", "from wounds"),
    (r"åÊ", ""),
    (r"åÈ", ""),
    (r"JapÌ_n", "Japan"),
    (r"Ì©", "e"),
    (r"å¨", ""),
    (r"SuruÌ¤", "Suruc"),
    (r"åÇ", ""),
    (r"å£3million", "3 million"),
    (r"åÀ", ""),
)

# HTML character entity references.
_ENTITY_FIXES = (
    (r"&gt;", ">"),
    (r"&lt;", "<"),
    (r"&amp;", "&"),
)

# Typos, slang, and informal abbreviations.
_SLANG_FIXES = (
    # Word boundaries keep "w/" from firing inside words such as "show/hide".
    (r"\bw/e\b", "whatever"),
    (r"\bw/", "with"),
    (r"USAgov", "USA government"),
    (r"recentlu", "recently"),
    (r"Ph0tos", "Photos"),
    (r"amirite", "am I right"),
    (r"exp0sed", "exposed"),
    (r"<3", "love"),
    (r"amageddon", "armageddon"),
    (r"Trfc", "Traffic"),
    (r"8/5/2015", "2015-08-05"),
    (r"WindStorm", "Wind Storm"),
    (r"8/6/2015", "2015-08-06"),
    (r"10:38PM", "10:38 PM"),
    (r"10:30pm", "10:30 PM"),
    (r"16yr", "16 year"),
    (r"lmao", "laughing my ass off"),
    (r"TRAUMATISED", "traumatized"),
)

# Acronyms and hashtags, expanded after punctuation has been padded.
_ACRONYM_FIXES = (
    (r"MH370", "Malaysia Airlines Flight 370"),
    (r"mÌ¼sica", "music"),
    (r"okwx", "Oklahoma City Weather"),
    (r"arwx", "Arkansas Weather"),
    (r"gawx", "Georgia Weather"),
    (r"scwx", "South Carolina Weather"),
    (r"cawx", "California Weather"),
    (r"tnwx", "Tennessee Weather"),
    (r"azwx", "Arizona Weather"),
    (r"alwx", "Alabama Weather"),
    (r"wordpressdotcom", "wordpress"),
    (r"usNWSgov", "United States National Weather Service"),
    (r"Suruc", "Sanliurfa"),
)

# Punctuation that gets a space on each side. The full stop is deliberately
# absent: it is handled separately so that ellipses survive as one token.
_PADDED_PUNCTUATION = '@#!?+&*[]-%:/();$=><|{}^' + "'`"


def _apply_fixes(tweet, fixes):
    """Apply each (pattern, replacement) pair of `fixes` to `tweet`, in order."""
    for pattern, replacement in fixes:
        tweet = re.sub(pattern, replacement, tweet)
    return tweet


def clean(tweet):
    """Normalize one tweet string before tokenization.

    Steps, in order:
      1. strip or repair mis-encoded character sequences;
      2. decode the ``&gt;``, ``&lt;`` and ``&amp;`` entities;
      3. expand a fixed list of typos, slang and abbreviations;
      4. remove ``t.co`` short links;
      5. collapse any run of two or more dots into a single `` ... `` token
         and surround every other punctuation mark (including a lone ``.``)
         with spaces;
      6. expand a fixed list of acronyms and hashtags.

    Previously the ellipsis step ran *after* ``.`` had been padded with
    spaces, so ``...`` had already become `` .  .  . `` and the ellipsis
    replacements could never match. Ellipses are now collapsed first.

    Args:
        tweet: The tweet text (a ``str``).

    Returns:
        The cleaned tweet text.
    """
    tweet = _apply_fixes(tweet, _SPECIAL_CHAR_FIXES)
    tweet = _apply_fixes(tweet, _ENTITY_FIXES)
    tweet = _apply_fixes(tweet, _SLANG_FIXES)

    # URLs (the dot is escaped so only the literal host "t.co" matches).
    tweet = re.sub(r"https?://t\.co/[A-Za-z0-9]+", "", tweet)

    # ".." / "..." / "...." all become one " ... " token.
    tweet = re.sub(r"\.{2,}", " ... ", tweet)

    # Pad the remaining punctuation. Each step only inserts spaces around one
    # character, so the order of the characters does not affect the result.
    for mark in _PADDED_PUNCTUATION:
        tweet = tweet.replace(mark, f' {mark} ')

    # Pad a lone full stop, leaving the dots of an ellipsis token untouched.
    tweet = re.sub(r"(?<!\.)\.(?!\.)", " . ", tweet)

    tweet = _apply_fixes(tweet, _ACRONYM_FIXES)

    return tweet


# Run every tweet of both splits through clean().
for frame in (train, test):
    frame['text'] = frame['text'].apply(clean)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _prepend_top_tfidf_words(texts, tfidf_matrix, vocabulary):
    """Prefix each text with its two highest-weighted TF-IDF words.

    Args:
        texts: Iterable of strings, one per row of `tfidf_matrix`.
        tfidf_matrix: Sparse TF-IDF matrix whose rows line up with `texts`.
        vocabulary: Array mapping a column index of `tfidf_matrix` to its word.

    Returns:
        A list of strings of the form
        ``"<top word> [SEP] <second word> [SEP] <original text>"``.

    Note:
        A row with fewer than two non-zero weights still gets two words:
        argsort then picks among zero-weight columns.
    """
    prefixed = []
    for text, row in zip(texts, tfidf_matrix):
        weights = row.toarray().flatten()
        # argsort is ascending, so the last two indices are the largest weights.
        second_best, best = weights.argsort()[-2:]
        prefixed.append(vocabulary[best] + ' [SEP] ' + vocabulary[second_best] + ' [SEP] ' + text)
    return prefixed


# Fit the vocabulary and IDF weights on the training tweets only, then reuse
# them for the test tweets.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
matrix_train = tfidf_vectorizer.fit_transform(train['text'])
words = tfidf_vectorizer.get_feature_names_out()
matrix_test = tfidf_vectorizer.transform(test['text'])

# Assign the whole column at once. The previous `df['text'].iloc[i] = ...`
# was chained assignment, which writes to a temporary (and is silently lost)
# when pandas Copy-on-Write is active.
train['text'] = _prepend_top_tfidf_words(train['text'], matrix_train, words)
test['text'] = _prepend_top_tfidf_words(test['text'], matrix_test, words)

In [None]:
# Prefix every tweet with its keyword and location, using the literal string
# 'None' where either is missing.
#
# The columns are reassigned rather than filled with
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary
# Series under pandas Copy-on-Write, leaving NaNs in the frame, and a NaN
# keyword/location would then turn the concatenated text into NaN as well.
for frame in (train, test):
    frame['keyword'] = frame['keyword'].fillna('None')
    frame['location'] = frame['location'].fillna('None')
    frame['text'] = frame['keyword'] + ' [SEP] ' + frame['location'] + ' [SEP] ' + frame['text']

print(test['text'].iloc[0])

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Seed PyTorch so batch shuffling and the freshly initialised classification
# head are repeatable across runs.
torch.manual_seed(42)

# Hold out 2% of the labelled tweets for per-epoch evaluation.
X = train['text']
y = train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02, random_state=42)

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

# padding=True pads every sequence to the longest one in its split.
train_encodings = tokenizer(list(X_train), padding=True, return_tensors="pt")
test_encodings = tokenizer(list(X_test), padding=True, return_tensors="pt")
print(train_encodings['input_ids'].shape)
print(test_encodings['input_ids'].shape)

train_labels = torch.tensor(y_train.values).to(device)
test_labels = torch.tensor(y_test.values).to(device)

# Each dataset item is (input_ids, attention_mask, label); the attention mask
# tells the model which positions are padding.
train_dataset = TensorDataset(train_encodings['input_ids'].to(device), train_encodings['attention_mask'].to(device), train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# The held-out loader is not shuffled: order is irrelevant to the metrics.
test_dataset = TensorDataset(test_encodings['input_ids'].to(device), test_encodings['attention_mask'].to(device), test_labels)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=2).to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers optimizer
# used by default, since torch's own defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

epochs = 3
# Linear decay of the learning rate to zero over all optimisation steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=(len(train_loader) * epochs))

# One state-dict snapshot per epoch, consumed by the evaluation and
# submission cells.
models = []

for i in range(epochs):
    model.train()
    tot_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()  # gradients accumulate by default; reset per step
        inputs, mask, labels = [b.to(device) for b in batch]
        outputs = model(inputs, attention_mask=mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()  # update the weights
        scheduler.step()  # advance the learning-rate schedule
        tot_loss += loss.item()

    # Keep the snapshot on the CPU so three extra copies of the weights do not
    # sit in GPU memory; load_state_dict copies them back to the model's device.
    models.append({name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()})
    print(f"Epoch {i} loss: {tot_loss}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score every per-epoch snapshot in `models` on the held-out split.
for epoch, state_dict in enumerate(models):

    model.load_state_dict(state_dict)
    model.eval()  # disable dropout for inference
    pred = []
    true = []

    # No gradients are needed for evaluation; skipping autograd avoids
    # building a graph for every batch.
    with torch.no_grad():
        for batch in test_loader:
            inputs, mask, labels = [b.to(device) for b in batch]
            outputs = model(inputs, attention_mask=mask)
            logits = outputs.logits
            pred.extend(torch.argmax(logits, dim=1).tolist())
            true.extend(labels.tolist())

    accuracy = accuracy_score(true, pred)
    precision = precision_score(true, pred)
    recall = recall_score(true, pred)
    f1 = f1_score(true, pred)

    print(f"Model from Epoch {epoch+1}:")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}\n")

In [None]:
# Index into `models` of the snapshot used for the submission
# (1 = the state saved after the second training epoch).
SUBMISSION_EPOCH_INDEX = 1

model.load_state_dict(models[SUBMISSION_EPOCH_INDEX])
# Set inference mode here explicitly so this cell does not rely on an earlier
# cell having left the model in eval mode (dropout would otherwise be active).
model.eval()

submit_encodings = tokenizer(list(test['text']), padding=True, return_tensors="pt")
submit_ids = torch.tensor(test['id'].values)

# The ids travel through the loader alongside the inputs so each prediction
# stays paired with its row; they are never needed on the GPU.
submit_dataset = TensorDataset(submit_encodings['input_ids'], submit_encodings['attention_mask'], submit_ids)
submit_loader = DataLoader(submit_dataset, batch_size=32)

sub = []
sub_ids = []

with torch.no_grad():  # inference only: skip autograd bookkeeping
    for inputs, mask, ids in submit_loader:
        outputs = model(inputs.to(device), attention_mask=mask.to(device))
        logits = outputs.logits
        sub.extend(torch.argmax(logits, dim=1).tolist())
        sub_ids.extend(ids.tolist())

res = pd.DataFrame({'id': sub_ids,'target':sub})
res.to_csv("submission.csv", index=False)