In [24]:
import pandas as pd
import numpy as np
import re

from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, BertModel
from torch.optim.lr_scheduler import ExponentialLR
from torch import nn
import torch

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from flashtext import KeywordProcessor
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import codecs
import spacy

from sklearn.metrics import f1_score
from sklearn import preprocessing

from niacin.augment import RandAugment
from niacin.text import en

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = stopwords.words('english')

device = torch.device('cuda:3')

[nltk_data] Downloading package stopwords to
[nltk_data]     /cephfs/projects/vagishev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /cephfs/projects/vagishev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /cephfs/projects/vagishev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from comet_ml import Experiment

experiment = Experiment(
    api_key="aoUalWmaagcfhfpRszq9USwVb",
    project_name="twitter-threat-classification",
    workspace="zanzibara1961",
    log_code=True
)


In [None]:
df = pd.read_csv('train.csv', index_col = 'id').reset_index(drop=True)
df

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.loc[df['keyword'].isnull(), 'keyword'] = 'Null'

In [None]:
df.keyword = df.keyword.str.replace('%20', '')

stemmer = PorterStemmer()

df.keyword = df.keyword.apply(lambda x: stemmer.stem(x))

In [None]:
df = df.sample(frac = 1).reset_index(drop=True)

In [None]:
dumm_columns = pd.get_dummies(df['keyword']).columns
len(dumm_columns) 

In [None]:
sum_1 = 0
for i in df['text'].str.lower().str.split():
    sum_1 += len(i)
sum_1 / len(df)

In [None]:
class TweetsDataset(Dataset):
    
    """Dataset for creating a dataset of text.

    Parameters:
    data (pd.DataFrame): Data for the dataset.
    target (list): Target values.
    include_keyword (bool, optional): Whether to include keyword in the dataset. Defaults to True.
    max_len (int, optional): Maximum length of the sequence. Defaults to 15.
    dumm_column (str, optional): Column to use for creating dummy variables. Defaults to None.
    lemmatize (bool, optional): Whether to lemmatize the text. Defaults to True.
    prepear (bool, optional): Whether to preprocess the text. Defaults to True.
    """
    
    def __init__(self, data, target, include_keyword = True, max_len = 15, 
                 dumm_column=None, lemmatize=True, prepear = True):
        
        super().__init__()
        self.dataset         = data
        self.target          = target
        self.include_keyword = include_keyword
        self.len             = len(data)
        self.lemmatize       = lemmatize
        self.prepear         = prepear
        self.lemmatizer      = WordNetLemmatizer() 
        self.max_len         = max_len
        
        if self.include_keyword:
            self.keywords = self.get_dummies_(self.dataset, dumm_columns)
     
        self.replacements    = ('UTC', ']', '.', '[', '?', '!', "'", ':', '=>', '-', 
                                '//t', '/', '&', ';', 'å', '`',
                                'ê', '+', '=', '(', ')', '|', '*', '_', 
                                '%', '$', '>', '<', 'ì', '¼', '~', 'http', '#', '@', '\n')
        
    def get_dummies_(self, dataset, dumm_columns):

        dummies = pd.DataFrame(0, index=np.arange(len(dataset)), columns=dumm_columns)
        for index in range(len(dataset)):
            
            if X_train.loc[index, 'keyword'] in dumm_columns:
                column = np.where(dumm_columns == dataset['keyword'][index])[0][0]
                dummies.iloc[index, [column]] = 1
            else:
                dummies.iloc[index, [0]] = 1
    
        return dummies
    
    def __len__(self):
        return self.len
    
    def slang_replace_(self, string):
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keyword('u', 'you')
        keyword_processor.add_keyword('fvck', 'fuck')
        
        new_string = keyword_processor.replace_keywords(string)
        return new_string
    
    
    def clean_text_(self, string):
        
        string = string.strip()
        string = self.slang_replace_(string)
        
        for replace in self.replacements:
            string = string.replace(replace, ' ')
        string_splitted = string.split()
        string_result   = []
        
        for index in range(len(string_splitted)):
            sub_string = string_splitted[index] 
            
            if not(sub_string in stop_words or re.search("\d", sub_string) 
                   or re.search(u"[\x80-\x9f]", sub_string) or sub_string == 'co' or len(sub_string) == 1):
                
                if self.lemmatize:
                    string = self.lemmatizer.lemmatize(sub_string)

                string = string.lower()
                string_temp = re.sub(r'([a-z])\1+\1+', r'\1', string)
                string_result.append(string_temp)
                
        return '[CLS] ' + ' '.join(string_result)
        
        
    def __getitem__(self, index):
    
        text   = self.dataset.text[index]
        target = self.target[index]
        
        if self.prepear: 
            text_transformed = self.clean_text_(text)
    
        if self.include_keyword:
            keyword_array = torch.tensor(self.keywords.iloc[index].to_numpy())
            return text_transformed, keyword_array, target
        
        
        return text_transformed, target

In [None]:
def data_augmentation(train_df):
    augmentation_df_1 = train_df
#     augmentation_df_2 = train_df
    augmentor = RandAugment([
        en.add_synonyms,
        en.add_hyponyms,
        en.add_misspelling,
        en.swap_words,
        en.add_contractions,
        ], n=1, m=20, shuffle=False)
    
#     augmentor_2 = RandAugment([
#         en.add_synonyms,
#         en.add_hyponyms,
#         en.add_misspelling,
#         en.swap_words,
#         en.add_contractions,
#         ], n=1, m=50, shuffle=False)
    
    text = augmentation_df_1.text.tolist()
    augmentation_list_1 = []
#     augmentation_list_2 = []
    
    for sentence in text:
        for tx in augmentor:
            augment_text = tx(sentence)
            augmentation_list_1.append(augment_text)
            
#         for tx in augmentor_2:
#             augment_text = tx(sentence)
#             augmentation_list_2.append(augment_text)
            
    augmentation_df_1.loc[:, 'text'] = augmentation_list_1
#     augmentation_df_2.loc[:, 'text'] = augmentation_list_2
#     result_df = pd.concat([augmentation_df_1, train_df, augmentation_df_2], axis=0, ignore_index=True)
    result_df = pd.concat([augmentation_df_1, train_df], axis=0, ignore_index=True)
    
    return result_df.reset_index(drop=True)

In [None]:
train_df = df.iloc[:6613]
test_df  = df.iloc[6613:].reset_index(drop=True)
train_df

In [None]:
# train_df = data_augmentation(train_df)
# train_df

In [None]:
X_train, y_train = train_df.drop('target', axis=1), train_df.target
X_test, y_test   = test_df.drop('target', axis=1), test_df.target

In [None]:
train_dataset = TweetsDataset(X_train, y_train, True, dumm_columns)
test_dataset  = TweetsDataset(X_test, y_test, True, dumm_columns)

In [None]:
for i in range(1, 100):
    print(train_dataset.__getitem__(i)[0])

In [None]:
X_train.shape, X_test.shape

In [None]:
# AutoModel.from_pretrained("vinai/bertweet-base")

In [None]:
class Model_BertBased(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained("rabindralamsal/finetuned-bertweet-sentiment-analysis")
        
        self.classifier = nn.Sequential(
            nn.Linear(in_features = 769, out_features = 1, bias = True),
            nn.Sigmoid()
        )
        
        self.keyword_classifier = nn.Sequential(
            nn.Linear(in_features = 166, out_features = 1, bias = True),
            nn.LeakyReLU()
        )
        
    def forward(self, input_ids, attention, keyword, freeze):
        
        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False 
        else: 
             for param in self.bert.parameters():
                param.requires_grad = True

        X = self.bert(input_ids, attention)
        keyword  = self.keyword_classifier(keyword.to(torch.float32))
        X_concat = torch.cat((X.pooler_output, keyword), dim = 1)
        
        
        X = self.classifier(X_concat)
        return torch.squeeze(X)

In [None]:
def train(model, lr, train_dataset, test_dataset, num_epochs=15, max_length = 15, 
          batch_size = 64, freeze=True, weights_decay = 0):
    
    model.to(device)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr, weight_decay = weights_decay)
    criterion = nn.BCELoss()
    
    bert_tokenizer  = AutoTokenizer.from_pretrained("rabindralamsal/finetuned-bertweet-sentiment-analysis")
    
    train_loader = DataLoader(train_dataset, batch_size)
    test_loader  = DataLoader(test_dataset, batch_size)
    
    scheduler = ExponentialLR(optimizer, gamma=0.5)
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
 

    for epoch in range(num_epochs):
        
        running_loss = 0.0
        train_count  = 0
        
        running_loss_test = 0.0
        train_count_test  = 0
        
        for text, keywords, target in train_loader:
            
            optimizer.zero_grad()
            
            text = bert_tokenizer.batch_encode_plus(text, max_length=max_length, 
                                               truncation=True, 
                                               padding='max_length')
            
            input_ids = torch.tensor(text.input_ids)
            attention = torch.tensor(text.attention_mask)
        
            output = model(input_ids.to(device), attention.to(device),  keywords.to(device), freeze)
            loss   = criterion(output.to(torch.float32).to(device), target.to(torch.float32).to(device))
            loss.backward()
            
            optimizer.step()
            
            running_loss += loss.item()
            train_count  += 1
            
            
        for text, keywords, target in test_loader:
            
            with torch.no_grad():
                text = bert_tokenizer.batch_encode_plus(text, max_length=30, 
                                               truncation=True, 
                                               padding='max_length')
            
                input_ids = torch.tensor(text.input_ids)
                attention = torch.tensor(text.attention_mask)
                
                
                output = model(input_ids.to(device), attention.to(device),  keywords.to(device), freeze)
                loss   = criterion(output.to(torch.float32).to(device), target.to(torch.float32).to(device))

                running_loss_test += loss.item()
                train_count_test  += 1
                
        scheduler.step()
    
        print('Epoch: ', epoch)
        print('Train_loss: ', running_loss / train_count)
        print('Test_loss: ', running_loss_test / train_count_test, '\n\n')
        
        experiment.log_metric("train_loss", running_loss / train_count, step=epoch)
        experiment.log_metric("test_loss", running_loss_test / train_count_test, step=epoch)

In [None]:
model = Model_BertBased()

In [None]:
lr = 0.1
num_epochs = 50
max_length = 15
batch_size = 64
treshold = 0.47

hyper_params = {
    "learning_rate": lr,
    "num_epochs": num_epochs,
    "max_length": max_length, 
    "batch_size": batch_size,
    'treshold': treshold,
}
experiment.log_parameters(hyper_params)

train(model, lr, train_dataset, test_dataset,  num_epochs, 
      max_length, batch_size, freeze=True)

In [639]:
model_copy = model 

In [640]:
lr = 1e-5
num_epochs = 1
max_length = 15
batch_size = 64

train(model_copy, lr, train_dataset, test_dataset,  num_epochs, max_length, 
      batch_size, freeze=False, weights_decay=0.01)

Epoch:  0
Train_loss:  0.4077570212001984
Test_loss:  0.48740207962691784 




In [609]:
def batch_predict(model, test_dataset):
    
    test_loader = DataLoader(test_dataset, batch_size=64)
    bert_tokenizer  = AutoTokenizer.from_pretrained("vinai/bertweet-base")
    
    model.to(device)
    results = []
    targets = []

    for text, keywords, target in test_loader:
        
            with torch.no_grad():
                text = bert_tokenizer.batch_encode_plus(text, max_length=15, 
                                               truncation=True, 
                                               padding='max_length')
                
                input_ids = torch.tensor(text.input_ids)
                attention = torch.tensor(text.attention_mask)
            
                output  = model(input_ids.to(device), attention.to(device),  keywords.to(device), False)

                results += output.tolist()
                targets += target.tolist()
    
    return results, targets

In [610]:
results = batch_predict(model, test_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [614]:
preds  = np.array(results[0])
target = np.array(results[1])

preds = np.where(preds > 0.42, 1, 0)

f1_score(target, preds)

0.7613882863340564

In [612]:
experiment.log_metric("f1_score", f1_score(target, preds), step=0)

In [327]:
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.com/zanzibara1961/twitter-threat-classification/29bcc2109ae64a2b99bb464216b74ff4
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     f1_score        : 0.7745098039215685
COMET INFO:     test_loss [16]  : (0.3924035392701626, 0.5645373128354549)
COMET INFO:     train_loss [16] : (0.4249946947854299, 0.6904508740856097)
COMET INFO:   Parameters:
COMET INFO:     batch_size    : 64
COMET INFO:     learning_rate : 0.1
COMET INFO:     max_length    : 15
COMET INFO:     num_epochs    : 15
COMET INFO:     treshold      : 0.47
COMET INFO:   Uploads:
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     notebook            : 1
COMET INFO:     source_code         : 1
COMET I

# Prediction Submitting

In [328]:
test_df = pd.read_csv('test.csv', index_col = 'id').reset_index(drop=True)
test_df

Unnamed: 0,keyword,location,text
0,,,Just happened a terrible car crash
1,,,"Heard about #earthquake is different cities, s..."
2,,,"there is a forest fire at spot pond, geese are..."
3,,,Apocalypse lighting. #Spokane #wildfires
4,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...
3258,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,,,Storm in RI worse than last hurricane. My city...
3260,,,Green Line derailment in Chicago http://t.co/U...
3261,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [329]:
test_df.loc[test_df['keyword'].isnull(), 'keyword'] = 'Null'

test_df.keyword = test_df.keyword.str.replace('%20', '')

stemmer = PorterStemmer()

test_df.keyword = test_df.keyword.apply(lambda x: stemmer.stem(x))

test_df

Unnamed: 0,keyword,location,text
0,,,Just happened a terrible car crash
1,,,"Heard about #earthquake is different cities, s..."
2,,,"there is a forest fire at spot pond, geese are..."
3,,,Apocalypse lighting. #Spokane #wildfires
4,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...
3258,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,,,Storm in RI worse than last hurricane. My city...
3260,,,Green Line derailment in Chicago http://t.co/U...
3261,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [330]:
test_df['target'] = 0
test_df

Unnamed: 0,keyword,location,text,target
0,,,Just happened a terrible car crash,0
1,,,"Heard about #earthquake is different cities, s...",0
2,,,"there is a forest fire at spot pond, geese are...",0
3,,,Apocalypse lighting. #Spokane #wildfires,0
4,,,Typhoon Soudelor kills 28 in China and Taiwan,0
...,...,...,...,...
3258,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,0
3259,,,Storm in RI worse than last hurricane. My city...,0
3260,,,Green Line derailment in Chicago http://t.co/U...,0
3261,,,MEG issues Hazardous Weather Outlook (HWO) htt...,0


In [331]:
X_test, y_test = test_df.drop('target', axis=1), test_df.target
test_dataset = TweetsDataset(X_test, y_test, True, dumm_columns)

In [333]:
for i in range(1, 1000):
    print(test_dataset.__getitem__(i)[0])

[CLS] heard earthquake different cities, stay safe everyone
[CLS] forest fire spot pond, goose fleeing across street, cannot save
[CLS] apocalypse lighting spokane wildfire
[CLS] typhoon soudelor kill china taiwan
[CLS] we shaking it earthquake
[CLS] they probably still show life arsenal yesterday, eh eh
[CLS] hey how
[CLS] what nice hat
[CLS] fuck
[CLS] no like cold
[CLS] no don
[CLS] no tell
[CLS] what
[CLS] awesome
[CLS] birmingham wholesale market ablaze bbc news fire break birmingham wholesale market irwqcezweu
[CLS] sunkxssedharry wear short race ablaze
[CLS] previouslyondoyintv toke marriage crisis set nigerian twitter ablaze
[CLS] check yduixefipe nsfw
[CLS] psa splitting personality techie follow ablaze burners follow ablaze
[CLS] beware world ablaze sierra leone amp guap
[CLS] burning man ablaze turban diva hodwosamws via etsy
[CLS] not dis song people take thing run smh eye opener though he set game ablaze cyhitheprynce
[CLS] rape victim dy set ablaze year old girl died burn

In [334]:
results = batch_predict(model, test_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [335]:
preds  = np.array(results[0])
target = np.array(results[1])

preds = np.where(preds > 0.47, 1, 0)

In [336]:
submissions = pd.read_csv('sample_submission.csv', index_col = 'id')
submissions

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,0
3,0
9,0
11,0
...,...
10861,0
10865,0
10868,0
10874,0


In [337]:
submissions.loc[:, 'target'] = preds
submissions

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1
...,...
10861,1
10865,1
10868,1
10874,1


In [339]:
submissions.to_csv('my_submission.csv')