**Notebook to run on Kaggle NVIDIA Tesla P100-PCIE-16GB GPU**

In [3]:
import numpy as np
import pandas as pd

In [4]:
train=pd.read_csv("../input/preprocessedwassem/waseempreprocessed _train.csv.csv")

In [5]:
test=pd.read_csv("../input/preprocessedwassem/waseempreprocessed_test.csv")

In [6]:
train.dropna(inplace=True)
test.dropna(inplace=True)

In [7]:
train["text"] = train["text"].astype('string')
test["text"] = test["text"].astype('string')

**trying preprocessing which is specific to twitter**

In [8]:
from emoji import demojize
from nltk.tokenize import TweetTokenizer

In [9]:
tokenizer = TweetTokenizer()

In [10]:
#this is for normalising
def normalizeToken(token):
    lowercased_token = token.lower()
    if token.startswith("@"):
        return "@USER"
    elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
        return "HTTPURL"
    elif len(token) == 1:
        return demojize(token)
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token

In [11]:
""" Function to Normalize tweet into standard English"""
def normalizeTweet(tweet):
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    normTweet = (
        normTweet.replace("cannot ", "can not ")
        .replace("shouldn't", "should not")
        .replace("wont", "will not")
        .replace("can't", "can not")
        .replace("ain't", "am not")
        .replace("was’t", "was not")
        .replace("weren’t", "were not")
        .replace("won't", "will not")
        .replace("wouldn’t", "would not")
        .replace("shan't", "shall not")
        .replace("musn't", "must not")
        .replace("didnt", "did not")
    )
    
    normTweet = (
        normTweet.replace(" p . m .", "  p.m.")
        .replace(" p . m ", " p.m ")
        .replace(" a . m .", " a.m.")
        .replace(" a . m ", " a.m ")
    )

    return " ".join(normTweet.split())



In [12]:
train['normalized_tweet']= train['text'].apply(lambda x:  normalizeTweet(x))


In [13]:
test['normalized_tweet']= test['text'].apply(lambda x:  normalizeTweet(x))


In [14]:
#lowercasing
for i in range(len(train)):
    Tweet = train['normalized_tweet'].iloc[i]
    train.at[i,'normalized_tweet'] = Tweet.lower()


In [15]:
for i in range(len(test)):
    Tweet = test['normalized_tweet'].iloc[i]
    test.at[i,'normalized_tweet'] = Tweet.lower()

In [16]:
pip install -U imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [17]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


In [21]:
X=train.drop('label',axis=1)
y=train['label'].values

In [22]:
# defining oversampling strategy
over = RandomOverSampler(sampling_strategy="not majority")

X, y = over.fit_resample(X,y)
# defining undersampling strategy
under = RandomUnderSampler(sampling_strategy="not minority")

X, y = under.fit_resample(X,y)
# the number of samples will be equalized

### IMPLEMENTING BERTWEET

In [23]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [25]:
import torch
from transformers import AutoModel, AutoTokenizer 

bertweet = AutoModel.from_pretrained("vinai/bertweet-large")

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

# INPUT TWEET IS ALREADY NORMALIZED!
line = "DHEC confirms HTTPURL via @USER :crying_face:"

input_ids = torch.tensor([tokenizer.encode(line)])

with torch.no_grad():
    features = bertweet(input_ids)  # Models outputs are now tuples
    
## With TensorFlow 2.0+:
# from transformers import TFAutoModel
# bertweet = TFAutoModel.from_pretrained("vinai/bertweet-large")

Downloading:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [26]:
## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
#tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model=AutoModel.from_pretrained("vinai/bertweet-large")

#model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

**on the datframe**

In [27]:
tokenized_res = X['normalized_tweet'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [28]:
max_len = 0
for i in tokenized_res.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized_res.values])
np.array(padded).shape

(19118, 77)

In [29]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(19118, 77)

In [30]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [31]:
input_ids = torch.tensor(padded) 
att_mask_text = torch.tensor(attention_mask) 
 
text_data = TensorDataset(input_ids, att_mask_text)
text_dataloader = DataLoader(text_data,batch_size=16)

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(device)
print(n_gpu)
torch.cuda.get_device_name(0)
#gpu confirmation

cuda
1


'Tesla P100-PCIE-16GB'

In [33]:
features_text = []
for step, batch in enumerate(text_dataloader):
  batch = tuple(t.to(device) for t in batch)
 # Unpack the inputs from our dataloader
  input_ids_text, attention_mask_text = batch
 # Get the features
  model.to(device)
  with torch.no_grad():
    last_hidden_states = model(input_ids_text, attention_mask=attention_mask_text)

 
  features_text_batch = (last_hidden_states[0][:,0,:]).detach().cpu().numpy()
 #append to the feature vector
  features_text.append(features_text_batch)

In [34]:
len(features_text)

1195

In [35]:
19118/16

1194.875

In [36]:
len(features_text[1194])

14

In [37]:
features_text_2 = []
for i in range(1194):
  for j in range(16):
    features_text_2.append(features_text[i][j])
for j in range(14):
  features_text_2.append(features_text[1194][j])

In [38]:
labels=train['label']

**test**

In [39]:
tokenized_res = test['normalized_tweet'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [40]:
max_len = 0
for i in tokenized_res.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized_res.values])
np.array(padded).shape

(1570, 48)

In [41]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1570, 48)

In [42]:
## Size too big, was crashing the karnel, will have to process in batches
batch1 = attention_mask[0:500]
batch1.shape

(500, 48)

In [43]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [44]:
input_ids = torch.tensor(padded) 
att_mask_text = torch.tensor(attention_mask) 
 
text_data = TensorDataset(input_ids, att_mask_text)
text_dataloader = DataLoader(text_data,batch_size=16)

In [45]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(device)
print(n_gpu)
torch.cuda.get_device_name(0)
#gpu confirmation

cuda
1


'Tesla P100-PCIE-16GB'

In [46]:
features_text = []
for step, batch in enumerate(text_dataloader):
  batch = tuple(t.to(device) for t in batch)
 # Unpack the inputs from our dataloader
  input_ids_text, attention_mask_text = batch
 # Get the features
  model.to(device)
  with torch.no_grad():
    last_hidden_states = model(input_ids_text, attention_mask=attention_mask_text)

 
  features_text_batch = (last_hidden_states[0][:,0,:]).detach().cpu().numpy()
 #append to the feature vector
  features_text.append(features_text_batch)

In [47]:
len(features_text)

99

In [48]:
1570/16

98.125

In [49]:
len(features_text[98])

2

In [50]:
features_text_3 = []
for i in range(98):
  for j in range(16):
    features_text_3.append(features_text[i][j])
for j in range(2):
  features_text_3.append(features_text[98][j])

In [51]:
train_features=features_text_2
test_features=features_text_3
train_labels=y
test_labels=test['label']

In [52]:
 train_features=np.array(train_features)

In [53]:
train_features.shape

(19118, 1024)

In [54]:
train_labels=np.array(train_labels)

In [55]:
train_labels

array([1., 1., 1., ..., 2., 2., 2.])

In [70]:
from catboost import CatBoostClassifier


In [71]:
model = CatBoostClassifier(iterations=1000,
                           task_type="GPU",
                           devices='0:1')
model.fit(train_features,train_labels)


Learning rate set to 0.028661
0:	learn: 0.6908286	total: 91ms	remaining: 1m 30s
1:	learn: 0.6886546	total: 178ms	remaining: 1m 28s
2:	learn: 0.6864646	total: 262ms	remaining: 1m 27s
3:	learn: 0.6844423	total: 346ms	remaining: 1m 26s
4:	learn: 0.6826104	total: 434ms	remaining: 1m 26s
5:	learn: 0.6807588	total: 516ms	remaining: 1m 25s
6:	learn: 0.6788234	total: 596ms	remaining: 1m 24s
7:	learn: 0.6768865	total: 676ms	remaining: 1m 23s
8:	learn: 0.6753309	total: 757ms	remaining: 1m 23s
9:	learn: 0.6733917	total: 838ms	remaining: 1m 23s
10:	learn: 0.6717023	total: 919ms	remaining: 1m 22s
11:	learn: 0.6700874	total: 999ms	remaining: 1m 22s
12:	learn: 0.6684926	total: 1.08s	remaining: 1m 21s
13:	learn: 0.6670504	total: 1.16s	remaining: 1m 21s
14:	learn: 0.6656879	total: 1.24s	remaining: 1m 21s
15:	learn: 0.6641893	total: 1.31s	remaining: 1m 20s
16:	learn: 0.6628469	total: 1.47s	remaining: 1m 24s
17:	learn: 0.6614509	total: 1.59s	remaining: 1m 27s
18:	learn: 0.6599572	total: 1.74s	remaining: 

<catboost.core.CatBoostClassifier at 0x7fcde24280d0>

In [72]:
y_pred=model.predict(test_features)

In [73]:
from sklearn.metrics import f1_score
print(f1_score(test_labels,y_pred,average="micro"))

0.8686624203821656


### LIGHTGBM

In [60]:
pip install lightgbm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [61]:
from lightgbm import LGBMClassifier


In [62]:
lgbm = LGBMClassifier(random_state=5)


In [66]:

lgbm.fit(train_features,train_labels)
y_pred = lgbm.predict(test_features)


In [67]:
from sklearn.metrics import f1_score
print(f1_score(test_labels,y_pred,average="micro"))

0.9313375796178344


Micro F1-score (short for micro-averaged F1 score) is used to assess the quality of multi-label binary problems. 
It measures the F1-score of the aggregated contributions of all classes.