**Notebook to run on Kaggle NVIDIA Tesla P100-PCIE-16GB GPU**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training split from the Kaggle input directory into a DataFrame.
train=pd.read_csv("../input/preprocessedwassem/waseempreprocessed _train.csv.csv")

In [3]:
# Read the test split from the Kaggle input directory into a DataFrame.
test=pd.read_csv("../input/preprocessedwassem/waseempreprocessed_test.csv")

In [4]:
# Discard, in place, every row that has at least one missing value in either split.
for frame in (train, test):
    frame.dropna(inplace=True)

In [5]:
# Store the raw tweet text with pandas' dedicated 'string' dtype in both splits.
for frame in (train, test):
    frame["text"] = frame["text"].astype('string')

**Trying preprocessing that is specific to Twitter**

In [6]:
from emoji import demojize
from nltk.tokenize import TweetTokenizer

In [7]:
# NLTK tweet tokenizer used by normalizeTweet.
# NOTE(review): the name `tokenizer` is rebound to a Hugging Face tokenizer in a
# later cell, so normalizeTweet must be applied before that cell is executed.
tokenizer = TweetTokenizer()

In [8]:
#this is for normalising
def normalizeToken(token):
    """Map a single tweet token to its normalised form.

    Rules, applied in this order:
      * a token starting with "@" becomes "@USER";
      * a token starting with "http" or "www" (case-insensitive) becomes "HTTPURL";
      * the typographic apostrophe "’" becomes "'" and the ellipsis "…" becomes "...";
      * any other one-character token is passed through ``demojize``;
      * everything else is returned unchanged.

    Args:
        token: one token (str) produced by the tweet tokenizer.

    Returns:
        The normalised token as a str.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # These two must be tested before the one-character branch below: both are
    # a single character long and would otherwise be handed to demojize, which
    # makes the dedicated replacements unreachable.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token

In [9]:
# Contraction rewrites, applied in order as plain substring replacements.
# Every pattern uses the straight apostrophe: normalizeTweet converts "’" to "'"
# before tokenising, so a pattern written with "’" could never match.
_CONTRACTION_REPLACEMENTS = (
    ("cannot ", "can not "),
    ("shouldn't", "should not"),
    ("wont", "will not"),
    ("can't", "can not"),
    ("ain't", "am not"),
    ("wasn't", "was not"),
    ("weren't", "were not"),
    ("won't", "will not"),
    ("wouldn't", "would not"),
    ("shan't", "shall not"),
    ("mustn't", "must not"),
    ("musn't", "must not"),  # common misspelling, kept from the original list
    ("didnt", "did not"),
)

# Re-join time abbreviations that the tokenizer split into separate tokens.
_TIME_REPLACEMENTS = (
    (" p . m .", "  p.m."),
    (" p . m ", " p.m "),
    (" a . m .", " a.m."),
    (" a . m ", " a.m "),
)


def normalizeTweet(tweet):
    """Normalise a raw tweet.

    Steps:
      1. replace the typographic apostrophe and ellipsis with "'" and "...";
      2. tokenise with the module-level ``tokenizer`` and normalise each token
         with ``normalizeToken`` (user mentions, URLs, single characters);
      3. expand a fixed list of contractions (case-sensitive substring match);
      4. re-join "a . m ." / "p . m ." style abbreviations;
      5. collapse any run of whitespace into a single space.

    Args:
        tweet: the tweet text (str).

    Returns:
        The normalised tweet as a single space-separated str.
    """
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join(normalizeToken(token) for token in tokens)

    for pattern, replacement in _CONTRACTION_REPLACEMENTS + _TIME_REPLACEMENTS:
        normTweet = normTweet.replace(pattern, replacement)

    # split()/join removes leading/trailing blanks and the double spaces that
    # the replacements above can introduce.
    return " ".join(normTweet.split())



In [10]:
# Add a column holding the normalised form of every training tweet.
train['normalized_tweet'] = train['text'].apply(normalizeTweet)


In [11]:
# Add a column holding the normalised form of every test tweet.
test['normalized_tweet'] = test['text'].apply(normalizeTweet)


In [12]:
# Lowercase every normalised training tweet in one vectorised step.
# A row-wise loop that reads with .iloc[i] (position) and writes with .at[i]
# (index label) is only correct when the index is exactly 0..n-1; rows have been
# dropped from this frame, so such a loop would write tweets onto the wrong rows
# and append new, mostly-NaN rows for labels that no longer exist.
train['normalized_tweet'] = train['normalized_tweet'].str.lower()


In [13]:
# Lowercase every normalised test tweet in one vectorised step.
# A row-wise loop that reads with .iloc[i] (position) and writes with .at[i]
# (index label) is only correct when the index is exactly 0..n-1; rows have been
# dropped from this frame, so such a loop would write tweets onto the wrong rows
# and append new, mostly-NaN rows for labels that no longer exist.
test['normalized_tweet'] = test['normalized_tweet'].str.lower()

In [14]:
pip install -U imbalanced-learn

In [15]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler


Some null values might have been missed due to a casing issue.

In [18]:
# Defensive re-check: drop, in place, any training row that contains a missing value.
train.dropna(inplace=True)

In [19]:
# Targets as a NumPy array; every remaining column stays in the feature frame.
y = train['label'].values
X = train.drop(columns='label')

In [20]:
# Balance the classes. Both samplers draw rows at random, so they are seeded to
# make the resampled training set (and everything derived from it) reproducible.

# Oversampling strategy: replicate rows of every class except the majority one.
over = RandomOverSampler(sampling_strategy="not majority", random_state=42)
X, y = over.fit_resample(X, y)

# Undersampling strategy: shrink every class except the minority one.
# NOTE(review): after the oversampling step all classes already have the same
# size, so this step is expected to leave the data unchanged — confirm.
under = RandomUnderSampler(sampling_strategy="not minority", random_state=42)
X, y = under.fit_resample(X, y)
# the number of samples will be equalized

### IMPLEMENTING BERTWEET

In [21]:
pip install transformers

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Suppress every warning from here on.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below.
warnings.filterwarnings('ignore')

In [23]:
import torch
from transformers import AutoModel, AutoTokenizer 

# Hugging Face hub identifier shared by the encoder and its tokenizer.
MODEL_NAME = "vinai/bertweet-large"

# Pretrained encoder and the matching tokenizer (rebinds the name `tokenizer`).
bertweet = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [24]:
# Reuse the encoder that is already loaded as `bertweet` instead of loading a
# second copy of the same "vinai/bertweet-large" checkpoint into memory.
model = bertweet

**On the dataframe**

In [25]:
def _encode_with_special_tokens(text):
    """Return the token ids of ``text``, including the model's special tokens."""
    return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per (resampled) training tweet.
tokenized_res = X['normalized_tweet'].apply(_encode_with_special_tokens)

In [26]:
# Length of the longest tokenised tweet (0 when there are no tweets).
max_len = max((len(ids) for ids in tokenized_res.values), default=0)

# Right-pad every id list with 0 so that all rows have max_len entries.
# NOTE(review): 0 is used as the filler id; confirm how it relates to
# tokenizer.pad_token_id for this model.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized_res.values])
padded.shape

In [27]:
# Build the mask from the true token counts instead of testing `padded != 0`:
# a real token whose id is 0 would otherwise be masked out as if it were padding
# (RoBERTa-style vocabularies such as BERTweet's are understood to use id 0 for
# the leading <s> token — confirm with tokenizer.bos_token_id).
seq_lengths = np.array([len(ids) for ids in tokenized_res.values])
# 1 for the first len(ids) positions of each row, 0 for the padded tail.
attention_mask = (np.arange(padded.shape[1]) < seq_lengths[:, None]).astype(np.int64)
attention_mask.shape

In [28]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [29]:
# Turn the padded ids and their mask into tensors ...
input_ids, att_mask_text = (torch.tensor(array) for array in (padded, attention_mask))

# ... and serve them in order, 16 rows per batch.
text_data = TensorDataset(input_ids, att_mask_text)
text_dataloader = DataLoader(text_data, batch_size=16)

In [30]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(device)
print(n_gpu)
# GPU confirmation. The device name is only queried when CUDA is present, since
# torch.cuda.get_device_name fails on a machine without a CUDA device even
# though the line above allows a CPU fallback.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device"

In [31]:
# Run every batch through the encoder and keep, for each row, the vector at
# sequence position 0 of the model's first output.
features_text = []

# Moving the model is loop-invariant, so do it once instead of once per batch.
model.to(device)
model.eval()

with torch.no_grad():  # inference only: no gradients are needed
    for input_ids_text, attention_mask_text in text_dataloader:
        input_ids_text = input_ids_text.to(device)
        attention_mask_text = attention_mask_text.to(device)

        last_hidden_states = model(input_ids_text, attention_mask=attention_mask_text)

        # One NumPy array of shape (rows in batch, hidden size) per batch.
        features_text.append(last_hidden_states[0][:, 0, :].cpu().numpy())

In [32]:
# Number of per-batch feature arrays collected by the extraction loop.
len(features_text)

In [33]:
# Scratch arithmetic: a row count divided by the DataLoader batch size of 16
# (presumably 19118 is the number of resampled training rows — confirm).
19118/16

In [34]:
# Number of rows in the batch at index 1194 (presumably the final, partially
# filled batch — confirm).
len(features_text[1194])

In [35]:
# Flatten the per-batch arrays into one list with a feature vector per row, in
# the original row order. Iterating over whatever batches exist avoids
# hard-coding the number of batches and the size of the last one, which change
# whenever the data or the batch size changes.
features_text_2 = [row for batch_features in features_text for row in batch_features]

In [36]:
# Label column of the training frame (before resampling).
# NOTE(review): no later visible cell reads `labels`; the classifiers are
# trained on the resampled `y` instead.
labels=train['label']

**test**

In [37]:
def _encode_with_special_tokens(text):
    """Return the token ids of ``text``, including the model's special tokens."""
    return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per test tweet.
tokenized_res = test['normalized_tweet'].apply(_encode_with_special_tokens)

In [38]:
# Length of the longest tokenised tweet (0 when there are no tweets).
max_len = max((len(ids) for ids in tokenized_res.values), default=0)

# Right-pad every id list with 0 so that all rows have max_len entries.
# NOTE(review): 0 is used as the filler id; confirm how it relates to
# tokenizer.pad_token_id for this model.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized_res.values])
padded.shape

In [39]:
# Build the mask from the true token counts instead of testing `padded != 0`:
# a real token whose id is 0 would otherwise be masked out as if it were padding
# (RoBERTa-style vocabularies such as BERTweet's are understood to use id 0 for
# the leading <s> token — confirm with tokenizer.bos_token_id).
seq_lengths = np.array([len(ids) for ids in tokenized_res.values])
# 1 for the first len(ids) positions of each row, 0 for the padded tail.
attention_mask = (np.arange(padded.shape[1]) < seq_lengths[:, None]).astype(np.int64)
attention_mask.shape

In [40]:
## Size too big, was crashing the kernel, will have to process in batches
# First 500 rows of the attention mask.
# NOTE(review): `batch1` is not read by any later visible cell; the full arrays
# are fed through a DataLoader instead.
batch1 = attention_mask[0:500]
batch1.shape

In [41]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [42]:
# Turn the padded ids and their mask into tensors ...
input_ids, att_mask_text = (torch.tensor(array) for array in (padded, attention_mask))

# ... and serve them in order, 16 rows per batch.
text_data = TensorDataset(input_ids, att_mask_text)
text_dataloader = DataLoader(text_data, batch_size=16)

In [43]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(device)
print(n_gpu)
# GPU confirmation. The device name is only queried when CUDA is present, since
# torch.cuda.get_device_name fails on a machine without a CUDA device even
# though the line above allows a CPU fallback.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "no CUDA device"

In [44]:
# Run every batch through the encoder and keep, for each row, the vector at
# sequence position 0 of the model's first output.
features_text = []

# Moving the model is loop-invariant, so do it once instead of once per batch.
model.to(device)
model.eval()

with torch.no_grad():  # inference only: no gradients are needed
    for input_ids_text, attention_mask_text in text_dataloader:
        input_ids_text = input_ids_text.to(device)
        attention_mask_text = attention_mask_text.to(device)

        last_hidden_states = model(input_ids_text, attention_mask=attention_mask_text)

        # One NumPy array of shape (rows in batch, hidden size) per batch.
        features_text.append(last_hidden_states[0][:, 0, :].cpu().numpy())

In [45]:
# Number of per-batch feature arrays collected by the extraction loop.
len(features_text)

In [46]:
# Scratch arithmetic: a row count divided by the DataLoader batch size of 16
# (presumably 1570 is the number of test rows — confirm).
1570/16

In [47]:
# Number of rows in the batch at index 98 (presumably the final, partially
# filled batch — confirm).
len(features_text[98])

In [48]:
# Flatten the per-batch arrays into one list with a feature vector per row, in
# the original row order. Iterating over whatever batches exist avoids
# hard-coding the number of batches and the size of the last one, which change
# whenever the data or the batch size changes.
features_text_3 = [row for batch_features in features_text for row in batch_features]

In [49]:
# Pair each split's extracted feature vectors with its labels.
train_features, train_labels = features_text_2, y
test_features, test_labels = features_text_3, test['label']

In [50]:
 # Convert the list of per-row feature vectors into a single NumPy array.
 # NOTE(review): this cell is indented by one stray space; IPython tolerates a
 # uniformly indented cell, but the line is not valid at top level in a plain
 # Python script.
 train_features=np.array(train_features)

In [51]:
# Display the dimensions of the training feature array.
train_features.shape

In [52]:
# Make the training labels a NumPy array so they can be indexed with the fold
# index arrays in the cross-validation loops below.
train_labels=np.array(train_labels)

In [53]:
# Display the training label array.
train_labels

In [62]:
# Display the dimensions of the test frame.
test.shape

In [63]:
# Ground-truth labels of the test split, used when scoring the predictions.
test_labels=test['label']

In [55]:
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold

In [70]:
# 5-fold cross-validated CatBoost training; one fitted model is kept per fold.
# The shuffle is seeded so the fold assignment is reproducible.
# NOTE(review): the training set was oversampled before this split, so copies of
# the same tweet can fall into both the training and the validation part of a
# fold, which makes the validation scores optimistic.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []

# The full feature matrix and label vector are the same for every fold.
X = train_features
y = train_labels

for i, (train_index, valid_index) in enumerate(kf.split(X)):
    X_train, y_train = X[train_index], y[train_index]
    X_valid, y_valid = X[valid_index], y[valid_index]

    # NOTE(review): this rebinds the name `model`, which earlier cells use for
    # the BERTweet encoder; re-running the feature-extraction cells after this
    # one would pick up the classifier instead.
    model = CatBoostClassifier(
        iterations = 50,
        depth = 1,
        verbose = 10
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    models.append(model)

In [71]:
# Predict with the whole cross-validation ensemble rather than only the model of
# the last fold: average the class probabilities of every model in `models` and
# pick the most probable class for each test row.
# Assumes all fold models were fitted on the same set of classes, so that their
# probability columns line up — TODO confirm.
test_matrix = np.asarray(test_features)
mean_proba = np.mean(
    [fold_model.predict_proba(test_matrix) for fold_model in models], axis=0
)
y_pred = np.asarray(models[0].classes_)[mean_proba.argmax(axis=1)]

In [72]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of the predictions against the test labels.
micro_f1 = f1_score(test_labels, y_pred, average="micro")
print(micro_f1)

### LIGHTGBM

In [65]:
pip install lightgbm

In [66]:
from lightgbm import LGBMClassifier


In [81]:
# 5-fold cross-validated LightGBM training; one fitted model is kept per fold.
# The shuffle is seeded so the fold assignment is reproducible.
# NOTE(review): the training set was oversampled before this split, so copies of
# the same tweet can fall into both the training and the validation part of a
# fold, which makes the validation scores optimistic.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
models = []

# The full feature matrix and label vector are the same for every fold.
X = train_features
y = train_labels

for i, (train_index, valid_index) in enumerate(kf.split(X)):
    X_train, y_train = X[train_index], y[train_index]
    X_valid, y_valid = X[valid_index], y[valid_index]

    # n_estimators is the scikit-learn-API name for the number of boosting
    # rounds; it replaces the `num_boost_round` alias.
    model = LGBMClassifier(random_state=5, n_estimators=500)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    models.append(model)

In [82]:
# Predict with the whole cross-validation ensemble rather than only the model of
# the last fold: average the class probabilities of every model in `models` and
# pick the most probable class for each test row.
# Assumes all fold models were fitted on the same set of classes, so that their
# probability columns line up — TODO confirm.
test_matrix = np.asarray(test_features)
mean_proba = np.mean(
    [fold_model.predict_proba(test_matrix) for fold_model in models], axis=0
)
y_pred = np.asarray(models[0].classes_)[mean_proba.argmax(axis=1)]

In [84]:
from sklearn.metrics import f1_score

# Micro-averaged F1 of the predictions against the test labels.
micro_f1 = f1_score(test_labels, y_pred, average="micro")
print(micro_f1)

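The micro F1-score (short for micro-averaged F1 score) is commonly used to assess the quality of multi-class and multi-label classification problems.
It measures the F1-score of the aggregated contributions of all classes: true positives, false positives and false negatives are summed over every class before precision, recall and F1 are computed. For single-label classification it is therefore equal to accuracy.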