In [1]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging
import os, sys
import re, string

In [2]:
from verstack.stratified_continuous_split import scsplit 

In [3]:
import torch
use_cuda = True
device = torch.device("cuda" if use_cuda else "cpu")

In [4]:
device

device(type='cuda')

In [5]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [6]:
path = os.getcwd()

In [7]:
path_data_train = path + "\\covidChallenge\\data\\train.csv"

### Preparing dataset

In [8]:
# Load the training data
train_data = pd.read_csv(path_data_train)

In [9]:
#Non-relevant features, can not be used for SVMs models
train_data.drop('timestamp', axis=1, inplace=True)
train_data.drop('user_mentions', axis=1, inplace=True)
train_data.drop('urls', axis=1, inplace=True)
train_data.drop('hashtags', axis=1, inplace=True)
#train_data.drop('text', axis=1, inplace=True)
train_data.drop('id', axis=1, inplace=True)
train_data.drop('user_verified', axis=1, inplace=True)
train_data.drop('user_statuses_count', axis=1, inplace=True)
train_data.drop('user_followers_count', axis=1, inplace=True)
train_data.drop('user_friends_count', axis=1, inplace=True)

In [10]:
train_data.sample(5)

Unnamed: 0,retweet_count,text
217767,3,"Did 30,000 people spring back to life like Laz..."
19281,63,Thread 👇 https://t.co/R62Ungigv0
334354,0,"No, just assisted in spreading the virus."
536524,1,According to our analysis of the impact of #CO...
231838,0,"Aside from the people dying, existential dread..."


Before we visualize our text data I wanted to make it look better with some general helper functions to clear out things like: urls, emojis, html tags, punctuations.

In [11]:
# Some basic helper functions to clean text by removing urls, emojis, html tags and punctuations.

def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)


def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

In [12]:
train_data['text'] = train_data['text'].apply(lambda x: remove_URL(x))
train_data['text'] = train_data['text'].apply(lambda x: remove_emoji(x))
train_data['text'] = train_data['text'].apply(lambda x: remove_html(x))
train_data['text'] = train_data['text'].apply(lambda x: remove_punct(x))

In [13]:
train_data.sample(5)

Unnamed: 0,retweet_count,text
368733,12,Besides strengthening Health systems working ...
204391,0,WOW I thought Americans could get free treatme...
188761,1,Heres when every state plans to end their coro...
407834,0,Now why would a friendly nation like China do ...
652591,0,Some people have it rough indeed


In [14]:
X_train, X_test, y_train, y_test = scsplit(train_data, train_data['retweet_count'], stratify=train_data['retweet_count'], train_size=0.7, test_size=0.3)

In [15]:
print("\t Train dataset shape: ", X_train.shape)
print("\t Test dataset shape: ", X_test.shape)

	 Train dataset shape:  (466043, 2)
	 Test dataset shape:  (199734, 2)


In [16]:
X_train.sample(5)

Unnamed: 0,retweet_count,text
77386,0,China wants 4 more years of Trump so they can ...
213309,1,Covid19 could have stamped a person “uninsurab...
502463,0,Hope this Is true Cunt
75913,0,Sounds like a plot
23813,0,For the first time I hear positive news from you


In [17]:
X_test.sample(5)

Unnamed: 0,retweet_count,text
573793,2,Robin73202976 You too undertake preventive mea...
84544,4,Looking to speak to anyone who has signed up f...
641007,0,The lockdown slowed the spread of the virus\nW...
294684,1,On live TV Asmali77 remarked that business own...
437496,0,Good more of this please


Re-labeling columns before feeding them into the transformer

In [18]:
X_train.columns = ["labels", "text"]
X_test.columns = ["labels", "text"]

In [19]:
X_train.sample(5)

Unnamed: 0,labels,text
301616,0,One of my friends there says city life is over
362254,0,We need crucifixion by virus Such idiot’s
22864,0,I don’t like how these Oyo numbers are using s...
307302,0,2020 Recap\nWW3 tensions\nAustralia Burning\nD...
248520,0,Wai what’s happening between kano and kaduna s...


In [20]:
X_test.sample(5)

Unnamed: 0,labels,text
110353,0,Amen sis Proud of you
403881,117,Campions de lOKlliga \n\n Ens hauria agradat ...
630943,1,COVID19 \n\nPlease think of others\n\nIts real...
200925,0,Pope Franciss Second Prayer to Mary in the tim...
101515,0,Locally made PPEs because our health workers a...


In [21]:
device

device(type='cuda')

In [22]:
torch.cuda.empty_cache()

# Enabling regression
# Setting optional model configuration
model_args = ClassificationArgs()
model_args.num_train_epochs = 1
model_args.regression = True

In [23]:
# Create a ClassificationModel
model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=1,
    args=model_args
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [24]:
# Train the model
model.train_model(X_train)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=466043.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 1'), FloatProgress(value=0.0, max=58256.0), HTML(value='')))





RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 2.00 GiB total capacity; 1.29 GiB already allocated; 8.88 MiB free; 1.30 GiB reserved in total by PyTorch)

In [None]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(X_test)