# Text Classification using RoBERTa

### Using RoBERTa base and getting AUROC values of ~0.93 with a 100K downsampled/upsampled data set.

In [1]:
#Installs and imports
#!conda create -y -n st python pandas tqdm
#!conda activate st && conda init bash

## for mac
#!conda install -y -n st pytorch torchvision torchaudio -c pytorch-nightly

## for linux
#!conda install -y -n tf pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge
#!python3 -m pip install --upgrade transformers simpletransformers wandb

In [2]:
#!pip install simpletransformers

In [3]:
from transformers import LayoutLMv2Processor
import simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs

from simpletransformers.classification import ClassificationModel
from transformers import RobertaForSequenceClassification
import pandas as pd
import logging
import sklearn
import numpy as np

  device: torch.device = torch.device("cpu"),


In [4]:
#Log Results
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

### 2. Import and Transform Data

In [5]:
#Import Train and Test Datasets

df_ham = pd.read_csv('../texts/hamCleanScrubbed.txt', sep='\t', names=['labels','text']).drop_duplicates()
df_spam = pd.read_csv('../texts/spamCleanScrubbed.txt', sep='\t', names=['labels','text']).drop_duplicates()

## Drop header if needed
df_ham = df_ham.iloc[1: , :]
df_spam = df_spam.iloc[1: , :]

print('ham dataset:','\n',df_ham.head(),'\n\n','spam dataset:','\n',df_spam.head())

ham dataset: 
   labels                                               text
1    Ham  11616056 has a WIC appt on {Month} {Day} {Time...
2    Ham  1st of the Holiday Stress Busting Series, Art ...
3    Ham  21417272 has a WIC appt on {Month} {Day} {Time...
4    Ham  2 flu shots for  has an upcoming appt on {Mont...
5    Ham  31018019 has a WIC appt on {Month} {Day} at {T... 

 spam dataset: 
   labels                                               text
1   Spam  {FirstName} has a WIC appt with People's Healt...
2   Spam  {FirstName} has missed a WIC appt on  {Weekday...
3   Spam  {FirstName} missed a WIC appt. Please call {Ph...
4   Spam   {FirstName} has missed a WIC appt on {Weekday...
5   Spam  {FirstName} has a WIC appt with People's Healt...


In [6]:
## convert text to string
df_ham['labels'] = df_ham['labels'].str.lower()
df_spam['labels'] = df_spam['labels'].str.lower()

## change ham to 0 and spam to 1
df_ham['labels'] = df_ham['labels'].replace({'ham':'0', 'spam':'1'})
df_spam['labels'] = df_spam['labels'].replace({'ham':'0', 'spam':'1'})

## solve potential int errors
df_ham['labels'] = df_ham['labels'].astype(pd.Int64Dtype())
df_spam['labels'] = df_spam['labels'].astype(pd.Int64Dtype())

df_ham.reset_index(drop=True, inplace=True)
df_spam.reset_index(drop=True, inplace=True)

## confirm there's no ham in spam and vice versa
df_ham = df_ham[df_ham['labels'] == 0]
df_spam = df_spam[df_spam['labels'] == 1]

## Resample data
from sklearn.utils import resample
## Downsample majority class
df_ham_downsampled = resample(df_ham, 
                                replace=False,   ##replace=False when downsampling, True when upsampling
                                n_samples=1000)
#Upsample minority class
df_spam_upsampled = resample(df_spam, 
                                replace=True,     
                                n_samples=1000)

# Combine minority class with downsampled majority class
df_up_down_sampled = pd.concat([df_ham_downsampled, df_spam_upsampled]).drop_duplicates()

## create train/test split
train = np.random.rand(len(df_up_down_sampled)) < 0.75
df_train = df_up_down_sampled[train]

df_train.reset_index(drop=True, inplace=True)
df_train.loc[:, 'id'] = df_train.index + 1
df_train['id'] = df_train['id'].astype(pd.Int64Dtype())
df_train = df_train[['id','text','labels']]
df_train.to_csv('train.csv', index=False)

df_test = df_up_down_sampled[~train]
df_test.reset_index(drop=True, inplace=True)
df_test.loc[:,'id'] = len(df_train.index) + df_test.index + 1
df_test['id'] = df_test.loc[:,'id'].astype(pd.Int64Dtype())
df_test = df_test[['id','text','labels']]
df_test.to_csv('test.csv', index=False)

## print train/test data sizes
print("training rows: ",len(df_train.index))
print("testing rows: ",len(df_smalltest.index))

## drop individual tables to free memory
del [[df_ham,df_spam,df_ham_downsampled,df_spam_upsampled,df_up_down_sampled]]

ValueError: invalid literal for int() with base 10: '\nham\t4{firstname} has a wic appt on {month} {day} {time} am for a wic at 7640 greenback . call {phone} or email {email} for questions."'

In [None]:
## Drop ID columns for training/testing
df_train.drop(columns = 'id')
df_test.drop(columns = 'id')
print(df_train.head(),df_test.head())

In [None]:
## Check dataframe type for model
df_train.info()
df_test.info()

### 3. Create Classification Model

In [None]:
#Set and create the classification model we want to use

model_args = ClassificationArgs()
model_args.num_train_epochs = 1
model_args.learning_rate = 2e-5
model_args.train_batch_size = 16
model_args.eval_batch_size = 16
model_args.manual_seed = 42
model_args.optimized = "AdamW"
model_args.adam_epsilon = 1e-8
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True

#model = ClassificationModel('roberta', 'mariagrandury/roberta-base-finetuned-sms-spam-detection', num_labels=2, weight=[1 , len(df_ham) / len(df_spam_upsampled)], use_cuda=True, args=model_args)
model = ClassificationModel('roberta', 'mariagrandury/roberta-base-finetuned-sms-spam-detection', num_labels=2, weight=[1 , 1], args=model_args)

In [None]:
#Train the classification model using training data
model.train_model(df_train)

### 4. Evaluate the Model

- Our trained model was able to produce an accuracy of 99.3%. It accurately identified 656 spam messages, accurately identified 4311 non-spam messages, falsely identified 16 messages as spam, and 17 messages as non-spam.

In [None]:
#Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(df_test, acc=sklearn.metrics.accuracy_score)

### 5. Make Predictions on Test Data and Save to CSV

In [None]:
#Run the model on test data
print(model.config.id2label)

predictions, raw_outputs = model.predict(df_test['text'].astype(str).values.tolist())

print(predictions, raw_outputs)

from scipy.special import softmax
probabilities = softmax(raw_outputs)

In [None]:
#Export Predictions
preds = pd.DataFrame({'id': df_test.index, 'predicted': predictions})
preds.head()
preds.to_csv('predictions.csv', index=False)