In [None]:
# https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb#scrollTo=HMqQTafXEaei

In [None]:
!nvidia-smi

In [None]:
from transformers import get_linear_schedule_with_warmup, RobertaConfig, RobertaTokenizerFast, RobertaModel, RobertaPreTrainedModel
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from collections import defaultdict
from textwrap import wrap
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
import re, os

# Notebook display configuration: render figures inline at "retina" resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Global seaborn theme, then a custom six-colour palette that replaces 'muted'.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default figure size (width, height) for every plot in this notebook.
rcParams['figure.figsize'] = 12, 8

In [None]:
# Central run configuration. Later cells also write results back into this
# dict (data sizes, test accuracy, ROC AUC) and the last cell displays it.
hyperparams = {
    'BATCH_SIZE': 16,
    'EPOCHS': 32,
    'RANDOM_SEED': 42,
    'MAX_LEN' : 128,  # length every tweet is padded / truncated to by the tokenizer
    'lr' : 2e-5,  # learning rate passed to AdamW
    # NOTE(review): hard-coded GPU index; presumably breaks on hosts with
    # fewer than three GPUs — confirm before running elsewhere.
    'cuda' : 'cuda:2'
}

In [None]:
# Seed the NumPy and PyTorch global random number generators.
np.random.seed(hyperparams['RANDOM_SEED'])
torch.manual_seed(hyperparams['RANDOM_SEED'])
# Use the configured GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device(hyperparams['cuda'] if torch.cuda.is_available() else "cpu")
# Every artefact of this run (checkpoint, plots, CSVs) is written below one
# directory named after the device type and the start time.
current_time = datetime.now().strftime("%Y%m%d-%I_%M%p")
outFilepath = 'out/drug_classifier/' + device.type + current_time +'/'
# exist_ok=True makes re-running this cell within the same minute a no-op
# instead of needing a try/except around FileExistsError.
os.makedirs(outFilepath, exist_ok=True)
outFilepath

In [None]:
# Load the manually coded tweets. Later cells read the 'Substance', 'Use',
# 'Intent' and 'Tweet' columns of this frame.
df1 =pd.read_csv('/data/jmharja/projects/robertaForTweetAnalysis/input/Tweets_Spring_Summer_2021_coded.csv',lineterminator='\n',skipinitialspace=True,)
# df1['label']= 1
# df1.drop(df1.columns[[0, 2,3,4]], axis=1, inplace=True)
df1.shape

In [None]:
# Positive examples: none of the three codes is 'X'.
# Negative examples: all three codes are 'X'.
# Rows that mix 'X' with other codes end up in neither frame.
# Columns at positions 0, 2, 3 and 4 are dropped before the label is attached.
df1_pos = (
    df1.loc[lambda d: d['Substance'].ne('X') & d['Use'].ne('X') & d['Intent'].ne('X')]
       .pipe(lambda d: d.drop(columns=d.columns[[0, 2, 3, 4]]))
       .assign(label=1)
)
df1_neg = (
    df1.loc[lambda d: d['Substance'].eq('X') & d['Use'].eq('X') & d['Intent'].eq('X')]
       .pipe(lambda d: d.drop(columns=d.columns[[0, 2, 3, 4]]))
       .assign(label=0)
)
df1_neg.shape,  df1_pos.shape

In [None]:
# prediction result reviewed by Dr. King added for re-training
# df2 =pd.read_csv('/data/jmharja/projects/robertaForTweetAnalysis/input/ReviewedPrediction.csv',lineterminator='\n', skipinitialspace=True,)
# df2.rename(columns = {'tweet':'Tweet'}, inplace = True)
# df2 = df2.loc[:, ~df2.columns.str.contains('^Unnamed')]
# df2_pos = df2.loc[(df2['type'] == 1) | (df2['use'] == 1) | (df2['intent'] ==1)]
# df2_neg = df2.loc[(df2['type'] != 1) & (df2['use'] != 1) & (df2['intent'] !=1)]
# df2_pos = df2_pos.drop(df2_pos.columns[[2,3,1]], axis=1).copy()
# df2_neg = df2_neg.drop(df2_neg.columns[[2,3,1]], axis=1).copy()
# df2_pos['label']= 1
# df2_neg['label']= 0
# df2_neg.shape,  df2_pos.shape

In [None]:
# iteration (predicted result added for re-training)
# df3 =pd.read_csv('/data/jmharja/projects/robertaForTweetAnalysis/input/ReviewedPrediction_1.csv',lineterminator='\n', skipinitialspace=True,)
# df3_pos= df3.loc[df3['label']==1]
# df3_neg = df3.loc[df3['label']==0]
# df3_neg.shape, df3_pos.shape

In [None]:
# # predicted result from 202212 filtered by chatgpt added for retraining
# df4_pos =pd.read_csv('test_result/chatgpt/all_pos.csv',lineterminator='\n', skipinitialspace=True,)
# df4_neg =pd.read_csv('test_result/chatgpt/all_neg.csv',lineterminator='\n', skipinitialspace=True,)
# df4_pos.drop(df4_pos.columns[[0]], axis=1, inplace=True)
# df4_neg.drop(df4_neg.columns[[0]], axis=1, inplace=True)
# df4_pos['label']= 1
# df4_neg['label']= 0
# df4_neg.shape, df4_pos.shape

In [None]:
# df_pos = pd.concat([df1_pos, df2_pos, df3_pos, df4_pos])
# df_neg = pd.concat([df1_neg, df2_neg, df3_neg, df4_neg])
# df_pos = pd.concat([df1_pos, df2_pos, df3_pos])
# df_neg = pd.concat([df1_neg, df2_neg, df3_neg])
# df_neg.shape,  df_pos.shape

In [None]:
# df1 = df_pos[:1150]
# df2 = df_neg[:4575]
# df = pd.concat([df1,df2])
# df.shape

In [None]:
# df2 =pd.read_csv('/users/kent/jmaharja/drugAbuse/input/2020_01_31_CleanedTweets.csv',
#                 lineterminator='\n',
#                 skipinitialspace=True
#                 )
# df2.drop(df2.columns[[0, 1]], axis=1, inplace=True)
# df2.rename({'text': 'Tweet'}, axis=1, inplace=True)
# df2['label'] = 0
# df4 = df2[1160000:]
# df2 = df2[:800]
# df4 = df4[:200]
# df2.shape, df4.shape

In [None]:
# df_test =pd.read_csv('/users/kent/jmaharja/drugAbuse/finetune/test_new.csv',
# #                    error_bad_lines=False,
#                 lineterminator='\n',
#                 skipinitialspace=True
#                 )
# df_test.drop(df_test.columns[[1, 2, 3]], axis=1, inplace=True)
# # df_test.rename({'text': 'Tweet'}, axis=1, inplace=True)
# # df_test['label']=/ 1
# df_test = df_test.rename(columns={df_test.columns[1]: 'label'})
# df_test_neg = df_neg[4575:]
# df_test_pos = df_pos[1150:]
# df3 = pd.concat([df_test_pos,df_test_neg])
# df3.shape

In [None]:
# Final labelled corpus: positives followed by negatives. No ignore_index is
# passed, so each row keeps the index value it had in df1.
df = pd.concat([df1_pos,df1_neg])
df.shape

# DATA PREPROCESSING

In [None]:
# Create the tokenizer from a trained one, loaded from a local folder.
# NOTE(review): `max_len` looks like a legacy keyword; newer transformers
# releases call it `model_max_length` — confirm against the installed version.
tokenizer_folder = '/data/jmharja/projects/robertaForTweetAnalysis/output/oct2022/TokRoBERTa'
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=hyperparams['MAX_LEN'])

In [None]:
class Tweet_DataSet(Dataset):
    """Map-style dataset that cleans tweets once and tokenizes them on access.

    Args:
        data: DataFrame with a ``Tweet`` text column and a ``label`` column.
        tokenizer: Object exposing ``encode_plus`` whose result has
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` attributes.
        max_len: Length every encoded tweet is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        # Work on a private copy: the caller's frame stays untouched and
        # pandas has no reason to warn when `data` is a slice of another frame.
        self.data = data.copy()
        self.data['Tweet'] = self.data['Tweet'].map(self.cleaner)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def cleaner(self, tweet):
        """Return `tweet` without @mentions and links, whitespace collapsed."""
        # Drop "@name" mentions (letters and digits only).
        tweet = re.sub("@[A-Za-z0-9]+","", tweet)
        # Drop what is left of a mention plus http(s):// and www links.
        tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)
        # Collapse runs of whitespace and trim both ends.
        return " ".join(tweet.split())

    def __getitem__(self, index:int):
        """Return one example as a dict of the cleaned text and model tensors."""
        data_row = self.data.iloc[index]
        tweet = data_row.Tweet
        labels = data_row['label']
        # Use the tokenizer and length given to this dataset rather than
        # notebook globals. padding='max_length' already pads every example,
        # so the deprecated pad_to_max_length flag is not passed.
        encoding = self.tokenizer.encode_plus(tweet,
                                              max_length=self.max_len,
                                              truncation=True,
                                              add_special_tokens=True,
                                              padding='max_length',
                                              return_token_type_ids=True)

        return {
            'tweet_text': tweet,
            'input_ids': torch.tensor(encoding.input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(encoding.attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(encoding.token_type_ids, dtype=torch.long),
            'targets': torch.tensor(labels, dtype=torch.long)
        }

In [None]:
# Hold out 10% of the corpus, then halve the hold-out into validation and
# test sets (roughly 90/5/5). Both splits use the same fixed seed.
# NOTE(review): no `stratify` argument is passed, so class proportions may
# differ between the three sets.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=hyperparams['RANDOM_SEED'])
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=hyperparams['RANDOM_SEED'])
# df_test = df3
# Keep the three frame shapes next to the hyperparameters for the run summary.
hyperparams['data_size'] = df_train.shape, df_val.shape, df_test.shape
df_train.shape, df_val.shape, df_test.shape

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a DataFrame in a Tweet_DataSet and return a DataLoader over it.

  Args:
    df: Frame handed to Tweet_DataSet.
    tokenizer: Tokenizer handed to Tweet_DataSet.
    max_len: Maximum sequence length handed to Tweet_DataSet.
    batch_size: Number of examples per batch.
    shuffle: Reshuffle the examples at the start of every epoch. Defaults to
      False so existing positional calls keep their fixed order.
    num_workers: Number of loader worker processes (4, as before, by default).

  Returns:
    A torch DataLoader yielding the dict batches produced by Tweet_DataSet.
  """
  ds = Tweet_DataSet(df,tokenizer=tokenizer,max_len=max_len)
  return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

# Only the training loader is shuffled, so every epoch sees the batches in a
# new order; validation and test keep a fixed order so predictions line up
# with their rows.
train_data_loader = create_data_loader(df_train, tokenizer, hyperparams['MAX_LEN'], hyperparams['BATCH_SIZE'], shuffle=True)
val_data_loader = create_data_loader(df_val, tokenizer,hyperparams['MAX_LEN'], hyperparams['BATCH_SIZE'])
test_data_loader = create_data_loader(df_test, tokenizer, hyperparams['MAX_LEN'], hyperparams['BATCH_SIZE'])

In [None]:
class TweetModel(RobertaPreTrainedModel):
    """RoBERTa encoder followed by a small classification head.

    The head reads the encoder output at the first sequence position and
    applies Linear -> ReLU -> Dropout(0.5) -> Linear.

    Args:
        conf: RobertaConfig for the encoder; ``conf.hidden_size`` also sets
            the width of the head, so the head always matches the encoder.
        n_classes: Number of output logits.
        pretrained_path: Directory the encoder weights are loaded from.
            Defaults to the checkpoint this notebook was written against.
    """

    def __init__(self, conf, n_classes,
                 pretrained_path='/data/jmharja/projects/robertaForTweetAnalysis/output/oct2022/RoBERTaMLM/'):
        super().__init__(conf)
        self.roberta = RobertaModel.from_pretrained(pretrained_path, config=conf)
        self.drop_out = torch.nn.Dropout(0.5)
        hidden_size = conf.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.classifier = torch.nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw (unnormalised) class logits for a batch."""
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # First element of the encoder output, taken at sequence position 0.
        pooled = encoder_out[0][:, 0]
        pooled = self.pre_classifier(pooled)
        pooled = torch.nn.functional.relu(pooled)
        pooled = self.drop_out(pooled)
        return self.classifier(pooled)


In [None]:
# Encoder architecture.
# NOTE(review): these values presumably have to match the checkpoint that
# TweetModel loads — confirm against the pretraining run.
config = RobertaConfig(
    vocab_size=8192,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    hidden_size=768,
    pad_token_id=1
)

# Two output classes: label 0 (negative) and label 1 (positive).
model = TweetModel(config, 2)
model = model.to(device)

In [None]:
# AdamW over all model parameters (encoder and head).
optimizer = torch.optim.AdamW(model.parameters(), lr=hyperparams['lr'])
# The scheduler is stepped once per batch, so its horizon is
# batches-per-epoch * epochs; no warm-up steps are used.
total_steps = len(train_data_loader) * hyperparams['EPOCHS']
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# Cross-entropy on the raw logits returned by TweetModel.forward.
loss_fn = torch.nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Train the model for one pass over `data_loader`.

  Each batch is a dict with "input_ids", "attention_mask", "token_type_ids"
  and "targets" tensors. Gradients are clipped to a norm of 1.0, and the
  optimizer and scheduler are stepped once per batch.

  Returns:
    (accuracy, mean_loss): accuracy is a float64 tensor equal to the number of
    correct predictions divided by `n_examples`; mean_loss is the mean of the
    per-batch losses.
  """
  model = model.train()
  batch_losses = []
  n_correct = 0

  for batch in data_loader:
    model_inputs = {
      key: batch[key].to(device)
      for key in ("input_ids", "attention_mask", "token_type_ids")
    }
    targets = batch["targets"].to(device)

    logits = model(**model_inputs)

    # Index of the largest logit per row is the predicted class.
    predicted = logits.max(dim=1).indices
    n_correct += torch.sum(predicted == targets)

    loss = loss_fn(logits, targets)
    batch_losses.append(loss.item())

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = n_correct.double()/n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score the model on `data_loader` without updating any weights.

  The model is put in eval mode and gradients are disabled. Batches have the
  same dict layout as in training.

  Returns:
    (accuracy, mean_loss): accuracy is a float64 tensor equal to the number of
    correct predictions divided by `n_examples`; mean_loss is the mean of the
    per-batch losses.
  """
  model = model.eval()
  batch_losses = []
  n_correct = 0

  with torch.no_grad():
    for batch in data_loader:
      model_inputs = {
        key: batch[key].to(device)
        for key in ("input_ids", "attention_mask", "token_type_ids")
      }
      targets = batch["targets"].to(device)
      logits = model(**model_inputs)
      predicted = logits.max(dim=1).indices
      n_correct += torch.sum(predicted == targets)
      batch_losses.append(loss_fn(logits, targets).item())

  accuracy = n_correct.double()/n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
# Main training loop: one train pass and one validation pass per epoch.
print(40*"*", 'Training')
# Per-epoch metrics; the accuracies are stored as tensors, the losses as floats.
history = defaultdict(list)
best_accuracy = 0
val_acc = 0 
EPOCHS =hyperparams['EPOCHS']
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train))
  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(model, val_data_loader,loss_fn, device, len(df_val))
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Keep only the weights of the best validation epoch; the single
  # checkpoint file is overwritten each time the accuracy improves.
  if val_acc > best_accuracy:
    # NOTE(review): curr_time is assigned but not used in this cell.
    curr_time = datetime.now().strftime("%Y_%m_%d-%I_%M%p")
    torch.save(model.state_dict(), outFilepath +'ckpt.bin')
    best_accuracy = val_acc

In [None]:
# Accuracy per epoch. The history entries are tensors, so each one is moved to
# the CPU and converted to NumPy before plotting.
plt.plot([tensor.cpu().numpy() for tensor in history['train_acc']], label='train accuracy')
plt.plot([tensor.cpu().numpy() for tensor in history['val_acc']], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1.05]);
# Save the figure next to the other artefacts of this run.
plt.savefig(outFilepath+'training_hist.png')

In [None]:
# Accuracy on the held-out test split, recorded in the run summary.
# NOTE(review): this scores the weights currently in memory (the last epoch);
# the best-validation checkpoint saved as ckpt.bin is not reloaded anywhere in
# this notebook — confirm that is intended.
test_acc, _ = eval_model(model, test_data_loader, loss_fn, device, len(df_test))
hyperparams['test_acc'] = test_acc.item()
test_acc.item()

# PREDICTION


In [None]:
def get_predictions(model, data_loader, device=None):
  """Run the model over a loader and collect texts, predictions and labels.

  Args:
    model: Module called with input_ids, attention_mask and token_type_ids
      keyword arguments and returning one row of class logits per example.
    data_loader: Iterable of dict batches with the keys "tweet_text",
      "input_ids", "attention_mask", "token_type_ids" and "targets". It must
      yield at least one batch.
    device: Device the batch tensors are moved to. When omitted, the device
      of the model's first parameter is used, so the function does not rely
      on a notebook-level variable.

  Returns:
    (texts, predictions, prediction_probs, real_values): the list of tweet
    texts, the predicted class per example, the softmax probabilities with one
    row per example, and the target labels. The three tensors are on the CPU.
  """
  model = model.eval()
  if device is None:
    device = next(model.parameters()).device

  review_texts = []
  pred_batches = []
  prob_batches = []
  target_batches = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      token_type_ids = d["token_type_ids"].to(device)
      targets = d["targets"].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
      _, preds = torch.max(outputs, dim=1)
      probs = torch.nn.functional.softmax(outputs, dim=1)

      review_texts.extend(d["tweet_text"])
      # Keep one CPU tensor per batch instead of one 0-d tensor per example.
      pred_batches.append(preds.cpu())
      prob_batches.append(probs.cpu())
      target_batches.append(targets.cpu())

  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(prob_batches)
  real_values = torch.cat(target_batches)
  return review_texts, predictions, prediction_probs, real_values

In [None]:
# NOTE: this scores the *validation* loader. The variables are named y_test /
# y_pred, but until get_predictions is called again they hold validation results.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model, val_data_loader)

In [None]:
# ROC analysis for the predictions currently held in y_test / y_pred_probs.
# The AUC is computed from the class-1 probabilities, the same scores the
# curve is drawn from; hard 0/1 predictions would only describe a single
# operating point of that curve.
logit_roc_auc = roc_auc_score(y_test.numpy(), y_pred_probs[:, 1].numpy())
fpr, tpr, thresholds = roc_curve(y_test.numpy(), y_pred_probs[:, 1].numpy())
plt.figure()
plt.plot(fpr, tpr, label='BERT (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig(outFilepath +'Log_ROC.png')
plt.show()
# Record the score in the run summary, then end on the value so the cell displays it.
hyperparams['logit_roc_auc'] = logit_roc_auc
logit_roc_auc

In [None]:
# Re-run prediction on the held-out test loader; this overwrites the
# validation results stored under the same variable names.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model, test_data_loader)

In [None]:
# Build a table with one row per test tweet: text, true label, predicted label.
# NOTE(review): y_pred_probs_pd and npa are assigned here but not read again
# in this cell — presumably leftovers; confirm before removing.
y_pred_probs_pd = [y.numpy() for y in y_pred_probs]
someListOfLists = list(zip(y_review_texts, y_test.numpy(), y_pred.numpy() ))
npa = np.asarray(someListOfLists)
dff = pd.DataFrame(someListOfLists, columns = ['Tweet', 'Real', 'Predicted'  ])
dff

In [None]:
# Write the test-set predictions into this run's output directory.
dff.to_csv( outFilepath + 'test_pred.csv')

In [None]:
# target_names are matched to the labels in the order given by `labels`, so
# label 0 (negative) is named first: 0 -> 'N', 1 -> 'Y'. Passing `labels`
# also keeps both rows in the report when one class is absent.
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['N', 'Y']))

In [None]:
def show_confusion_matrix(confusion_matrix):
  """Draw `confusion_matrix` as an annotated heatmap and save it as cm.png.

  The matrix is expected to hold integer counts (cells are formatted with
  "d"). The figure is written into the run's output directory.
  """
  heat = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  # Row labels stay horizontal, column labels are tilted by 30 degrees.
  for axis, angle in ((heat.yaxis, 0), (heat.xaxis, 30)):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
  plt.ylabel('Actual ')
  plt.xlabel('Predicted ')
  plt.savefig( outFilepath + 'cm.png')


# sklearn's confusion_matrix takes (y_true, y_pred): rows are the true labels
# and columns the predictions, which is what the 'Actual' / 'Predicted' axis
# titles of the heatmap claim. labels=[0, 1] fixes the order, so row/column 0
# is the negative class ('N') and row/column 1 the positive class ('Y').
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=['N', 'Y'], columns=['N', 'Y'])
show_confusion_matrix(df_cm)

# PREDICTION ON REAL DATA

In [None]:
# Load the unlabelled tweets to be scored. Columns at positions 0, 1, 2 and 4
# are dropped and the first remaining column is renamed to 'Tweet'.
df_pred =pd.read_csv('/users/kent/jmaharja/drugAbuse/input/2020_01_01.csv',lineterminator='\n',   skipinitialspace=True)
df_pred.drop(df_pred.columns[[0, 1,2, 4]], axis=1, inplace=True)
df_pred = df_pred.rename(columns={df_pred.columns[0]: 'Tweet'})
# Placeholder label: Tweet_DataSet reads a 'label' column, but these tweets
# have no ground truth, so every row is set to 1.
df_pred['label']= 1

In [None]:
## df_pred =pd.read_csv('/users/kent/jmaharja/drugAbuse/input/2022_11.csv',lineterminator='\n',skipinitialspace=True)
# df_pred = pd.read_fwf('/users/kent/jmaharja/drugAbuse/input/2022_11.csv')
# df_pred['label']= 1

In [None]:
df_pred.shape

In [None]:
# from sklearn.utils import shuffle
# df_pred = shuffle(df_pred)

In [None]:
# Score one 10,000-row window per run; the commented lines are earlier windows.
# NOTE(review): the slice overwrites df_pred, so running this cell twice
# without reloading the CSV slices the already-sliced frame.
# df_pred = df_pred[:150000]
# df_pred = df_pred[150000:160000]
# df_pred = df_pred[160000:170000]
# df_pred = df_pred[170000:180000]
# df_pred = df_pred[180000:190000]
# df_pred = df_pred[190000:200000]
# df_pred = df_pred[230000:240000]
df_pred= df_pred[250000:260000]
# df_pred = df_pred[1200:2000]

In [None]:
# MAX_LEN and BATCH_SIZE are not defined as standalone names anywhere in this
# notebook; read them from the shared hyperparams dict like the other loaders.
pred_data_loader = create_data_loader(df_pred, tokenizer, hyperparams['MAX_LEN'], hyperparams['BATCH_SIZE'])

In [None]:
# Score the unlabelled window. y_test holds the placeholder labels set when
# df_pred was loaded, not ground truth.
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model, pred_data_loader)

In [None]:
# One row per scored tweet: text, placeholder label, predicted class, the
# probability of class 1, and the full probability vector.
y_pred_probs_pd = [y.numpy() for y in y_pred_probs]
# Column 1 is selected directly; slicing with 1: and squeezing collapses to a
# 0-d array when only one tweet is scored, which zip cannot iterate.
someListOfLists = list(zip(y_review_texts, y_test.numpy(), y_pred.numpy(), y_pred_probs[:, 1].numpy(), y_pred_probs_pd ))
# The rows mix scalars with a per-row array, so they are passed to pandas
# as-is and not converted to a NumPy array first (newer NumPy releases reject
# such ragged input).
dff = pd.DataFrame(someListOfLists, columns = ['tweet', 'Real', 'Predicted', 'Pred-prob', 'All Pred-probs' ])
dff

In [None]:
# The 'Real' labels of df_pred are placeholders (every row was set to 1), so
# this matrix only shows how the predictions are distributed; it is not an
# accuracy measure. confusion_matrix takes (y_true, y_pred), and labels=[0, 1]
# guarantees a 2x2 result even when only one class occurs.
# Row/column 0 is the negative class ('N'), row/column 1 the positive ('Y').
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=['N', 'Y'], columns=['N', 'Y'])
show_confusion_matrix(df_cm)

In [None]:
dff[dff['Predicted']==1]

In [None]:
dff[dff['Predicted']==0]

In [None]:
# Save the text of every tweet predicted as class 1 into the run directory.
ones = dff[dff['Predicted']==1]['tweet']
ones.to_csv( outFilepath+'new_test_pred_ones.csv')

In [None]:
# zeros = dff[dff['Predicted']==0]['tweet']
# zeros.to_csv('test_result/2023_neg.csv')

In [None]:
# Save every scored tweet with its predicted class (the closing quote of the
# file name was missing, which made this cell a syntax error).
dff[[ "tweet", "Predicted"]].to_csv(outFilepath+"new_test_pred.csv")

In [None]:
current_time

# POST REVIEW

In [None]:

hyperparams