In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import wget
from zipfile import ZipFile
import string
import preprocessor.api as p
import warnings
warnings.filterwarnings("ignore")
import re



In [21]:
# Per-target lookup: short target name -> (lexicon column name, numeric id).
lexs = dict(
    atheism=('Lexicon_Atheism', 1),
    climate=('Lexicon_Climate', 2),
    feminism=('Lexicon_Feminism', 3),
    hillary=('Lexicon_Hillary', 4),
    abortion=('Lexicon_Abortion', 5),
)

# Target handled by this run; the lexicon column, id and file names are derived from it.
target_word = 'atheism'
lexicon_col, target_col = lexs.get(target_word)
filename1 = f'raw_train_{target_word}.csv'
filename2 = f'raw_val_{target_word}.csv'
filename3 = f'raw_test_{target_word}.csv'
filename_lexicon = 'stance_lexicon_emnlp.csv'

In [2]:
# Seed NumPy's and PyTorch's global random number generators with one shared constant.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
# Last expression of the cell: its return value (a torch Generator) is what the cell displays.
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fa5ad5234f0>

In [3]:
# Use the first CUDA device when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [None]:
#url = 'http://alt.qcri.org/semeval2016/task6/data/uploads/stancedataset.zip'
#filename = wget.download(url)
#with ZipFile(filename, 'r') as zipObj:
#    zipObj.extractall()

In [None]:
def upload_data(path_train, path_test):
    """Split the training CSV, filter the test CSV, and write all three parts to disk.

    Parameters
    ----------
    path_train : path of the training CSV; it is split 85% / 15% into train and
        validation parts with ``random_state=1``.
    path_test : path of the test CSV; rows whose 'Target' is 'Donald Trump' are removed.

    Returns
    -------
    (raw_train, raw_val, raw_test) : the three DataFrames, which are also saved in the
        working directory as 'raw_train.csv', 'raw_val.csv' and 'raw_test.csv'
        (without the index column).
    """
    # Train / validation parts come from the same source file.
    train_frame = pd.read_csv(path_train, engine='python')
    raw_train, raw_val = train_test_split(train_frame, test_size=0.15, random_state=1)
    for part, out_name in ((raw_train, 'raw_train.csv'), (raw_val, 'raw_val.csv')):
        part.to_csv(out_name, index=False)

    # Test part: keep every row except those about the 'Donald Trump' target.
    test_frame = pd.read_csv(path_test, engine='python')
    keep_rows = test_frame['Target'] != 'Donald Trump'
    raw_test = test_frame[keep_rows]
    raw_test.to_csv('raw_test.csv', index=False)

    return raw_train, raw_val, raw_test

In [None]:
# Input CSVs for the split step; the three resulting frames are kept for the per-target export.
path_train, path_test = 'StanceDataset/train.csv', 'StanceDataset/test.csv'

raw_train, raw_val, raw_test = upload_data(path_train, path_test)

In [None]:
# Value of the 'Target' column -> short name used in the per-target file names.
tar = {
    'Legalization of Abortion': 'abortion',
    'Feminist Movement': 'feminism',
    'Hillary Clinton': 'hillary',
    'Climate Change is a Real Concern': 'climate',
    'Atheism': 'atheism',
}

# One train, validation and test CSV per target: 3 splits * 5 targets = 15 files.
for target_label, short_name in tar.items():
    for prefix, split_frame in (('raw_train', raw_train), ('raw_val', raw_val), ('raw_test', raw_test)):
        rows_for_target = split_frame[split_frame['Target'] == target_label]
        rows_for_target.to_csv(f'{prefix}_{short_name}.csv', index=False)

In [129]:
# Data Cleaning

def Data_Clean(strings):
    """Clean one tweet and return its words and selected punctuation joined by single spaces.

    Parameters
    ----------
    strings : str
        Raw tweet text.

    Returns
    -------
    str
        The text after ``p.clean`` (run with the 'urls', 'emojis' and 'reserved_words'
        options), reduced to runs of ASCII letters and the punctuation marks
        ``, . ! ? & / < > = $``, separated by single spaces. Digits, '#', '@' and
        every other character are dropped by the regular expression.
    """
    # Restrict the preprocessor to these three options before cleaning.
    p.set_options('urls','emojis','reserved_words')
    clean_data = p.clean(strings)
    # Tokenize: a token is either a run of ASCII letters or one of the listed punctuation marks.
    clean_data = re.findall(r"[A-Za-z]+|[,.!?&/<>=$]",clean_data)

    return " ".join(clean_data)

In [105]:
# Data Loading

def Load_Transform_Data(filename):
    """Read one stance CSV and integer-code its target, stance and sentiment columns.

    Columns are selected by position: 0 (tweet text), 1 (target), 2 (stance) and
    4 (sentiment); column 3 is skipped. The file is decoded as ISO-8859-1 and is
    parsed once (the previous version parsed it four times, once per column).

    Codes applied, each only inside its own column:
      * stance:    'FAVOR' -> 1, 'NONE' -> 2, 'AGAINST' -> 0
      * sentiment: 'pos' -> 1, 'other' -> 2, 'neg' -> 0
      * target:    'Atheism' -> 4, 'Climate Change is a Real Concern' -> 3,
                   'Feminist Movement' -> 2, 'Hillary Clinton' -> 1,
                   'Legalization of Abortion' -> 0
    Values outside these lists are left unchanged.

    Parameters
    ----------
    filename : path of the CSV file.

    Returns
    -------
    pandas.DataFrame with the four selected columns in file order, keeping the
    header names found in the file.
    """
    frame = pd.read_csv(filename, usecols=[0, 1, 2, 4], encoding='ISO-8859-1', engine='python')
    # usecols keeps file order, so the four names unpack as text, target, stance, sentiment.
    _, col_target, col_stance, col_sentiment = frame.columns

    frame[col_stance] = frame[col_stance].replace(['FAVOR','NONE','AGAINST'], [1,2,0])
    frame[col_sentiment] = frame[col_sentiment].replace(['pos','other','neg'], [1,2,0])
    frame[col_target] = frame[col_target].replace(
        ['Atheism','Climate Change is a Real Concern','Feminist Movement',
         'Hillary Clinton','Legalization of Abortion'], [4,3,2,1,0])

    return frame

In [None]:
#fnm1 = 'train.csv'
#fnm2 = 'test_A.csv'
#train = Load_Transform_Data(fnm1)
#df_train, df_val = train_test_split(train, test_size=0.15, random_state=RANDOM_SEED)
#df_test = Load_Transform_Data(fnm2)

In [135]:
# Load the train / validation / test CSVs of the selected target with integer-coded labels.
df_train = Load_Transform_Data(filename=filename1)
df_val = Load_Transform_Data(filename=filename2)
df_test = Load_Transform_Data(filename=filename3)

In [136]:
# Replace the raw tweet text of every split with its cleaned form (element-wise Data_Clean).
df_train['Tweet'] = df_train['Tweet'].map(Data_Clean)
df_val['Tweet'] = df_val['Tweet'].map(Data_Clean)
df_test['Tweet'] = df_test['Tweet'].map(Data_Clean)

In [6]:
# Name of the pretrained checkpoint; SentimentClassifier loads both of its BERT encoders from the same name.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
# Tokenizer loaded for that checkpoint; it is handed to the datasets to encode tweets.
tokenizer=BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [145]:
class GPReviewDataset(Dataset):
    """Dataset that yields one encoded tweet together with its labels and length.

    Parameters
    ----------
    doc : indexable of tweet texts.
    targets, stance, sentiment : indexables of integer codes, aligned with ``doc``.
    length : indexable of per-tweet lengths, aligned with ``doc``.
    tokenizer : object exposing ``encode_plus`` (here a BERT tokenizer).
    max_len : ``max_length`` handed to the tokenizer; inputs are padded to it.
    """

    def __init__(self, doc, targets, stance, sentiment, length, tokenizer, max_len):
        self.doc = doc
        self.targets = targets
        self.stance = stance
        self.sentiment = sentiment
        self.length = length
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of tweets."""
        return len(self.doc)

    def __getitem__(self, item):
        """Return a dict with the text, encoded inputs and label tensors of tweet ``item``.

        Keys: 'doc_text' (str), 'input_ids' and 'attention_mask' (flattened tokenizer
        tensors), 'targets', 'stance', 'sentiment' and 'length' (long tensors).
        """
        doc = str(self.doc[item])

        # TOKENIZERS_PARALLELISM is an environment variable, not an encode_plus
        # argument, so it is no longer passed here as a keyword.
        # NOTE(review): pad_to_max_length is kept for compatibility with the installed
        # transformers version; newer releases spell this padding='max_length',
        # truncation=True - confirm the version before switching.
        encoding = self.tokenizer.encode_plus(
            doc,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'doc_text': doc,
            # return_tensors='pt' yields a leading batch axis; flatten() removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(self.targets[item], dtype=torch.long),
            'stance': torch.tensor(self.stance[item], dtype=torch.long),
            'sentiment': torch.tensor(self.sentiment[item], dtype=torch.long),
            'length': torch.tensor(self.length[item], dtype=torch.long),
        }

In [147]:
def create_data_loader(df, tokenizer, max_len,length, batch_size, shuffle=False):
    """Wrap one split in a GPReviewDataset and return a DataLoader over it.

    Parameters
    ----------
    df : DataFrame with 'Tweet', 'Target', 'Stance' and 'Sentiment' columns.
    tokenizer : tokenizer handed to the dataset.
    max_len : maximum encoded length handed to the dataset.
    length : per-row lengths, aligned with the rows of ``df``.
    batch_size : number of rows per batch.
    shuffle : reshuffle the rows every epoch when True. Defaults to False, which
        keeps the original sequential order for existing callers.

    Returns
    -------
    torch.utils.data.DataLoader
    """
    ds = GPReviewDataset(
        doc=df['Tweet'].to_numpy(),
        targets=df['Target'].to_numpy(),
        stance=df['Stance'].to_numpy(),
        sentiment=df['Sentiment'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
        length=length,
    )

    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)

In [138]:
# Whitespace-separated word count of every tweet (first column of each split).
# NOTE(review): these are word counts, not tokenizer token counts, yet the attention
# functions use them as token positions for masking - confirm this is intended.
x_train_len, x_val_len, x_test_len = (
    np.array([len(tweet.split()) for tweet in split_frame.iloc[:, 0]])
    for split_frame in (df_train, df_val, df_test)
)

In [224]:
MAX_LEN = 50     # max_len handed to every dataset (encoded length per tweet)
BATCH_SIZE = 16  # rows per batch for all three loaders

train_data_loader = create_data_loader(
    df=df_train, tokenizer=tokenizer, max_len=MAX_LEN, length=x_train_len, batch_size=BATCH_SIZE)
val_data_loader = create_data_loader(
    df=df_val, tokenizer=tokenizer, max_len=MAX_LEN, length=x_val_len, batch_size=BATCH_SIZE)
test_data_loader = create_data_loader(
    df=df_test, tokenizer=tokenizer, max_len=MAX_LEN, length=x_test_len, batch_size=BATCH_SIZE)

In [230]:
# Hidden-layer widths: linear_stc_size / linear_sent_size size the layers in front of the
# stance / sentiment output heads of SentimentClassifier.
# NOTE(review): linear_tar_size is not referenced by the code visible in this notebook.
linear_tar_size, linear_stc_size, linear_sent_size = 150, 300, 250
# Weight of the stance loss in the combined loss; the sentiment loss gets (1 - lambd).
lambd = 0.7

In [231]:
def Attention_Sentiment(hidden_unit, last_unit, W_h, W_z, b_tanh, v, length):
    """Additive attention over token states, conditioned on one summary vector per example.

    For example ``b`` and position ``t``::

        u[b, t] = v . tanh(hidden_unit[b, t] @ W_h + last_unit[b] @ W_z + b_tanh)

    Positions ``t >= length[b]`` get the score -1e6, so they receive (numerically)
    zero weight after the softmax over positions.

    Parameters
    ----------
    hidden_unit : tensor (batch, seq_len, hidden) - per-token states.
    last_unit : tensor (batch, hidden) - one summary vector per example.
    W_h, W_z : tensors (hidden, hidden).
    b_tanh, v : tensors (hidden,).
    length : tensor, array or list with one integer per example.

    Returns
    -------
    context : tensor (batch, hidden) - attention-weighted sum of ``hidden_unit``.
    alphas : tensor (batch, seq_len) - attention weights; each row sums to 1.
    """
    seq_len, hidden = hidden_unit.size(1), hidden_unit.size(2)

    # (batch*seq_len, hidden) @ (hidden, hidden) -> (batch, seq_len, hidden)
    token_proj = torch.mm(hidden_unit.contiguous().view(-1, hidden), W_h).view(-1, seq_len, hidden)
    # (batch, hidden) @ (hidden, hidden) -> (batch, 1, hidden); broadcast over positions below
    summary_proj = torch.mm(last_unit.contiguous().view(-1, hidden), W_z).view(-1, 1, hidden)
    sum_tanh = torch.tanh(token_proj + summary_proj + b_tanh.unsqueeze(0))

    # (batch*seq_len, hidden) @ (hidden, 1) -> (batch, seq_len)
    u = torch.mm(sum_tanh.contiguous().view(-1, hidden), v.unsqueeze(1)).view(-1, seq_len)

    # Mask positions at or beyond each example's length in one vectorized step,
    # on the same device as the scores (replaces the per-row Python loop).
    lengths = torch.as_tensor(length, device=u.device).view(-1, 1)
    positions = torch.arange(seq_len, device=u.device).unsqueeze(0)
    u = u.masked_fill(positions >= lengths, -1e6)

    # Normalize over the position axis; dim is explicit rather than inferred.
    alphas = torch.softmax(u, dim=1)

    # (batch, 1, seq_len) x (batch, seq_len, hidden) -> (batch, hidden)
    context = torch.bmm(alphas.unsqueeze(1), hidden_unit).squeeze(1)

    return context, alphas

In [232]:
def Attention_Stance(hidden_unit,last_unit, W_h, W_z, b_tanh, v, length):
    """Additive attention for the stance branch.

    Scores every token state against a per-example summary vector::

        u[b, t] = v . tanh(hidden_unit[b, t] @ W_h + last_unit[b] @ W_z + b_tanh)

    masks positions ``t >= length[b]`` with -1e6, applies a softmax over positions
    and returns the weighted sum of the token states. The arithmetic matches
    Attention_Sentiment; the model calls this one with the stance encoder's outputs
    and its own parameter set.

    Parameters
    ----------
    hidden_unit : tensor (batch, seq_len, hidden) - per-token states.
    last_unit : tensor (batch, hidden) - one summary vector per example.
    W_h, W_z : tensors (hidden, hidden).
    b_tanh, v : tensors (hidden,).
    length : tensor, array or list with one integer per example.

    Returns
    -------
    context : tensor (batch, hidden).
    alphas : tensor (batch, seq_len) - attention weights; each row sums to 1.
    """
    seq_len, hidden = hidden_unit.size(1), hidden_unit.size(2)

    # (batch*seq_len, hidden) @ (hidden, hidden) -> (batch, seq_len, hidden)
    token_proj = torch.mm(hidden_unit.contiguous().view(-1, hidden), W_h).view(-1, seq_len, hidden)
    # (batch, hidden) @ (hidden, hidden) -> (batch, 1, hidden)
    summary_proj = torch.mm(last_unit.contiguous().view(-1, hidden), W_z).view(-1, 1, hidden)
    # The (batch, 1, hidden) and (1, hidden) terms broadcast over the position axis.
    sum_tanh = torch.tanh(token_proj + summary_proj + b_tanh.unsqueeze(0))

    # (batch*seq_len, hidden) @ (hidden, 1) -> (batch, seq_len)
    u = torch.mm(sum_tanh.contiguous().view(-1, hidden), v.unsqueeze(1)).view(-1, seq_len)

    # Vectorized length mask built on the scores' device (replaces the per-row loop
    # that allocated a CPU tensor on every iteration).
    lengths = torch.as_tensor(length, device=u.device).view(-1, 1)
    positions = torch.arange(seq_len, device=u.device).unsqueeze(0)
    u = u.masked_fill(positions >= lengths, -1e6)

    # Softmax over positions with an explicit dim.
    alphas = torch.softmax(u, dim=1)

    # (batch, 1, seq_len) x (batch, seq_len, hidden) -> (batch, hidden)
    context = torch.bmm(alphas.unsqueeze(1), hidden_unit).squeeze(1)

    return context, alphas

In [233]:
class SentimentClassifier(nn.Module):
    """Two-encoder BERT model with a sentiment head and a stance head.

    Two separately loaded copies of the same pretrained BERT encode the input: ``bert``
    feeds the sentiment attention and ``bert_stc`` feeds the stance attention. The
    sentiment logits are computed from the sentiment context alone; the stance
    logits are computed from the concatenation of both contexts.
    """

    def __init__(self):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.bert_stc = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        hidden = self.bert.config.hidden_size

        self.drop = nn.Dropout(p=0.3)

        # The stance branch consumes both contexts (2 * hidden); the sentiment branch one.
        self.linear_stc = nn.Linear(hidden * 2, linear_stc_size)
        self.linear_sent = nn.Linear(hidden, linear_sent_size)

        # Each head produces 3 logits.
        self.out_stc = nn.Linear(linear_stc_size, 3)
        self.out_sent = nn.Linear(linear_sent_size, 3)

        self.relu = nn.ReLU()

        # Attention parameters of the sentiment branch, drawn uniformly from [0, 1).
        # nn.Parameter already sets requires_grad=True.
        self.W_h = nn.Parameter(torch.rand(hidden, hidden))
        self.W_z = nn.Parameter(torch.rand(hidden, hidden))
        self.b_tanh = nn.Parameter(torch.rand(hidden))
        self.v = nn.Parameter(torch.rand(hidden))

        # Attention parameters of the stance branch. W_z2 previously hard-coded 768 as
        # its first dimension; it now follows the encoder's hidden size.
        self.W_h2 = nn.Parameter(torch.rand(hidden, hidden))
        self.W_z2 = nn.Parameter(torch.rand(hidden, hidden))
        self.b_tanh2 = nn.Parameter(torch.rand(hidden))
        self.v2 = nn.Parameter(torch.rand(hidden))

    def forward(self, input_ids, attention_mask, x_len):
        """Return ``(out_sent, out_stc)``: sentiment and stance logits, each (batch, 3).

        Parameters
        ----------
        input_ids, attention_mask : encoder inputs, passed to both BERT copies.
        x_len : per-example lengths forwarded to both attention functions for masking.
        """
        sent_outputs = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        stc_outputs = self.bert_stc(
          input_ids=input_ids,
          attention_mask=attention_mask
        )
        # Index instead of tuple-unpacking: this works whether the encoder returns a
        # plain tuple or a dict-like output object (iterating the latter yields keys).
        output, pooled_output = sent_outputs[0], sent_outputs[1]
        output1, pooled_output1 = stc_outputs[0], stc_outputs[1]

        drop1 = self.drop(pooled_output)
        drop2 = self.drop(pooled_output1)
        drop3 = self.drop(output)
        drop4 = self.drop(output1)

        atten, alphas = Attention_Sentiment(drop3, drop1,self.W_h,self.W_z,self.b_tanh,self.v,x_len)
        atten2, alphas_main = Attention_Stance(drop4, drop2, self.W_h2,self.W_z2,self.b_tanh2,self.v2,x_len)

        linear = self.relu(self.linear_sent(atten))   # (batch, hidden) -> (batch, linear_sent_size)
        out_sent = self.out_sent(self.drop(linear))   # -> (batch, 3)

        combine = torch.cat((atten,atten2),1)         # (batch, 2 * hidden)
        lin2 = self.relu(self.linear_stc(combine))    # -> (batch, linear_stc_size)
        out_stc = self.out_stc(self.drop(lin2))       # -> (batch, 3)

        return out_sent, out_stc
        

In [234]:
def train_epoch(
  model, 
  data_loader, 
  loss_fn, 
  optimizer, 
  device, 
  scheduler, 
  n_examples
    ):
    """Train ``model`` for one pass over ``data_loader``.

    The loss of every batch is ``(1 - lambd) * sentiment_loss + lambd * stance_loss``
    (``lambd`` is the module-level weight). Gradients are clipped to a total norm
    of 1.0 before each optimizer step, and the scheduler is stepped once per batch.

    Parameters
    ----------
    model : module called as ``model(input_ids=..., attention_mask=..., x_len=...)``
        and returning ``(out_sent, out_stc)`` logits.
    data_loader : iterable of batches (dicts) with the keys 'input_ids',
        'attention_mask', 'stance', 'sentiment' and 'length'.
    loss_fn : callable ``loss_fn(logits, labels)``.
    optimizer, scheduler : stepped once per batch, in that order.
    device : device the batch tensors are moved to.
    n_examples : denominator used for both accuracies.

    Returns
    -------
    (sentiment_accuracy, stance_accuracy, mean_loss) : two 0-dim double tensors and
    the mean of the per-batch losses (NaN when ``data_loader`` yields no batch).
    """
    model = model.train()

    losses = []
    # Tensor accumulators, so .double() below also works when the loader is empty.
    correct_predictions_sent = torch.zeros((), dtype=torch.long, device=device)
    correct_predictions_stc = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        stance = d['stance'].to(device)
        sentiment = d['sentiment'].to(device)
        x_len = d['length'].to(device)

        out_sent, out_stc = model(
          input_ids=input_ids,
          attention_mask=attention_mask,
            x_len = x_len
        )

        _, pred_sent = torch.max(out_sent, dim=1)
        _, pred_stc = torch.max(out_stc, dim=1)

        loss = (1-lambd)*loss_fn(out_sent, sentiment) + lambd*loss_fn(out_stc, stance)

        correct_predictions_sent += torch.sum(pred_sent == sentiment)
        correct_predictions_stc += torch.sum(pred_stc==stance)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        # Clear the gradients so the next batch starts from zero.
        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions_sent.double() / n_examples, correct_predictions_stc.double() / n_examples, mean_loss

In [235]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    The loss of every batch is ``(1 - lambd) * sentiment_loss + lambd * stance_loss``
    (``lambd`` is the module-level weight).

    Parameters
    ----------
    model : module called as ``model(input_ids=..., attention_mask=..., x_len=...)``
        and returning ``(out_sent, out_stc)`` logits; it is put in eval mode.
    data_loader : iterable of batches (dicts) with the keys 'input_ids',
        'attention_mask', 'stance', 'sentiment' and 'length'.
    loss_fn : callable ``loss_fn(logits, labels)``.
    device : device the batch tensors are moved to.
    n_examples : denominator used for both accuracies.

    Returns
    -------
    (sentiment_accuracy, stance_accuracy, mean_loss) : two 0-dim double tensors and
    the mean of the per-batch losses (NaN when ``data_loader`` yields no batch).
    """
    model = model.eval()

    losses = []
    # Tensor accumulators, so .double() below also works when the loader is empty.
    correct_predictions_sent = torch.zeros((), dtype=torch.long, device=device)
    correct_predictions_stc = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            stance = d['stance'].to(device)
            sentiment = d['sentiment'].to(device)
            x_len = d['length'].to(device)

            out_sent, out_stc = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            x_len = x_len
            )
            _, pred_sent = torch.max(out_sent, dim=1)
            _, pred_stc = torch.max(out_stc, dim=1)

            loss = (1-lambd)*loss_fn(out_sent, sentiment) + lambd*loss_fn(out_stc, stance)

            correct_predictions_sent += torch.sum(pred_sent == sentiment)
            correct_predictions_stc += torch.sum(pred_stc == stance)
            losses.append(loss.item())

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions_sent.double() / n_examples, correct_predictions_stc.double() / n_examples, mean_loss

In [236]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect predictions, probabilities and labels.

    Parameters
    ----------
    model : module called as ``model(input_ids=..., attention_mask=..., x_len=...)``
        and returning ``(out_sent, out_stc)`` logits; it is put in eval mode.
    data_loader : iterable of batches (dicts) with the keys 'doc_text', 'input_ids',
        'attention_mask', 'stance', 'sentiment' and 'length'.
    device : device the batch tensors are moved to. When omitted, the device of the
        model's first parameter is used (CPU if the model has no parameters), so the
        function no longer depends on a module-level ``device`` variable.

    Returns
    -------
    (texts, pred_sent, prob_sent, true_sent, pred_stc, prob_stc, true_stc)
        ``texts`` is a list of str. The other six are tensors with one row per example:
        predicted class ids, softmax probabilities and true labels for the sentiment
        task, then the same three for the stance task. The tensors are returned on
        the CPU so they can be handed to NumPy-based metrics even when the model runs
        on an accelerator. With an empty ``data_loader`` all tensors are empty.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    # One list of per-batch tensors per returned tensor, in return order.
    collected = {name: [] for name in (
        'pred_sent', 'prob_sent', 'true_sent', 'pred_stc', 'prob_stc', 'true_stc')}

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            stance = d['stance'].to(device)
            sentiment = d['sentiment'].to(device)
            x_len = d['length'].to(device)

            out_sent, out_stc = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            x_len = x_len
            )
            _, pred_sent = torch.max(out_sent, dim=1)
            _, pred_stc = torch.max(out_stc, dim=1)

            review_texts.extend(d["doc_text"])
            collected['pred_sent'].append(pred_sent.cpu())
            collected['prob_sent'].append(F.softmax(out_sent, dim=1).cpu())
            collected['true_sent'].append(sentiment.cpu())
            collected['pred_stc'].append(pred_stc.cpu())
            collected['prob_stc'].append(F.softmax(out_stc, dim=1).cpu())
            collected['true_stc'].append(stance.cpu())

    def _join(chunks, dtype):
        # Concatenating whole batches replaces the per-element extend + stack and
        # tolerates an empty loader.
        return torch.cat(chunks) if chunks else torch.empty(0, dtype=dtype)

    predictions_sent = _join(collected['pred_sent'], torch.long)
    prediction_probs_sent = _join(collected['prob_sent'], torch.float)
    real_values_sent = _join(collected['true_sent'], torch.long)

    predictions_stc = _join(collected['pred_stc'], torch.long)
    prediction_probs_stc = _join(collected['prob_stc'], torch.float)
    real_values_stc = _join(collected['true_stc'], torch.long)

    return review_texts, predictions_sent, prediction_probs_sent, real_values_sent, predictions_stc, prediction_probs_stc, real_values_stc

In [239]:
EPOCHS = 1

# Display names, listed in the order of the integer codes assigned by Load_Transform_Data.
class_names = ['abortion', 'hillary', 'feminism', 'climate', 'atheism']
class_names1 = ['AGAINST','FAVOR', 'NONE']
class_names2 = ['neg', 'pos','other']

model = SentimentClassifier().to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay = 1e-2, correct_bias=False)
# The scheduler is stepped once per batch, so it needs the total number of batches.
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss().to(device)
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 20)

    train_acc_sent, train_acc_stc, train_loss = train_epoch(model,
        train_data_loader,    
        loss_fn, 
        optimizer, 
        device, 
        scheduler, 
        len(df_train)
        )

    print(f'Train loss {train_loss} accuracy_sent {train_acc_sent} accuracy_stc {train_acc_stc}')

    val_acc_sent,val_acc_stc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn, 
        device, 
        len(df_val)
        )

    print(f'Val loss {val_loss} accuracy_sent {val_acc_sent} accuracy_stc {val_acc_stc}')
    print()

    history['train_acc_sent'].append(train_acc_sent)
    history['train_acc_stc'].append(train_acc_stc)
    history['train_loss'].append(train_loss)
    history['val_acc_sent'].append(val_acc_sent)
    history['val_acc_stc'].append(val_acc_stc)
    history['val_loss'].append(val_loss)

    # Checkpoint whenever the validation stance accuracy improves.
    if val_acc_stc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc_stc

# Evaluate the test split once, after the last epoch. Only the final values were ever
# reported, so evaluating inside the loop repeated the work every epoch and left
# test_acc_* undefined when EPOCHS == 0.
# NOTE(review): this scores the model as it stands after the last epoch, not the
# checkpoint saved in 'best_model_state.bin' - confirm which one should be reported.
test_acc_sent,test_acc_stc, _ = eval_model(
      model,
      test_data_loader,
      loss_fn,
      device,
      len(df_test)
    )

print('\nTest Accuracy_sent:\n')
print(test_acc_sent.item())
print('\nTest Accuracy_stc:\n')
print(test_acc_stc.item())

y_review_texts, y_pred_sent, y_pred_probs_sent, y_test_sent, y_pred_stc, y_pred_probs_stc, y_test_stc= get_predictions(model,test_data_loader)

print('\n Sentiment prediction:\n')
print(classification_report(y_test_sent, y_pred_sent, target_names=class_names2))
print('Stance prediction:\n')
print(classification_report(y_test_stc, y_pred_stc, target_names=class_names1))

Epoch 1/1
--------------------


KeyboardInterrupt: 