In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import torchvision
import torch.optim as optim

import numpy as np
from sklearn.manifold import TSNE

import argparse, sys, os

import torch
from torchtext import data, datasets
import random
torch.backends.cudnn.deterministic = True

import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.nn.init as init
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

import time
from collections import Counter
import matplotlib.pyplot as plt 


In [2]:
from google.colab import files
uploaded = files.upload()

In [3]:
import pandas as pd

import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('stopwords')


twitter = pd.read_csv("twitter.csv")
forum = pd.read_csv("Forum.csv")
wiki = pd.read_csv("Wiki.csv")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# remove puctunations, https
import re
def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z]+)|([^A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
    return df

twitter = clean_text(twitter, 'text')
forum = clean_text(forum, 'text')
wiki = clean_text(wiki, 'text')

stopwords = nltk.corpus.stopwords.words('english')
twitter['text'] = twitter['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
forum['text'] = forum['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
wiki['text'] = wiki['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))


In [5]:
twitter['domain'] = 0
forum['domain'] = 0
wiki['domain'] = 1

In [6]:
twitter = twitter[['text', 'label', 'domain']]
forum = forum[['text', 'label', 'domain']]
wiki = wiki[['text', 'label', 'domain']]

In [7]:
forum.to_csv('F.csv', index=False)
twitter.to_csv('T.csv', index=False)
wiki.to_csv('W.csv', index=False)

#tokenize

In [8]:
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [9]:
TEXT = data.Field(tokenize = 'spacy', batch_first=True)
LABEL = data.LabelField(dtype=torch.float)
DOMAIN = data.LabelField(dtype=torch.float)

In [10]:
#loading custom dataset
T =data.TabularDataset(path = 'T.csv',format = 'csv',fields = [('text', TEXT),('label',LABEL),('domain',DOMAIN)],skip_header = True)

#print preprocessed text
print(vars(T.examples[0]))

{'text': ['paid', 'hold', 'monkey', 'look', 'monkey', 'real', 'af'], 'label': '0.0', 'domain': '0'}


In [11]:
#loading custom dataset
W =data.TabularDataset(path = 'W.csv',format = 'csv',fields = [('text', TEXT),('label',LABEL),('domain',DOMAIN)],skip_header = True)

#print preprocessed text
print(vars(W.examples[0]))

{'text': ['right', 'fine', 'did', 'nt', 'even', 'start', 'discussion', 'joined', 'every', 'time', 'someone', 'tries', 'get', 'article', 'neutral', 'bullheaded', 'evolutionist', 'jumps', 'throat', 'stupid', 'evolution', 'proven', 'you', 're', 'idiot'], 'label': '1.0', 'domain': '1'}


In [12]:
#loading custom dataset
F =data.TabularDataset(path = 'F.csv',format = 'csv',fields = [('text', TEXT),('label',LABEL),('domain',DOMAIN)],skip_header = True)

#print preprocessed text
print(vars(F.examples[0]))

{'text': ['watch', 'video', 'minutes', 'see', 'eventually', 'biracial', 'minority', 'butchered'], 'label': '1.0', 'domain': '0'}


In [13]:
F.examples[0].label

'1.0'

In [14]:
T_train_data, T_test_data = T.split(split_ratio=0.8, random_state = random.seed(SEED))
F_train_data, F_test_data = F.split(split_ratio=0.8, random_state = random.seed(SEED))
W_train_data, W_test_data = W.split(split_ratio=0.8, random_state = random.seed(SEED))

In [15]:
T_train_data, T_val_data = T_train_data.split(split_ratio=0.8, random_state = random.seed(SEED))
F_train_data, F_val_data = F_train_data.split(split_ratio=0.8, random_state = random.seed(SEED))
W_train_data, W_val_data = W_train_data.split(split_ratio=0.8, random_state = random.seed(SEED))

#convert texts into integer sequences.

In [16]:
#initialize glove embeddings
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(T_train_data, F_train_data, W_train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_) 
LABEL.build_vocab(T_train_data, F_train_data, W_train_data)
DOMAIN.build_vocab(T_train_data, F_train_data, W_train_data)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#No. of unique tokens in DOMAIN
print("Size of DOMAIN vocabulary:",len(DOMAIN.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
print(TEXT.vocab.stoi)   

Size of TEXT vocabulary: 25002
Size of LABEL vocabulary: 2
Size of DOMAIN vocabulary: 2
[('nt', 3393), ('like', 1938), ('i', 1544), ('fuck', 1457), ('u', 1445), ('do', 1282), ('pig', 1278), ('know', 1185), ('ass', 1175), ('nigger', 1174)]


In [17]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 64

#Load an iterator
T_train_iterator, T_valid_iterator, T_test_iterator = data.BucketIterator.splits(
    (T_train_data, T_val_data, T_test_data), 
    batch_size = BATCH_SIZE, 
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

F_train_iterator, F_valid_iterator, F_test_iterator = data.BucketIterator.splits(
    (F_train_data, F_val_data, F_test_data), 
    batch_size = BATCH_SIZE, 
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

W_train_iterator, W_valid_iterator, W_test_iterator = data.BucketIterator.splits(
    (W_train_data, W_val_data, W_test_data), 
    batch_size = BATCH_SIZE, 
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [18]:
class GradReverse(torch.autograd.Function):
    """
    Extension of grad reverse layer
    """
    @staticmethod
    def forward(ctx, x, constant):
        ctx.constant = constant
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        grad_output = grad_output.neg() * ctx.constant
        return grad_output, None

    def grad_reverse(x, constant):
        return GradReverse.apply(x, constant)

class Extractor(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, 
                                out_channels = n_filters, 
                                kernel_size = (filter_sizes, embedding_dim))
        
        
        
        self.fc = nn.Linear(n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)

        self.relu = nn.ReLU(inplace=True)
        
    def forward(self, text):
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved_0 = self.relu(self.conv_0(embedded).squeeze(3))
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
        cat = self.dropout(pooled_0)
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return cat

class Class_classifier(nn.Module):

    def __init__(self, n_filters, output_dim):
        super(Class_classifier, self).__init__()
        self.fc = nn.Linear(n_filters, output_dim)

    def forward(self, input):
        x = self.fc(input)
        return x

class Domain_classifier(nn.Module):

    def __init__(self, n_filters, output_dim):
        super(Domain_classifier, self).__init__()
        self.fc = nn.Linear(n_filters, output_dim)

    def forward(self, input, constant):
        input = GradReverse.grad_reverse(input, constant)
        return self.fc(input)


In [19]:
def optimizer_scheduler(optimizer, p):
    """
    Adjust the learning rate of optimizer
    :param optimizer: optimizer for updating parameters
    :param p: a variable for adjusting learning rate
    :return: optimizer
    """
    for param_group in optimizer.param_groups:
        param_group['lr'] = 0.01 / (1. + 10 * p) ** 0.75

    return optimizer

In [20]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = 1
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

extractor = Extractor(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
class_classifier = Class_classifier(N_FILTERS, OUTPUT_DIM)
domain_classifier = Domain_classifier(N_FILTERS, OUTPUT_DIM)

pretrained_embeddings = TEXT.vocab.vectors

extractor.embedding.weight.data.copy_(pretrained_embeddings)


tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [ 0.3181,  1.0088, -0.2626,  ..., -0.0162,  1.8295, -0.5739],
        ...,
        [-0.7585,  1.9048, -0.4948,  ...,  1.8588,  0.9485, -1.1707],
        [-0.5974,  0.3798, -0.0165,  ..., -0.6669,  0.2105, -0.8549],
        [ 0.3516, -0.4719,  0.0028,  ...,  0.2484, -0.6018,  0.1988]])

In [21]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

extractor.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
extractor.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [22]:
criterion = nn.BCEWithLogitsLoss()

extractor = extractor.to(device)
class_classifier = class_classifier.to(device)
domain_classifier = domain_classifier.to(device)
criterion = criterion.to(device)

In [23]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division

    return correct

In [24]:
def train(training_mode, extractor, class_classifier, domain_classifier, criterionn,
          source_data1, source_data2, target_data, optimizer, epoch):
  
    epoch_loss = 0
    epoch_acc = 0

    # setup models
    extractor.train()
    class_classifier.train()
    domain_classifier.train()

    # steps
    start_steps = epoch * len(source_data1)
    total_steps = 10 * len(source_data1)

    for batch_idx,(sdata1, sdata2, tdata) in enumerate(zip(source_data1, source_data2, target_data)):
        if training_mode == 'dann':
            # setup hyperparameters
            p = float(batch_idx + start_steps) / total_steps
            constant = 2. / (1. + np.exp(-gamma * p)) - 1
            # setup optimizer
            optimizer = optimizer_scheduler(optimizer, p)
            optimizer.zero_grad()

            # compute the output of source domain and target domain
            src_feature1 = extractor(sdata1.text)
            src_feature2 = extractor(sdata2.text)
            tgt_feature = extractor(tdata.text)

             # compute the class loss of src_feature
            class_pred1 = class_classifier(src_feature1).squeeze(1)
            class_pred2 = class_classifier(src_feature2).squeeze(1)

            class_loss1 = criterion(class_pred1, sdata1.label)
            class_loss2 = criterion(class_pred2, sdata2.label)

            # compute the domain loss of src_feature and target_feature
            tgt_preds = domain_classifier(tgt_feature, constant).squeeze(1)
            src_pred1 = domain_classifier(src_feature1, constant).squeeze(1)
            src_pred2 = domain_classifier(src_feature2, constant).squeeze(1)
            
            tgt_loss = criterion(tgt_preds, tdata.domain)
            src_loss1 = criterion(src_pred1, sdata1.domain)
            src_loss2 = criterion(src_pred2, sdata2.domain)
            
            domain_loss = tgt_loss + src_loss1 + src_loss2
            class_loss = class_loss1 + class_loss2

            loss = class_loss + theta * domain_loss

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
    return epoch_loss / len(sdata1)    

In [25]:
def test(extractor, class_classifier, domain_classifier, source_data1, source_data2, target_data):
 
    # setup the network
    extractor.eval()
    class_classifier.eval()
    domain_classifier.eval()
    source_correct1 = 0.0
    source_correct2 = 0.0
    target_correct = 0.0
    domain_correct = 0.0
    tgt_correct = 0.0
    src_correct1 = 0.0
    src_correct2 = 0.0

    for batch_idx, sdata in enumerate(source_data1):
        # setup hyperparameters
        p = float(batch_idx) / len(source_data1)
        constant = 2. / (1. + np.exp(-10 * p)) - 1.

        output1 = class_classifier(extractor(sdata.text)).squeeze(1)
        s1_class_acc = binary_accuracy(output1, sdata.label)

        src_preds = domain_classifier(extractor(sdata.text), constant).squeeze(1)
        s1_domain_acc = binary_accuracy(src_preds, sdata.domain)

    for batch_idx, sdata in enumerate(source_data2):
        # setup hyperparameters
        p = float(batch_idx) / len(source_data2)
        constant = 2. / (1. + np.exp(-10 * p)) - 1.

        output2 = class_classifier(extractor(sdata.text)).squeeze(1)
        s2_class_acc = binary_accuracy(output1, sdata.label)

        src2_preds = domain_classifier(extractor(sdata.text), constant).squeeze(1)
        s2_domain_acc = binary_accuracy(src2_preds, sdata.domain)

    for batch_idx, tdata in enumerate(target_data):
        # setup hyperparameters
        p = float(batch_idx) / len(target_dataloader)
        constant = 2. / (1. + np.exp(-10 * p)) - 1

        output3 = class_classifier(extractor(tdata.text)).squeeze(1)
        t_class_acc = binary_accuracy(output1, tdata.label)

        t_preds = domain_classifier(extractor(tdata.text), constant).squeeze(1)
        t_domain_acc = binary_accuracy(t_preds, tdata.domain)

    domain_correct = s1_domain_acc + s2_domain_acc + t_domain_acc

    print('\nSource1 Accuracy(classification): {}/{} ({:.4f}%)\nSource2 Accuracy(classification): {}/{} ({:.4f}%)\nTarget Accuracy(classification): {}/{} ({:.4f}%)\n'
          'Domain Accuracy: {}/{} ({:.4f}%)\n'.
        format(
        s1_class_acc, len(source_data1), 100. * float(s1_class_acc) / len(source_data1),
        s2_class_acc, len(source_data2), 100. * float(s2_class_acc) / len(source_data2),
        t_class_acc, len(target_data), 100. * float(t_class_acc) / len(target_data),
        domain_correct, len(source_data1) + len(source_data2) + len(target_data.data),
        100. * float(domain_correct) / (len(source_data1) + len(source_data2) 
                                        + len(target_data))
    ))
    acc_list1.append(100. * float(s1_class_acc) / len(source_data1))
    acc_list2.append(100. * float(s1_class_acc) / len(source_data2))
    acc_list3.append(100. * float(t_class_acc) / len(target_data))
    acc_list4.append(100. * float(domain_correct) / (len(source_data1) + len(source_data2) 
                                        + len(target_data)))


In [26]:
def main():

    # prepare the source data and target data
    src_train_data1 = T_train_iterator
    src_test_data1 = T_test_iterator
    src_train_data2 = F_train_iterator
    src_test_data2 = F_test_iterator
    
    tgt_train_data = W_train_iterator
    tgt_test_data = W_test_iterator

    for epoch in range(100):
    
        print('Epoch: {}'.format(epoch))
        train('dann', extractor, class_classifier, domain_classifier, criterion,
                    src_train_data1, src_train_data2, tgt_train_data, optimizer, epoch)
        test(extractor, class_classifier, domain_classifier, src_test_data1, src_test_data2, tgt_test_data)
        
total_loss, d_loss, c_loss = [],[],[]
acc_list1, acc_list2, acc_list3, acc_list4 = [],[],[],[]
if __name__ == '__main__':
    gamma = 10
    theta = 1
    batch_size = 48
    time_start=time.time()
    main()
    time_end=time.time()
    print('total run time: (min)',(time_end-time_start)/60.)

Epoch: 0


TypeError: ignored

In [None]:
plt.plot(range(len(total_loss)),total_loss,c='r',label='total_loss')
plt.plot(d_loss,c='b',label='domain_loss')
plt.plot(c_loss,c='y',label='clf_loss')
plt.title('target domain: session2')
plt.legend(loc='best')
plt.show()

In [None]:
print('max target accuracy: ',max(acc_list3))
plt.plot(range(len(acc_list1)),acc_list1,c='r',label='source1_acc')
plt.plot(acc_list2,c='b',label='source2_acc')
plt.plot(acc_list3,c='g',label='target_acc')
plt.plot(acc_list4,c='y',label='domain_acc')
plt.axhline(max(acc_list3),c='b',linestyle='--')
plt.title('target domain: session2')
plt.legend(loc='best')
plt.show()