# RATIO 2019 - Benchmarking Workshop

In [1]:
%load_ext autoreload
import os
os.environ['CUDA_VISIBLE_DEVICES']='6'
os.environ['CUDA_LAUNCH_BLOCKING']='1'

from torch.utils.data import DataLoader,Dataset
from torch.autograd import Variable
import matplotlib.pyplot as plt
import torchvision.utils
import numpy as np
import time
import copy
from torch.optim import lr_scheduler
import os
from PIL import Image
import torch
from torch.autograd import Variable
import PIL.ImageOps    
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pandas as pd 

In [2]:
import pandas as pd
import csv
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
from sklearn.model_selection import train_test_split
import nltk
nltk.download('wordnet')
def get_train_test_sets(df, test_size=0.30, random_state=1):
    """Split an argument-pair frame into a training part and a held-out part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'argument1', 'argument2', 'topic' and
        'is_same_side'.
    test_size : float, default 0.30
        Fraction of the rows placed in the held-out part.
    random_state : int, default 1
        Seed for the shuffle, so the split is reproducible.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test); the X frames hold the two
        arguments and the topic, the y frames hold the single
        'is_same_side' column.
    """
    features = df[['argument1', 'argument2', 'topic']]
    # Double brackets keep the labels as a one-column DataFrame, not a Series.
    labels = df[['is_same_side']]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )
    return X_train, X_test, y_train, y_test

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Task 1 - Same Side Classification



In [6]:
# Path templates for the cross-topic and within-topic CSV files; the '{}' slot
# is filled with the split name ('training' or 'test') when the files are read.
data_cross_path = 'data/same-side-classification/cross-topic/{}.csv'
data_within_path = 'data/same-side-classification/within-topic/{}.csv'

### Load within-topics and cross-topics data

In [7]:
# Parser settings shared by all four files: every field is double-quoted,
# embedded quotes are backslash-escaped (not doubled), 'topic' is read as text
# and the 'id' column becomes the index.
_csv_options = dict(
    quotechar='"',
    quoting=csv.QUOTE_ALL,
    encoding='utf-8',
    escapechar='\\',
    doublequote=False,
    dtype={'topic': 'str'},
    index_col='id',
)

cross_traindev_df = pd.read_csv(data_cross_path.format('training'), **_csv_options)
cross_test_df = pd.read_csv(data_cross_path.format('test'), **_csv_options)

within_traindev_df = pd.read_csv(data_within_path.format('training'), **_csv_options)
within_test_df = pd.read_csv(data_within_path.format('test'), **_csv_options)

In [8]:
# Adding a tag for the topics in focus: "gay marriage" and "abortion"
def add_tag(row):
    """Return a copy of ``row`` with a coarse 'tag' derived from its 'topic'.

    The topic is lower-cased and stripped, then matched by substring:
    'abortion' wins first, then 'gay marriage'; anything else, including a
    missing or non-string topic, is tagged 'NA'.

    Parameters
    ----------
    row : mapping with a 'topic' entry and a ``copy()`` method
        In this notebook, one DataFrame row passed by ``DataFrame.apply``.

    Returns
    -------
    Same type as ``row``
        A copy carrying the extra 'tag' entry; the input is left untouched,
        because mutating the object handed over by ``apply`` is not supported
        by pandas.
    """
    tagged = row.copy()
    topic = row['topic']
    # A missing topic arrives as NaN/None, which has no .lower(); treat it as empty.
    title = topic.lower().strip() if isinstance(topic, str) else ''

    if 'abortion' in title:
        tagged['tag'] = 'abortion'
    elif 'gay marriage' in title:
        tagged['tag'] = 'gay marriage'
    else:
        tagged['tag'] = 'NA'
    return tagged

# Tag every row of the four frames; each name is rebound to its tagged version.
cross_traindev_df, cross_test_df, within_traindev_df, within_test_df = [
    frame.apply(add_tag, axis=1)
    for frame in (cross_traindev_df, cross_test_df, within_traindev_df, within_test_df)
]



In [9]:
# Distinct topic tags present in the within-topic training frame.
within_traindev_df['tag'].unique()

array(['gay marriage', 'abortion'], dtype=object)

In [41]:
# Within-topic training pairs tagged 'gay marriage'. One comparison is enough:
# AND-ing the same mask with itself selects exactly the same rows.
# NOTE(review): if a second criterion was intended here, add it to the mask.
within_traindev_df[within_traindev_df['tag'] == 'gay marriage']

Unnamed: 0_level_0,argument1,argument1_id,argument2,argument2_id,debate_id,is_same_side,topic,tag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
85249,"gay marriage devalues marriage, frequency of o...",d2f4b1cd-2019-04-17T11:47:27Z-00063-000,being unaccustomed to gay marriage is no argument,d2f4b1cd-2019-04-17T11:47:27Z-00063-000,d2f4b1cd-2019-04-17T11:47:27Z,False,"gay marriage, debate on same sex marriage",gay marriage
84168,marriage is defined as between a man and woman,d2f4b1cd-2019-04-17T11:47:27Z-00092-000,marriage is celebrated because of the assumpti...,d2f4b1cd-2019-04-17T11:47:27Z-00092-000,d2f4b1cd-2019-04-17T11:47:27Z,False,"gay marriage, debate on same sex marriage",gay marriage
85569,"al rantell, a homosexual talk-show host in la....",d2f4b1cd-2019-04-17T11:47:27Z-00154-000,denying marriage to infertile would be too costly,d2f4b1cd-2019-04-17T11:47:27Z-00154-000,d2f4b1cd-2019-04-17T11:47:27Z,False,"gay marriage, debate on same sex marriage",gay marriage
89518,children have a claim to biological parents,d2f4b1cd-2019-04-17T11:47:27Z-00028-000,gay spouses can helpfully adopt orphaned kids....,d2f4b1cd-2019-04-17T11:47:27Z-00028-000,d2f4b1cd-2019-04-17T11:47:27Z,False,"gay marriage, debate on same sex marriage",gay marriage
6353,many laws already give civil unions equal bene...,40f91664-2019-04-17T11:47:29Z-00064-000,civil partners can only file taxes in register...,40f91664-2019-04-17T11:47:29Z-00064-000,40f91664-2019-04-17T11:47:29Z,True,civil unions vs. gay marriage,gay marriage
92755,i agree with all my opponents rules and regula...,ea2e9a61-2019-04-18T17:43:10Z-00008-000,debate meaning and purpose:i feel it is import...,ea2e9a61-2019-04-18T17:43:10Z-00008-000,ea2e9a61-2019-04-18T17:43:10Z,False,gay marriage should be legal,gay marriage
74517,gay parenting is just as good as straight pare...,d2f4b1cd-2019-04-17T11:47:27Z-00033-000,gay marriage is no worse for institution than ...,d2f4b1cd-2019-04-17T11:47:27Z-00033-000,d2f4b1cd-2019-04-17T11:47:27Z,True,"gay marriage, debate on same sex marriage",gay marriage
93109,gay marriage is not the same as marriage equal...,eb8b217-2019-04-18T17:33:54Z-00006-000,gay marriage is not the same as marriage equal...,eb8b217-2019-04-18T17:33:54Z-00006-000,eb8b217-2019-04-18T17:33:54Z,False,gay marriage is not the same as marriage equality,gay marriage
9406,civil unions give gays equal benefits w/o chan...,40f91664-2019-04-17T11:47:29Z-00050-000,gay marriage is a negligible change to institu...,40f91664-2019-04-17T11:47:29Z-00050-000,40f91664-2019-04-17T11:47:29Z,False,civil unions vs. gay marriage,gay marriage
85959,gay marriage devalues the institution of marriage,d2f4b1cd-2019-04-17T11:47:27Z-00064-000,gay marriage doesn't weaken desire of straight...,d2f4b1cd-2019-04-17T11:47:27Z-00064-000,d2f4b1cd-2019-04-17T11:47:27Z,False,"gay marriage, debate on same sex marriage",gay marriage


### Approach

In [13]:
%autoreload
from cnn_siamese import *
from cnn_siamese_train import *
import torchtext.data as data

In [11]:
# Hold out part of the cross-topic training data as a dev set.
X_train, X_dev, y_train, y_dev = get_train_test_sets(cross_traindev_df)

In [12]:
# Convert the training split with df_to_lists (not defined in this notebook;
# presumably provided by the cnn_siamese star imports).
# NOTE(review): y_train is rebound to the converted value, so re-running this
# cell on its own feeds that value back in instead of the split's output.
x1_train, x2_train, y_train = df_to_lists(X_train, y_train)

In [14]:
# Convert the dev split with df_to_lists (not defined in this notebook;
# presumably provided by the cnn_siamese star imports).
# NOTE(review): y_dev is rebound to the converted value, so re-running this
# cell on its own feeds that value back in instead of the split's output.
x1_dev, x2_dev, y_dev = df_to_lists(X_dev, y_dev)

In [15]:
# torchtext field for the argument text; lower=True lower-cases it.
text_field = data.Field(lower=True)
# Label field, declared non-sequential.
# NOTE(review): a Field builds a vocabulary by default, so label indices may not
# start at 0 (an unknown-token entry is typically reserved). Confirm that
# train() maps targets into [0, n_classes); the ClassNLLCriterion device-side
# assert recorded at the training cell is consistent with an out-of-range target.
label_field = data.Field(sequential=False)

In [16]:
# Each example has the shape ((first item, second item), label); build the
# nested tuples in one pass over the three parallel lists.
train_data = [((first, second), label) for first, second, label in zip(x1_train, x2_train, y_train)]
dev_data = [((first, second), label) for first, second, label in zip(x1_dev, x2_dev, y_dev)]

In [17]:
# Wrap the (pair, label) examples in MR datasets that share the two fields.
# MR is not defined in this notebook; presumably it arrives via the
# cnn_siamese star imports.
train_ds = MR(text_field, label_field, train_data)
dev_ds = MR(text_field, label_field, dev_data)

In [18]:
# Build both vocabularies from the training and dev datasets together.
# NOTE(review): because dev_ds is included, dev-only tokens are in-vocabulary
# at evaluation time — confirm this is intended.
text_field.build_vocab(train_ds, dev_ds)
label_field.build_vocab(train_ds, dev_ds)

In [19]:
# Batch iterators: batches of 16 for training, the whole dev set as one batch.
train_iter, dev_iter = data.Iterator.splits((train_ds, dev_ds), batch_sizes=(16, len(dev_ds)))

In [20]:
# Size the network by the text vocabulary and place it on the GPU.
vocab_len = len(text_field.vocab)
cnn = CNN_Siamese(vocab_len).cuda()

In [21]:
# Train the siamese CNN on the training iterator, with the dev iterator passed
# alongside (train is presumably provided by the cnn_siamese_train star import).
# NOTE(review): the recorded run of this cell aborted with a CUDA device-side
# assert raised from ClassNLLCriterion; check that the targets handed to the
# loss lie in [0, n_classes) before re-running.
train(train_iter, dev_iter, cnn)

RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THCUNN/generic/ClassNLLCriterion.cu:111

### Test Model:

In [22]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [27]:
# Debugging check: move a tiny tensor to the GPU.
# NOTE(review): the recorded failure here is likely a leftover of the earlier
# device-side assert (the CUDA context usually stays unusable until the kernel
# is restarted) — confirm, then drop this cell.
torch.tensor([1.2, 3.0]).cuda()

RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorCopy.cpp:20

In [17]:
# Load the saved model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SiameseNetwork().to(device)
# map_location keeps the checkpoint loadable when `device` falls back to the
# CPU, even if the weights were saved from a GPU.
model.load_state_dict(torch.load("/model.pt", map_location=device))
# Switch to evaluation mode so layers such as dropout or batch norm (if the
# network has any) behave deterministically at inference time.
model.eval();

In [19]:
# Evaluation data: one pair per batch, visited in a fixed order so that the
# printed samples are the same on every run.
test_dataset = SiameseNetworkDataset(x1_dev, x2_dev, y_dev)
test_dataloader = DataLoader(test_dataset, shuffle=False, num_workers=8, batch_size=1)

In [20]:
# Print the dissimilarity (Euclidean distance between the two branch outputs)
# and the gold label for the first few test pairs.
SAMPLES_TO_SHOW = 11
list_0 = torch.FloatTensor([[0]])

# The loop variable is called `batch`, not `data`: `data` is the alias of the
# torchtext.data module imported earlier, and rebinding it would break every
# later `data.Field` / `data.Iterator` call.
with torch.no_grad():  # inference only, no gradients needed
    for i, batch in enumerate(test_dataloader):
        x0, x1, label = batch
        output1, output2 = model(x0.to(device), x1.to(device))
        euclidean_distance = F.pairwise_distance(output1, output2)
        # NOTE(review): a label of 0 is printed as "Same Side"; confirm this
        # matches how SiameseNetworkDataset encodes is_same_side.
        if label == list_0:
            label_text = "Same Side"
        else:
            label_text = "Different Side"
        print('Dissimilarity: {:.2f} Label: {}'.format(euclidean_distance.item(), label_text))
        if i + 1 >= SAMPLES_TO_SHOW:
            break

Dissimilarity: 0.04 Label: Same Side
Dissimilarity: 0.71 Label: Same Side
Dissimilarity: 0.19 Label: Different Side
Dissimilarity: 0.00 Label: Different Side
Dissimilarity: 0.41 Label: Same Side
Dissimilarity: 0.45 Label: Different Side
Dissimilarity: 0.00 Label: Same Side
Dissimilarity: 0.84 Label: Different Side
Dissimilarity: 0.79 Label: Same Side
Dissimilarity: 0.57 Label: Different Side
Dissimilarity: 0.45 Label: Different Side


In [21]:
# Accuracy over the test loader (one pair per batch).
correct = 0
total = 0

# The loop variable is called `batch`, not `data`, so the torchtext.data module
# alias imported earlier is not overwritten.
with torch.no_grad():  # inference only, no gradients needed
    for batch in test_dataloader:
        x0, x1, label = batch
        # (original note) the network is said to turn a 128-d representation
        # into 2 outputs — not verifiable from this notebook.
        output1, output2 = model(x0.to(device), x1.to(device))
        # Both outputs already live on `device`; forcing .cuda() here would
        # break the CPU fallback.
        res = torch.abs(output1 - output2)
        expected = int(label[0].tolist()[0])
        # NOTE(review): the prediction is the arg-max position of
        # |output1 - output2| along dim 1, which is not obviously a same-side
        # decision; the recorded 13% accuracy on a binary task suggests this
        # rule (or the label encoding) needs revisiting, e.g. thresholding the
        # pairwise distance printed in the previous cell.
        result = torch.max(res, 1)[1][0][0][0].data[0].tolist()
        if expected == result:
            correct += 1
        total += 1

# Guard against an empty loader instead of dividing by zero.
accuracy = (correct / total) * 100 if total else 0.0
print("Accuracy:{}%".format(accuracy))

  # This is added back by InteractiveShellApp.init_path()


Accuracy:13.475293475293476%
